def classification_predict(table, model, prediction_col='prediction', prob_prefix='probability',
                           output_log_prob=False, log_prob_prefix='log_probability',
                           thresholds=None, suffix='index'):
    """Route a classification prediction to the predictor that matches the model type.

    For a grouped model (one carrying a '_grouped_data' entry) the first
    sub-model is inspected to decide the route; the full ``model`` is still
    what gets forwarded to the selected predictor.

    Returns whatever the selected predictor returns. Unknown model types are
    reported through ``raise_runtime_error``.
    """
    if '_grouped_data' in model:
        representative = list(model['_grouped_data']['data'].values())[0]
    else:
        representative = model
    model_type = representative['_type']

    if 'logistic_regression_model' in model_type or 'one_vs' in model_type:
        return logistic_regression_predict(table=table, model=model, prediction_col=prediction_col,
                                           prob_prefix=prob_prefix, output_log_prob=output_log_prob,
                                           log_prob_prefix=log_prob_prefix, thresholds=thresholds,
                                           suffix=suffix)

    if model_type == 'svm_model':
        return svm_classification_predict(table=table, model=model, prediction_col=prediction_col,
                                          prob_prefix=prob_prefix, display_log_prob=output_log_prob,
                                          log_prob_prefix=log_prob_prefix, thresholds=thresholds,
                                          suffix=suffix)

    # Generic tree/forest models carry a 'method' entry telling how they were trained.
    if (model_type == 'decision_tree_model' and 'method' in representative
            and representative['method'] == 'classification') or 'tree_classification' in model_type:
        return decision_tree_classification_predict(table=table, model=model,
                                                    prediction_col=prediction_col)

    if (model_type == 'random_forest_model' and 'method' in representative
            and representative['method'] == 'classification') \
            or 'forest_classification' in model_type or 'gbt_classification' in model_type:
        return random_forest_classification_predict(table=table, model=model,
                                                    pred_col_name=prediction_col)

    if model_type == 'naive_bayes_model':
        return naive_bayes_predict(table=table, model=model, prediction_col=prediction_col,
                                   prob_prefix=prob_prefix, display_log_prob=output_log_prob,
                                   log_prob_prefix=log_prob_prefix, suffix=suffix)

    raise_runtime_error('''It is not supported yet.''')
def _svm_classification_train(table, feature_cols, label_col, c=1.0, kernel='rbf', degree=3,
                              gamma='auto', coef0=0.0, shrinking=True, probability=True,
                              tol=1e-3, max_iter=-1, random_state=None):
    """Fit an ``svm.SVC`` classifier on the given feature/label columns.

    ``c`` is validated to be greater than 0. A label column whose target type
    is 'continuous' is rejected through ``raise_runtime_error``.

    Returns ``{'model': ...}`` where the model dict holds the fitted
    classifier ('svc_model'), the feature column names ('features') and a
    markdown report of the estimator parameters ('_repr_brtc_').
    """
    validate(greater_than(c, 0.0, 'c'))

    working = table.copy()
    features = working[feature_cols]
    labels = working[label_col]
    if sklearn_utils.multiclass.type_of_target(labels) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')

    classifier = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
                         shrinking=shrinking, probability=probability, tol=tol,
                         max_iter=max_iter, random_state=random_state)
    fitted = classifier.fit(features, labels)

    # Report the estimator parameters together with the columns that were used.
    reported = classifier.get_params()
    reported['feature_cols'] = feature_cols
    reported['label_col'] = label_col

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(reported))))

    result_model = _model_dict('svc_model')
    result_model['svc_model'] = fitted
    result_model['features'] = feature_cols
    result_model['_repr_brtc_'] = rb.get()
    return {'model': result_model}
def _outlier_detection_tukey_carling_model(table, model, new_column_prefix='is_outlier_'):
    """Apply a stored Tukey/Carling outlier model to ``table``.

    One ``<new_column_prefix><col>`` column is added per input column of the
    model, filled by ``_tukey`` or ``_carling`` using the statistics stored in
    the model. Depending on ``model['result_type']`` the flags are kept
    ('add_prediction'), the flagged rows are removed together with the flag
    columns ('remove_outliers'), or the rows are removed but the flag columns
    stay ('both').

    Returns ``{'out_table': DataFrame}``.
    """
    out_table = table.copy()
    input_cols = model['input_cols']
    outlier_method = model['outlier_method']
    result_type = model['result_type']

    # Statistics are looked up inside the closures, i.e. only when a value is
    # actually classified.
    if outlier_method == 'tukey':
        def make_labeller(col):
            return lambda value: _tukey(value, model['q1'][col], model['q3'][col],
                                        model['iqr'][col], model['multiplier'])
    elif outlier_method == 'carling':
        def make_labeller(col):
            return lambda value: _carling(value, model['median'][col], model['iqr'][col],
                                          model['multiplier'])
    else:
        raise_runtime_error("Please check 'outlier_method'.")

    output_col_names = []
    for col in input_cols:
        flag_col = '{prefix}{col}'.format(prefix=new_column_prefix, col=col)
        output_col_names.append(flag_col)
        out_table[flag_col] = out_table[col].apply(make_labeller(col))

    # result_type is one of 'add_prediction', 'remove_outliers', 'both'
    if result_type in ('remove_outliers', 'both'):
        keep = out_table[output_col_names].apply(
            lambda row: np.sum(row == 'out') < model['number_of_removal'], axis=1)
        out_table = out_table[keep.values]
        if result_type == 'remove_outliers':
            out_table = out_table.drop(output_col_names, axis=1)
    elif result_type != 'add_prediction':
        raise_runtime_error("Please check 'result_type'.")

    return {'out_table': out_table}
def _doc_doc_mtx(table, model, input_col, result_type='sparse'):
    """Build a document-by-document matrix counting the terms two documents share.

    Each document in ``table[input_col]`` is converted to a bag of words with
    ``model['dictionary']``; term counts are binarised before multiplying.

    ``result_type='sparse'`` returns one row per document pair from the
    strict upper triangle; ``'dense'`` returns the full matrix with a leading
    document-label column.

    Returns ``{'out_table': DataFrame}``.
    """
    corpus = table[input_col].tolist()
    dictionary = model['dictionary']
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

    doc_term = matutils.corpus2csc(bow_corpus).T
    # Only presence matters here, so every stored count becomes 1.
    doc_term.data = np.array([1] * len(doc_term.data))
    doc_doc = (doc_term @ doc_term.T).tocoo()

    if result_type == 'sparse':
        doc_doc = sparse.triu(doc_doc, k=1)
        out_table = pd.DataFrame(doc_doc.row, columns=['1st_document_idx'])
        out_table['2nd_document_idx'] = doc_doc.col
        out_table['number_of_common_terms'] = doc_doc.data
    elif result_type == 'dense':
        doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
        out_table = pd.DataFrame(doc_doc.todense())
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append("", doc_idx)
    else:
        raise_runtime_error("Please check 'result_type'.")

    # NOTE(review): this model dict is assembled but never returned.
    rb = BrtcReprBuilder()
    model = _model_dict('doc_doc_mtx')
    model['input_col'] = input_col
    model['doc_doc_mtx'] = doc_doc
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
def join(left_table, right_table, left_on, right_on, how='inner', lsuffix='_left',
         rsuffix='_right', sort=False):
    """Join two tables with ``pd.merge``, plus 'left_exclude'/'right_exclude' anti-joins.

    For 'outer', 'left' and 'right' joins, key columns that carry the same
    name on both sides are renamed with the side suffix first so both key
    columns appear in the result. The exclude modes keep only the rows that
    exist on one side, restricted to that side's original columns, and report
    an empty result through ``raise_runtime_error``.

    ``sort`` accepts ``True`` as well as the strings 'True' / 'true'.

    Returns ``{'table': DataFrame}``.
    """
    sort = bool(sort == True or sort == 'True' or sort == 'true')

    shared_keys = list(set(left_on) & set(right_on))
    if shared_keys and how in ['outer', 'left', 'right']:
        left_table = left_table.rename(columns={key: key + lsuffix for key in shared_keys})
        right_table = right_table.rename(columns={key: key + rsuffix for key in shared_keys})
        left_on = [name + lsuffix if name in shared_keys else name for name in left_on]
        right_on = [name + rsuffix if name in shared_keys else name for name in right_on]

    def _anti_join(kept_columns, suffix, merge_flag):
        # Outer merge with an indicator, then keep only rows seen on one side.
        merged = pd.merge(left_table, right_table, how='outer', left_on=left_on,
                          right_on=right_on, suffixes=(lsuffix, rsuffix), sort=sort,
                          indicator=True)
        # Non-key columns present on both sides were suffixed by the merge.
        clashing = list(set(left_table.columns) & set(right_table.columns) - set(right_on))
        selected = [name + suffix if name in clashing else name for name in kept_columns]
        one_sided = merged[merged['_merge'] == merge_flag][selected]
        one_sided = one_sided.rename(columns={name + suffix: name for name in clashing})
        if one_sided.empty:
            raise_runtime_error("The result is empty.")
        return one_sided

    if how == 'left_exclude':
        table = _anti_join(left_table.columns, lsuffix, 'left_only')
    elif how == 'right_exclude':
        table = _anti_join(right_table.columns, rsuffix, 'right_only')
    else:
        table = pd.merge(left_table, right_table, how=how, left_on=left_on, right_on=right_on,
                         suffixes=(lsuffix, rsuffix), sort=sort)

    return {'table': table}
def read_from_db(datasource, sql):
    """Run ``sql`` against the given datasource and return the rows as a table.

    ``datasource`` is expanded as keyword arguments of ``DbEngine``. A missing
    query is reported through ``raise_runtime_error``; the column names of the
    fetched frame are checked with ``util.validate_column_name``.

    Returns ``{'table': DataFrame}``.
    """
    if sql is None:
        raise_runtime_error('sql is required parameter')

    with DbEngine(**datasource) as engine:
        fetched = pd.read_sql_query(sql, engine)
        util.validate_column_name(fetched)
        return {'table': fetched}
def process_text(text):
    """Process ``text`` with the backend selected by the surrounding ``method`` value.

    ``method`` and the ``METHOD_*`` constants come from the enclosing scope.
    An unknown method is reported through ``raise_runtime_error``.
    """
    if method == METHOD_NLTK:
        return process_text_nltk(text, ne_extraction_nltk)
    if method == METHOD_SPACY:
        return process_text_spacy(text, ne_extraction_spacy)
    if method == METHOD_STANFORD:
        return process_text_stanford(text)
    raise_runtime_error("Invalid method name.")
def _representative_evaluation_value(table, input_col):
    """Print the single evaluation value held in ``table[input_col]``.

    The table must contain exactly one row; anything else is reported through
    ``raise_runtime_error`` before the value is read. The output is written to
    stdout in the form
    ``{'accuracy_index': <column name>,'accuracy_value': <value> }``.

    Returns ``None``.
    """
    # Validate first: reading row 0 of an empty table would fail with an
    # unrelated lookup error instead of the intended message.
    if table.shape[0] != 1:
        raise_runtime_error("Only one column with one row is allowed")

    col_name = input_col
    # ``DataFrame.ix`` was removed in pandas 1.0; take the first row by position.
    col_value = table[col_name].iloc[0]

    raw_data = "{'accuracy_index': " + col_name + ",'accuracy_value': " + str(col_value) + " }"
    print(raw_data)
def _outlier_detection_lof(table, input_cols, n_neighbors=20, result_type='add_prediction',
                           new_column_name='is_outlier'):
    """Fit a Local Outlier Factor model on ``input_cols`` and label every row.

    Rows are labelled 'in' when the model predicts 1 and 'out' otherwise.
    ``result_type`` decides what is returned: 'add_prediction' keeps all rows
    with the label column, 'remove_outliers' keeps only inliers and drops the
    label column, 'both' keeps only inliers together with the label column.

    Returns ``{'out_table': DataFrame, 'model': dict}``.
    """
    out_table = table.copy()
    features = out_table[input_cols]

    lof_model = LocalOutlierFactor(n_neighbors=n_neighbors, algorithm='auto', leaf_size=30,
                                   metric='minkowski', p=2, novelty=True, contamination=0.1)
    lof_model.fit(features)

    out_table[new_column_name] = ['in' if verdict == 1 else 'out'
                                  for verdict in lof_model.predict(features)]

    if result_type in ('remove_outliers', 'both'):
        out_table = out_table[out_table[new_column_name] == 'in']
        if result_type == 'remove_outliers':
            out_table = out_table.drop(new_column_name, axis=1)
    elif result_type != 'add_prediction':
        raise_runtime_error("Please check 'result_type'.")

    params = {
        'Input Columns': input_cols,
        'Result Type': result_type,
        'Number of Neighbors': n_neighbors,
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Local Outlier Factor) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_lof')
    model['params'] = params
    model['lof_model'] = lof_model
    model['input_cols'] = input_cols
    model['result_type'] = result_type
    model['num_neighbors'] = n_neighbors
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
def clustering_predict(model, num_clusters, cluster_col='cluster'):
    """Route a clustering post-processing request by model type.

    For a grouped model the first sub-model decides the route. Only
    'hierarchical_clustering' models are handled; any other type is reported
    through ``raise_runtime_error``.
    """
    if '_grouped_data' in model:
        representative = list(model['_grouped_data']['data'].values())[0]
    else:
        representative = model

    if representative['_type'] != 'hierarchical_clustering':
        raise_runtime_error('''It is not supported yet.''')
        return None

    return hierarchical_clustering_post(model=model, num_clusters=num_clusters,
                                        cluster_col=cluster_col)
def regression_predict(table, model, prediction_col='prediction'):
    """Route a regression prediction to the predictor that matches the model type.

    For a grouped model (one carrying a '_grouped_data' entry) the first
    sub-model is inspected to decide the route; the full ``model`` is what
    gets forwarded. Unknown model types are reported through
    ``raise_runtime_error``.
    """
    if '_grouped_data' in model:
        representative = list(model['_grouped_data']['data'].values())[0]
    else:
        representative = model
    model_type = representative['_type']

    if 'linear_regression_model' in model_type:
        return linear_regression_predict(table=table, model=model, prediction_col=prediction_col)

    # Generic tree/forest models carry a 'method' entry telling how they were trained.
    if (model_type == 'decision_tree_model' and 'method' in representative
            and representative['method'] == 'regression') or 'tree_regression' in model_type:
        return decision_tree_regression_predict(table=table, model=model,
                                                prediction_col=prediction_col)

    if (model_type == 'random_forest_model' and 'method' in representative
            and representative['method'] == 'regression') \
            or 'forest_regression' in model_type or 'gbt_regression' in model_type:
        return random_forest_regression_predict(table=table, model=model,
                                                prediction_col=prediction_col)

    if model_type == 'ada_boost_regression_model':
        return ada_boost_regression_predict(table=table, model=model,
                                            pred_col_name=prediction_col)
    if model_type == 'glm_model':
        return glm_predict(table=table, model=model, prediction_col=prediction_col)
    if model_type == 'mlp_regression_model':
        return mlp_regression_predict(table=table, model=model, prediction_col=prediction_col)
    if model_type == 'xgb_regression_model':
        return xgb_regression_predict(table=table, model=model, prediction_col=prediction_col)
    if model_type == 'isotonic_regression_model':
        return isotonic_regression_predict(table=table, model=model,
                                           prediction_col=prediction_col)
    if model_type == 'pls_regression_model':
        return pls_regression_predict(table=table, model=model, prediction_col=prediction_col)

    raise_runtime_error('''It is not supported yet.''')
def _doc_term_mtx(table, model, input_col, result_type='doc_to_bow_token'):
    """Build a document/term representation of ``table[input_col]``.

    Documents are converted to bags of words with ``model['dictionary']``.
    ``result_type`` selects the output:

    * 'doc_to_bow_token' - one row per document with its "(token, count)" list
      rendered as a string,
    * 'doc_term_mtx'     - dense matrix, documents as rows and terms as columns,
    * 'term_doc_mtx'     - dense matrix, terms as rows and documents as columns.

    Returns ``{'out_table': DataFrame}``.
    """
    corpus = table[input_col].tolist()
    dictionary = model['dictionary']
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

    # Human-readable "(token, count)" listing for every document.
    doc_to_bow_list = []
    for position in range(len(corpus)):
        rendered = ['({token}, {cnt})'.format(token=dictionary[entry[0]], cnt=entry[1])
                    for entry in bow_corpus[position]]
        doc_to_bow_list.append('{}'.format(list(rendered)))

    doc_idx = ['doc_{}'.format(i) for i in range(len(corpus))]
    terms = list(dictionary.token2id.keys())

    if result_type == 'doc_to_bow_token':
        out_table = pd.DataFrame(data=doc_to_bow_list, columns=['doc_to_bow'])
        out_table.insert(loc=0, column='doc_idx', value=doc_idx)
    elif result_type == 'doc_term_mtx':
        dense = matutils.corpus2dense(bow_corpus, num_terms=len(dictionary.token2id))
        out_table = pd.DataFrame(dense.T)
        out_table.insert(loc=0, column=' ', value=doc_idx)
        out_table.columns = np.append('', terms)
    elif result_type == 'term_doc_mtx':
        dense = matutils.corpus2dense(bow_corpus, num_terms=len(dictionary.token2id))
        out_table = pd.DataFrame(dense)
        out_table.insert(loc=0, column=' ', value=terms)
        out_table.columns = np.append('', doc_idx)
    else:
        raise_runtime_error("Please check 'result_type'.")

    # NOTE(review): this model dict is assembled but never returned.
    rb = BrtcReprBuilder()
    model = _model_dict('doc_term_mtx')
    model['bow_corpus'] = bow_corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
def image_load(path, labeling='dir', image_col='image', n_sample=None, size_limit=640,
               auto_resize_limit=False):
    """Load image files below ``path`` into a table of encoded images.

    With ``labeling='dir'`` files are searched one directory level down
    (``path/*/*``) and the name of each file's parent directory becomes its
    label; otherwise files are read directly from ``path/*`` without labels.
    Files that ``cv2.imread`` cannot decode are skipped.

    Args:
        path: Root directory to search.
        labeling: 'dir' to label by parent directory name; any other value
            loads without labels.
        image_col: Name of the output image column; the label column is
            named ``<image_col>_label``.
        n_sample: When given, that many files are drawn with
            ``random.sample`` before reading.
        size_limit: Maximum allowed value of ``max(height, width)``.
        auto_resize_limit: When True, images are passed through
            ``resize_limit(size_limit)`` if any image exceeds the limit;
            when False an oversized image is reported through
            ``raise_runtime_error``.

    Returns:
        ``{'out_table': DataFrame}``.
    """
    label_by_dir = labeling == 'dir'
    pattern = '''{}/*/*''' if label_by_dir else '''{}/*'''

    images_file_list = glob.glob(pattern.format(path))
    if n_sample is not None:
        images_file_list = random.sample(images_file_list, n_sample)

    # Keep (pixels, file path) pairs and drop files that could not be decoded,
    # so images and labels stay aligned.
    decoded = [(cv2.imread(file_path), file_path) for file_path in images_file_list]
    readable = [(pixels, file_path) for pixels, file_path in decoded if pixels is not None]

    if label_by_dir:
        label = [os.path.split(os.path.dirname(os.path.abspath(file_path)))[1]
                 for _, file_path in readable]
    else:
        label = None

    loaded_images = [Image(pixels, origin=file_path) for pixels, file_path in readable]

    # Check the size predicate itself; the previous form tested the truthiness
    # of the filtered Image objects.
    has_big_image = any(max(img.height, img.width) > size_limit for img in loaded_images)
    if has_big_image:
        if auto_resize_limit:
            encoded_images = [img.resize_limit(size_limit).tobytes() for img in loaded_images]
        else:
            raise_runtime_error(
                'Cannot load images with size over {}px.'.format(size_limit))
    else:
        encoded_images = [img.tobytes() for img in loaded_images]

    label_col = '{}_label'.format(image_col)
    out_df = pd.DataFrame({image_col: encoded_images})
    if label is not None:
        out_df[label_col] = label

    return {'out_table': out_df}
def evaluate_ranking_algorithm(table1, table2, user_col, item_col, evaluation_measure,
                               rating_col=None, rating_edge=None, k_values=None):
    """Score a top-N recommendation list (``table2``) against interactions (``table1``).

    ``table2`` must start with a 'user_name' or 'user' column followed by
    item/rating column pairs. When both ``rating_col`` and ``rating_edge`` are
    given, only interactions rated above ``rating_edge`` count as relevant.

    ``evaluation_measure`` may contain 'prec' and 'ndcg' (evaluated for each
    value in ``k_values``) and 'map'.

    Returns ``{'out_table': DataFrame}`` with columns 'measure' and 'value'.
    """
    none_str = 'None'
    schema_message = ("topN-list data schema should consist of "
                      "[user_name, item_top1, rating_top1, .... item_topN, rating_topN]")

    # The item encoder also knows the placeholder used for missing recommendations.
    known_items = table1[item_col].values.tolist()
    known_items.append(none_str)
    item_encoder = preprocessing.LabelEncoder()
    item_encoder.fit(known_items)

    if table2.columns[0] not in ('user_name', 'user'):
        raise_runtime_error(schema_message)

    user_encoder = preprocessing.LabelEncoder()
    user_encoder.fit(table2[table2.columns[0]])

    if rating_col is not None and rating_edge is not None:
        relevant = table1[table1[rating_col] > rating_edge]
    else:
        relevant = table1
    relevant = relevant[relevant[user_col].isin(user_encoder.classes_)]

    user_codes = user_encoder.transform(relevant[user_col])
    item_codes = item_encoder.transform(relevant[item_col])

    num_users = len(user_encoder.classes_)
    documents = {user_code: [] for user_code in range(num_users)}
    for user_code, item_code in zip(user_codes, item_codes):
        documents[user_code].append(item_code)

    columns = []
    for rank in range(1, len(table2.columns) // 2 + 1):
        item_name = table2.columns[2 * rank - 1]
        rating_name = table2.columns[2 * rank]
        if item_name not in ('item_%d' % rank, 'item_top%d' % rank) \
                and rating_name not in ('rating_%d' % rank, 'rating_top%d' % rank):
            raise_runtime_error(schema_message)
        columns.append(item_name)

    recommend_table = table2[columns].replace('', none_str).fillna(none_str).values
    for row_idx in range(len(recommend_table)):
        recommend_table[row_idx] = item_encoder.transform(recommend_table[row_idx])

    result = []
    if k_values is not None:
        if 'prec' in evaluation_measure:
            result.extend(['precision_{}'.format(k_value),
                           _precision_k(k_value, num_users, documents, recommend_table)]
                          for k_value in k_values)
        if 'ndcg' in evaluation_measure:
            result.extend(['ndcg_{}'.format(k_value),
                           _ndcg_k(k_value, num_users, documents, recommend_table)]
                          for k_value in k_values)
    if 'map' in evaluation_measure:
        result.append(['meanAveragePrecision', _map(num_users, documents, recommend_table)])

    result = pd.DataFrame(result, columns=['measure', 'value'])
    return {'out_table': result}
def _kmeans_predict(table, model, prediction_col='prediction'):
    """Assign a cluster to every row of ``table`` with a stored k-means model.

    Supports python-context models of type 'kmeans' (estimator stored under
    'model') and 'kmeans_silhouette' (estimator stored under 'best_model').
    Anything else is reported through ``raise_runtime_error``.

    Returns ``{'out_table': DataFrame}`` - a copy of ``table`` with the
    predictions in ``prediction_col``.
    """
    estimator_key = None
    if model['_context'] == 'python':
        if model['_type'] == 'kmeans':
            estimator_key = 'model'
        elif model['_type'] == 'kmeans_silhouette':
            estimator_key = 'best_model'

    if estimator_key is None:
        raise_runtime_error("Unsupported model")

    estimator = model[estimator_key]
    assigned = estimator.predict(table[model['input_cols']])

    out_table = table.copy()
    out_table[prediction_col] = assigned
    return {'out_table': out_table}
def read_from_db(datasource, sql):
    """Run ``sql`` against the given datasource, refusing system-table access.

    Before executing, the query is lower-cased and tokenised; a ``FROM``
    target of the form ``<schema>.<table>`` whose table name is listed in
    ``sys_table_lists`` is rejected. This is a best-effort textual check, not
    a SQL parser.

    Args:
        datasource: Mapping expanded as keyword arguments of ``DbEngine``.
        sql: Query text; ``None`` is reported through ``raise_runtime_error``.

    Returns:
        ``{'table': DataFrame}``.

    Raises:
        Exception: when the query reads from a listed system table.
    """
    if sql is None:
        raise_runtime_error('sql is required parameter')

    import re

    # Collapse every whitespace run (spaces, tabs, newlines) so a target that
    # follows FROM on the next line is still inspected.
    padded = sql.lower().replace("(", " ( ").replace(")", " ) ")
    normalized = re.sub(r'\s+', ' ', padded).replace(". ", ".")
    tokens = normalized.split(" ")

    # Pair each token with its successor; a trailing 'from' simply has no
    # successor instead of raising IndexError.
    for keyword, target in zip(tokens, tokens[1:]):
        if keyword != 'from':
            continue
        qualified = target.split('.')
        if len(qualified) == 2 and qualified[1] in sys_table_lists:
            raise Exception(
                'Cannot access system tables from Brightics: {}'.format(target))

    with DbEngine(**datasource) as engine:
        df = pd.read_sql_query(sql, engine)
        util.validate_column_name(df)
        return {'table': df}
def image_unload(table, input_col, path, type='png', label_col=None, labelling='dir'):
    """Write every image in ``table[input_col]`` to ``<path>/<row position>.<type>``.

    The column must be an image column (checked with ``_is_image_col``); the
    target directory is created when missing. Each written file name is
    printed.

    NOTE(review): nothing is written when ``label_col`` is given, and
    ``labelling`` is not used - confirm whether a labelled export is expected.
    """
    if not _is_image_col(table, input_col):
        raise_runtime_error(
            '{} is not an image type column.'.format(input_col))

    pathlib.Path(path).mkdir(parents=True, exist_ok=True)

    if label_col is not None:
        return

    for position, encoded in enumerate(table[input_col]):
        pixels = Image.from_bytes(encoded).data
        out_file_name = '{}/{}.{}'.format(path, position, type)
        print(out_file_name)
        cv2.imwrite(out_file_name, pixels)
def unload(table, partial_path, mode="overwrite"):
    """Write ``table`` to the data path derived from ``partial_path[0]``.

    Args:
        table: DataFrame to store.
        partial_path: Sequence whose first element is the data key.
        mode: "append" concatenates ``table`` after the data already stored
            at the path (when it can be read); any other value overwrites.

    The data root itself is refused as a target through
    ``raise_runtime_error``.
    """
    path = data_utils.make_data_path_from_key(partial_path[0])
    if path == gateway.data_root or path == gateway.data_root + '/':
        raise_runtime_error(
            'Please check a path String and a type of path. Cannot use a root of directory for the path.'
        )

    frame_to_write = table
    if mode == "append":
        # Read the existing data BEFORE the target directory is removed;
        # otherwise appending to directory-backed data silently overwrites it.
        try:
            old_frame = table_reader.read_parquet(
                util.make_data_path_from_key(partial_path[0]))
            frame_to_write = pd.concat([old_frame, table], axis=0, ignore_index=True)
        except Exception:
            # Best effort: nothing readable is stored yet, so write only the new rows.
            frame_to_write = table

    if os.path.isdir(path):
        shutil.rmtree(path)

    _write_dataframe(frame_to_write, path)
def _outlier_detection_lof_model(table, model, new_column_name='is_outlier'):
    """Label the rows of ``table`` with a stored Local Outlier Factor model.

    Rows are labelled 'in' when ``model['lof_model']`` predicts 1 and 'out'
    otherwise. ``model['result_type']`` decides what is returned:
    'add_prediction' keeps all rows with the label column, 'remove_outliers'
    keeps only inliers and drops the label column, 'both' keeps only inliers
    together with the label column.

    Returns ``{'out_table': DataFrame}``.
    """
    out_table = table.copy()
    result_type = model['result_type']

    verdicts = model['lof_model'].predict(out_table[model['input_cols']])
    out_table[new_column_name] = ['in' if verdict == 1 else 'out' for verdict in verdicts]

    # result_type is one of 'add_prediction', 'remove_outliers', 'both'
    if result_type in ('remove_outliers', 'both'):
        out_table = out_table[out_table[new_column_name] == 'in']
        if result_type == 'remove_outliers':
            out_table = out_table.drop(new_column_name, axis=1)
    elif result_type != 'add_prediction':
        raise_runtime_error("Please check 'result_type'.")

    return {'out_table': out_table}
def _term_term_mtx(table, model, input_col, result_type='sparse'):
    """Build a term-by-term matrix counting the documents in which two terms co-occur.

    Each document in ``table[input_col]`` is converted to a bag of words with
    ``model['dictionary']``; term counts are binarised before multiplying.

    ``result_type='sparse'`` returns one row per term pair from the strict
    upper triangle; ``'dense'`` returns the full matrix with a leading term
    column. In the dense case a ``None`` value of ``model['add_words']`` is
    replaced by an empty list on the passed-in model.

    Returns ``{'out_table': DataFrame}``.
    """
    corpus = table[input_col].tolist()
    dictionary = model['dictionary']
    bow_corpus = [dictionary.doc2bow(doc) for doc in corpus]

    doc_term = matutils.corpus2csc(bow_corpus).T
    # Only presence matters here, so every stored count becomes 1.
    doc_term.data = np.array([1] * len(doc_term.data))
    term_term = (doc_term.T @ doc_term).tocoo()

    if result_type == 'sparse':
        term_term = sparse.triu(term_term, k=1)
        out_table = pd.DataFrame([dictionary[term_id] for term_id in term_term.row],
                                 columns=['term1'])
        out_table['term2'] = [dictionary[term_id] for term_id in term_term.col]
        out_table['number_of_documents_containing_terms'] = term_term.data
    elif result_type == 'dense':
        if model['add_words'] is None:
            model['add_words'] = []
        # Words added on top of the original vocabulary are left out of the labels.
        num_origin = len(dictionary) - len(model['add_words'])
        terms = list(dictionary.token2id.keys())[:num_origin]
        out_table = pd.DataFrame(term_term.todense())
        out_table.insert(loc=0, column=' ', value=terms)
        out_table.columns = np.append(" ", terms)
    else:
        raise_runtime_error("Please check 'result_type'.")

    # NOTE(review): this model dict is assembled but never returned.
    rb = BrtcReprBuilder()
    model = _model_dict('term_term_mtx')
    model['term_term_mtx'] = term_term
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table}
def _explode(table, input_col):
    """Explode an array column into rows, or collapse a scalar column back into lists.

    When every value of ``table[input_col]`` is a list or ``np.ndarray`` the
    column is exploded: one output row per element, other columns repeated.
    When no value is a list/array the inverse is done: rows that agree on all
    other columns are merged and their ``input_col`` values collected into a
    list with missing values dropped. A column mixing both kinds is reported
    through ``raise_runtime_error``.

    Returns ``{'out_table': DataFrame}``.
    """
    col = table[input_col]
    array_flags = [isinstance(item, (np.ndarray, list)) for item in col]
    all_arrays = all(array_flags)
    no_arrays = all(not flag for flag in array_flags)

    if not (all_arrays or no_arrays):
        raise_runtime_error(
            "{} is an invalid column to explode or un-explode.".format(
                input_col))
    elif all_arrays:
        # explode
        values = np.array(col)
        flattened = np.concatenate(values).ravel()
        lengths = [len(item) for item in values]
        exploded = pd.Series(flattened, index=col.index.repeat(lengths), name=col.name)
        remaining = table.drop([input_col], axis=1)
        out_table = remaining.join(exploded).reindex(columns=table.columns, copy=False)
    else:
        # un-explode
        group_cols = table.columns.tolist()
        group_cols.remove(input_col)

        # Pick a helper column name that does not clash with existing columns.
        group_id = 'tmp_group_id'
        while group_id in table.columns:
            group_id += '_'

        group_idx = table[group_cols].drop_duplicates()
        group_idx[group_id] = np.arange(group_idx.shape[0])
        table = table.merge(group_idx, on=group_cols)

        collected = table.groupby(group_id)[input_col].apply(list).reset_index(name=input_col)
        out_table = collected.merge(group_idx, on=group_id)
        out_table = out_table.reindex(columns=table.columns).drop(group_id, axis=1)
        out_table[input_col] = out_table[input_col].map(
            lambda lst: [item for item in lst if not isna(item)])

    return {'out_table': out_table}
def _outlier_detection_tukey_carling(table, input_cols, outlier_method='tukey', multiplier=None,
                                     number_of_removal=1, result_type='add_prediction',
                                     new_column_prefix='is_outlier_'):
    """Flag outliers per column with the Tukey or Carling rule.

    For every column in ``input_cols`` a ``<new_column_prefix><col>`` column
    is added, filled by ``_tukey`` (quartiles and IQR) or ``_carling`` (median
    and IQR).

    Args:
        table: Input DataFrame.
        input_cols: List of columns to inspect.
        outlier_method: 'tukey' or 'carling'.
        multiplier: IQR multiplier; defaults to 1.5 for Tukey and 2.3 for
            Carling when ``None``.
        number_of_removal: A row is removed when at least this many of its
            flag columns equal 'out'.
        result_type: 'add_prediction' keeps all rows with the flag columns,
            'remove_outliers' removes flagged rows and the flag columns,
            'both' removes flagged rows but keeps the flag columns.
        new_column_prefix: Prefix of the generated flag columns.

    Returns:
        ``{'out_table': DataFrame, 'model': dict}``; the model stores the
        statistics so the same rule can be re-applied later.
    """
    out_table = table.copy()

    # Compute the statistics on the inspected columns only: reducing the whole
    # table raises on non-numeric columns in pandas >= 2.0.
    inspected = out_table[input_cols]
    median = inspected.median()
    q1s = inspected.quantile(0.25)
    q3s = inspected.quantile(0.75)
    iqrs = q3s - q1s

    if outlier_method == 'tukey':
        if multiplier is None:
            multiplier = 1.5

        def classify(value, col):
            return _tukey(value, q1s[col], q3s[col], iqrs[col], multiplier)
    elif outlier_method == 'carling':
        if multiplier is None:
            multiplier = 2.3

        def classify(value, col):
            return _carling(value, median[col], iqrs[col], multiplier)
    else:
        raise_runtime_error("Please check 'outlier_method'.")

    output_col_names = []
    for col in input_cols:
        output_col_name = '{prefix}{col}'.format(prefix=new_column_prefix, col=col)
        output_col_names.append(output_col_name)
        out_table[output_col_name] = out_table[col].apply(
            lambda value, col=col: classify(value, col))

    # result_type is one of 'add_prediction', 'remove_outliers', 'both'
    if result_type in ('remove_outliers', 'both'):
        keep = out_table[output_col_names].apply(
            lambda row: np.sum(row == 'out') < number_of_removal, axis=1)
        out_table = out_table[keep.values]
        if result_type == 'remove_outliers':
            out_table = out_table.drop(output_col_names, axis=1)
    elif result_type != 'add_prediction':
        raise_runtime_error("Please check 'result_type'.")

    params = {
        'Input Columns': input_cols,
        'Outlier Method': outlier_method,
        'Multiplier': multiplier,
        'Number of Outliers in a Row': number_of_removal,
        'Result Type': result_type,
        'New Column Prefix': new_column_prefix
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Outlier Detection (Tukey/Carling) Result
    | ### Parameters
    |
    | {display_params}
    |
    """.format(display_params=dict2MD(params))))

    model = _model_dict('outlier_detection_tukey_carling')
    model['params'] = params
    model['input_cols'] = input_cols
    model['outlier_method'] = outlier_method
    model['multiplier'] = multiplier
    model['number_of_removal'] = number_of_removal
    model['result_type'] = result_type
    model['median'] = median
    model['q1'] = q1s
    model['q3'] = q3s
    model['iqr'] = iqrs
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'model': model}
def _check_image_col(table, input_col):
    """Report through ``raise_runtime_error`` when ``input_col`` is not an image column."""
    if _is_image_col(table, input_col):
        return
    raise_runtime_error(
        "input column {} is not an image column.".format(input_col))
def _join_cross_table_names(names, single_level, margins):
    """Flatten crosstab index/column labels into plain strings."""
    if single_level:
        return [str(name) for name in names]
    if margins:
        # With margins the last label is the totals entry; only its first
        # element is used as its name.
        return ['_'.join(str(part) for part in name) for name in names[:-1]] + [names[-1][0]]
    return ['_'.join(str(part) for part in name) for name in names]


def _cross_table(table, input_cols_1, input_cols_2, result='N', margins=False):
    """Compute a cross tabulation of ``input_cols_1`` (rows) by ``input_cols_2`` (columns).

    Args:
        table: Input DataFrame.
        input_cols_1: Columns forming the rows of the cross table.
        input_cols_2: Columns forming the columns of the cross table.
        result: 'N' for counts, or 'N / Row Total', 'N / Column Total',
            'N / Total' for the corresponding normalisation. Any other value
            is reported through ``raise_runtime_error``.
        margins: Whether to add row/column totals.

    Returns:
        ``{'model': dict}``; the model holds the result table (first column
        named after ``result`` and carrying the row labels) and a markdown
        report.
    """
    normalize_by_result = {
        'N': False,
        'N / Row Total': 'index',
        'N / Column Total': 'columns',
        'N / Total': 'all',
    }
    if result not in normalize_by_result:
        raise_runtime_error("Please check 'result'.")

    df1 = [table[col] for col in input_cols_1]
    df2 = [table[col] for col in input_cols_2]
    result_table = pd.crosstab(df1, df2, margins=margins,
                               normalize=normalize_by_result[result])

    # Multi-column groupings produce tuple labels; join them into single names.
    joined_row_name = _join_cross_table_names(
        list(result_table.index), len(input_cols_1) == 1, margins)
    joined_column_name = _join_cross_table_names(
        list(result_table.columns), len(input_cols_2) == 1, margins)

    # The row labels become the first column, headed by the result type.
    result_table.insert(loc=0, column=' ', value=joined_row_name)
    result_table.columns = np.append(result, joined_column_name)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Cross Table Result
    | ### Result Type : {result}
    |
    | #### Result Table
    |
    | {result_table}
    |
    """.format(result=result,
               result_table=pandasDF2MD(result_table, num_rows=len(result_table.index) + 1))))

    model = _model_dict('cross_table')
    model['result'] = result
    model['result_table'] = result_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def raise_runtime_error(error_message, true_condition=False):
    """Forward to ``common_validation.raise_runtime_error`` with the same arguments."""
    common_validation.raise_runtime_error(error_message, true_condition)
def _penalized_linear_regression_train(table, feature_cols, label_col, regression_type='ridge',
                                       alpha=1.0, l1_ratio=0.5, fit_intercept=True,
                                       max_iter=1000, tol=0.0001, random_state=None):
    """Train a ridge, lasso or elastic-net regression and build its report.

    Args:
        table: Input DataFrame.
        feature_cols: Feature columns (resolved through ``check_col_type``).
        label_col: Name of the label column.
        regression_type: 'ridge', 'lasso' or 'elastic_net'; anything else is
            reported through ``raise_runtime_error``.
        alpha: Penalty weight.
        l1_ratio: Elastic-net mixing parameter (elastic net only).
        fit_intercept: Whether to fit an intercept.
        max_iter: Maximum number of iterations (lasso / elastic net).
        tol: Solver tolerance.
        random_state: Seed forwarded to the estimator.

    Returns:
        ``{'model': dict}``; the model holds the fitted estimator, the
        coefficient table, the input table extended with 'predict' and
        'residual' columns, and a markdown report with diagnostic plots.
    """
    out_table = table.copy()
    feature_names, features = check_col_type(out_table, feature_cols)
    label = out_table[label_col]

    if regression_type == 'ridge':
        # NOTE(review): ridge is built with max_iter=None although the report
        # lists ``max_iter`` - confirm whether the parameter should be forwarded.
        regression_model = Ridge(alpha=alpha, fit_intercept=fit_intercept, max_iter=None,
                                 tol=tol, solver='auto', random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha, fit_intercept=fit_intercept, max_iter=max_iter,
                                 tol=tol, random_state=random_state, selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio,
                                      fit_intercept=fit_intercept, max_iter=max_iter, tol=tol,
                                      random_state=random_state, selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")

    # Fit exactly once; the coefficient table and the stored model then
    # describe the same fit.
    regression_model.fit(features, label)

    out_table1 = pd.DataFrame({
        'x_variable_name': list(feature_names),
        'coefficient': regression_model.coef_
    })
    if fit_intercept:
        intercept = pd.DataFrame([['intercept', regression_model.intercept_]],
                                 columns=['x_variable_name', 'coefficient'])
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        out_table1 = pd.concat([out_table1, intercept], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict
    out_table['predict'] = predict
    out_table['residual'] = residual

    params = {
        'Feature Columns': feature_names,
        'Label Column': label_col,
        'Regression Type': regression_type,
        'Regularization (Penalty Weight)': alpha
    }
    if regression_type == 'elastic_net':
        params['L1 Ratio'] = l1_ratio
    params.update({
        'Fit Intercept': fit_intercept,
        'Maximum Number of Iterations': max_iter,
        'Tolerance': tol
    })

    score = {
        'MSE': mean_squared_error(label, predict),
        'R2': r2_score(label, predict)
    }

    # Predicted vs actual, with the identity line as reference.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    # Residuals vs predicted values.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    # Q-Q plot of the residuals.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    # Residual distribution.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # checking the magnitude of coefficients
    plt.figure()
    predictors = feature_names
    coef = Series(regression_model.coef_, predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters:
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params), out_table1=pandasDF2MD(out_table1),
               score=dict2MD(score))))
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict, image2=fig_residual_1, image3=fig_residual_2,
               image4=fig_residual_3, image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['parameters'] = params
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _lda4(table, input_col, topic_name='topic', num_voca=1000, num_topic=5,
          num_topic_word=10, max_iter=20, learning_method='online',
          learning_offset=10., random_state=None):
    """Fit an LDA topic model on a text column and label every document.

    Parameters
    ----------
    table : pandas.DataFrame
        Input table; it is copied, never modified in place.
    input_col : str
        Column holding the documents. If the first entry is a
        ``numpy.ndarray`` the documents are treated as token arrays and are
        joined with spaces before vectorising.
    topic_name : str
        Name of the output column receiving the 1-based dominant topic. A
        second column ``topic_name + '_distribution'`` receives the full
        per-document topic distribution.
    num_voca : int
        ``max_features`` passed to ``CountVectorizer``.
    num_topic : int
        Number of LDA components.
    num_topic_word : int
        Number of top-weighted terms reported per topic.
    max_iter : int
        Maximum number of LDA iterations.
    learning_method : {'online', 'batch'}
        LDA learning method.
    learning_offset : float
        Only forwarded to the estimator when ``learning_method == 'online'``.
    random_state : int or None
        Seed forwarded to the estimator.

    Returns
    -------
    dict
        ``{'out_table': ..., 'topic_table': ..., 'model': ...}``.

    Raises
    ------
    BrighticsFunctionException
        If ``topic_name`` or ``topic_name + '_distribution'`` already exists
        in ``table``.
    Whatever ``raise_runtime_error`` raises
        If ``learning_method`` is neither ``'online'`` nor ``'batch'``.
    """
    # Validate the cheap-to-check arguments before any model fitting so that
    # a bad argument does not cost a full vectorise + LDA fit.
    if learning_method not in ('online', 'batch'):
        raise_runtime_error("Please check 'learning_method'.")
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100': "Existing table contains Topic Column Name. Please choose again."
        }])

    # generate model
    corpus = np.array(table[input_col])
    vectorizer_kwargs = dict(stop_words='english', max_df=0.95, min_df=2,
                             max_features=num_voca)
    if isinstance(corpus[0], np.ndarray):
        # Pre-tokenised documents: glue the tokens back into one string.
        vectorizer_kwargs['preprocessor'] = ' '.join
    tf_vectorizer = CountVectorizer(**vectorizer_kwargs)
    term_count = tf_vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = list(tf_vectorizer.get_feature_names_out())
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    lda_kwargs = dict(n_components=num_topic, max_iter=max_iter,
                      learning_method=learning_method,
                      random_state=random_state)
    if learning_method == 'online':
        lda_kwargs['learning_offset'] = learning_offset
    lda_model = LatentDirichletAllocation(**lda_kwargs).fit(term_count)

    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        ranked = sorted(
            ((abs(value), tf_feature_names[term_idx])
             for term_idx, value in enumerate(vector)),
            key=lambda pair: pair[0], reverse=True)
        top_terms = ranked[:num_topic_word]
        vocab_weights_list.append(
            ["{}: {}".format(term, weight) for weight, term in top_terms])
        vocab_list.append([term for _, term in top_terms])
        weights_list.append([weight for weight, _ in top_terms])

    topic_table = pd.DataFrame(
        {
            'index': list(range(1, len(vocab_list) + 1)),
            'vocabularies_weights': vocab_weights_list,
            'vocabularies': vocab_list,
            'weights': weights_list,
        },
        columns=['index', 'vocabularies_weights', 'vocabularies', 'weights'])

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = table.copy(deep=True)
    # Topics are reported 1-based, matching the 'index' column above.
    out_table[topic_name] = doc_topic.argmax(axis=1) + 1
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'Number of topics': num_topic,
        'Number of words for each topic': num_topic_word,
        'Maximum number of iterations': max_iter,
        'Learning method': learning_method,
        'Learning offset': learning_offset,
        'Seed': random_state
    }
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood, perplexity=perplexity,
               params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
def _lda(table, input_col, num_voca=1000, num_topic=3, num_topic_word=3,
         max_iter=20, learning_method='online', learning_offset=10.,
         random_state=None):
    """Fit an LDA topic model on a text column and build a report-only model.

    Parameters
    ----------
    table : mapping of column name to documents (presumably a
        pandas.DataFrame - only ``table[input_col]`` is used here).
    input_col : str
        Column holding the raw text documents.
    num_voca : int
        ``max_features`` passed to ``CountVectorizer``.
    num_topic : int
        Number of LDA components.
    num_topic_word : int
        Number of top-weighted terms listed per topic.
    max_iter : int
        Maximum number of LDA iterations.
    learning_method : {'online', 'batch'}
        LDA learning method.
    learning_offset : float
        Only forwarded to the estimator when ``learning_method == 'online'``.
    random_state : int or None
        Seed forwarded to the estimator.

    Returns
    -------
    dict
        ``{'model': model}`` where ``model`` carries the parameters, the
        topic table (0-based ``"Topic k"`` labels), the per-document top
        topic, and the rendered report.

    Raises
    ------
    Whatever ``raise_runtime_error`` raises
        If ``learning_method`` is neither ``'online'`` nor ``'batch'``.
    """
    # Reject an unsupported learning method before doing any fitting work.
    if learning_method not in ('online', 'batch'):
        raise_runtime_error("Please check 'learning_method'.")

    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=num_voca,
                                    stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = list(tf_vectorizer.get_feature_names_out())
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    lda_kwargs = dict(n_components=num_topic, max_iter=max_iter,
                      learning_method=learning_method,
                      random_state=random_state)
    if learning_method == 'online':
        lda_kwargs['learning_offset'] = learning_offset
    lda_model = LatentDirichletAllocation(**lda_kwargs).fit(term_count)

    # One row per topic: its label and its top terms rendered "term: weight".
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        ranked = sorted(
            ((abs(value), tf_feature_names[term_idx])
             for term_idx, value in enumerate(weights)),
            key=lambda pair: pair[0], reverse=True)
        voca_weights_list.append(
            ["{}: {}".format(term, weight)
             for weight, term in ranked[:num_topic_word]])

    topic_model = pd.DataFrame([])
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list

    # One row per document: the text and the label of its dominant topic.
    doc_topic = lda_model.transform(term_count)
    doc_classification = pd.DataFrame()
    doc_classification['documents'] = list(corpus)
    doc_classification['top topic'] = [
        "Topic {}".format(top) for top in doc_topic.argmax(axis=1)
    ]

    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iterations': max_iter,
        'Learning Method': learning_method,
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification,
                                              num_rows=len(corpus) + 1))))

    model = _model_dict('lda')
    model['parameter'] = params
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
def _lda3(table, input_col, topic_name='topic', num_voca=1000, num_topic=3,
          num_topic_word=3, max_iter=20, learning_method='online',
          learning_offset=10., random_state=None):
    """Fit an LDA topic model and append each document's dominant topic.

    Parameters
    ----------
    table : pandas.DataFrame
        Input table; it is copied, never modified in place.
    input_col : str
        Column holding the documents. If the first entry is a
        ``numpy.ndarray`` the documents are treated as token arrays and are
        joined with spaces before vectorising.
    topic_name : str
        Name of the output column receiving the 0-based dominant topic.
    num_voca : int
        ``max_features`` passed to ``CountVectorizer``.
    num_topic : int
        Number of LDA components.
    num_topic_word : int
        Number of top-weighted terms attached per document.
    max_iter : int
        Maximum number of LDA iterations.
    learning_method : {'online', 'batch'}
        LDA learning method.
    learning_offset : float
        Only forwarded to the estimator when ``learning_method == 'online'``.
    random_state : int or None
        Seed forwarded to the estimator.

    Returns
    -------
    dict
        ``{'out_table': out_table}`` - the input table plus ``topic_name``
        and a ``'topic_vocabularies'`` column listing the dominant topic's
        top terms as ``"term: weight"`` strings.

    Raises
    ------
    BrighticsFunctionException
        If ``topic_name`` already exists in ``table``.
    Whatever ``raise_runtime_error`` raises
        If ``learning_method`` is neither ``'online'`` nor ``'batch'``.
    """
    # Validate the cheap-to-check arguments before any model fitting so that
    # a bad argument does not cost a full vectorise + LDA fit.
    if learning_method not in ('online', 'batch'):
        raise_runtime_error("Please check 'learning_method'.")
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100': "Existing table contains Topic Column Name. Please choose again."
        }])
    # NOTE(review): an existing 'topic_vocabularies' column is silently
    # overwritten below; left as-is to keep current behaviour.

    corpus = np.array(table[input_col])
    vectorizer_kwargs = dict(stop_words='english', max_df=0.95, min_df=2,
                             max_features=num_voca)
    if isinstance(corpus[0], np.ndarray):
        # Pre-tokenised documents: glue the tokens back into one string.
        vectorizer_kwargs['preprocessor'] = ' '.join
    tf_vectorizer = CountVectorizer(**vectorizer_kwargs)
    term_count = tf_vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        tf_feature_names = list(tf_vectorizer.get_feature_names_out())
    else:
        tf_feature_names = tf_vectorizer.get_feature_names()

    lda_kwargs = dict(n_components=num_topic, max_iter=max_iter,
                      learning_method=learning_method,
                      random_state=random_state)
    if learning_method == 'online':
        lda_kwargs['learning_offset'] = learning_offset
    lda_model = LatentDirichletAllocation(**lda_kwargs).fit(term_count)

    # Top terms of every topic, rendered as "term: weight".
    voca_weights_list = []
    for weights in lda_model.components_:
        ranked = sorted(
            ((abs(value), tf_feature_names[term_idx])
             for term_idx, value in enumerate(weights)),
            key=lambda pair: pair[0], reverse=True)
        voca_weights_list.append(
            ["{}: {}".format(term, weight)
             for weight, term in ranked[:num_topic_word]])

    doc_topic = lda_model.transform(term_count)
    out_table = table.copy(deep=True)
    out_table[topic_name] = doc_topic.argmax(axis=1)
    # Read the topic ids back from the column (as the lookup key for the
    # term list) so the two output columns always agree.
    out_table['topic_vocabularies'] = [
        voca_weights_list[topic_idx] for topic_idx in out_table[topic_name]
    ]

    return {'out_table': out_table}
def _hierarchical_clustering(table, input_cols, input_mode='original',
                             key_col=None, link='complete', met='euclidean',
                             num_rows=20, figure_height=6.4, orient='right'):
    """Run agglomerative clustering and build a dendrogram report.

    Parameters
    ----------
    table : pandas.DataFrame
        Input table; it is copied, never modified in place.
    input_cols : list of str
        Feature columns (``'original'`` mode) or the columns of a distance
        matrix (``'matrix'`` mode).
    input_mode : {'original', 'matrix'}
        ``'original'`` computes pairwise distances from the feature rows;
        ``'matrix'`` treats the selected data as a square distance matrix.
    key_col : str or None
        Column supplying leaf labels. When ``None``, labels are ``pt_<i>``
        in ``'original'`` mode and the column names in ``'matrix'`` mode.
    link : str
        Linkage method passed to ``linkage``.
    met : str
        Distance metric passed to ``pdist`` / ``linkage``.
    num_rows : int
        Number of linkage-matrix rows shown in the report.
    figure_height : float
        Height of the dendrogram figure; the width is fixed at 8.4.
    orient : {'top', 'bottom', 'left', 'right'}
        Dendrogram orientation.

    Returns
    -------
    dict
        ``{'model': model}`` holding the raw linkage array, the labelled
        linkage table (last merge first), the parameters and the report.

    Raises
    ------
    Whatever ``raise_runtime_error`` raises
        If ``input_mode`` is neither ``'original'`` nor ``'matrix'``.
    """
    out_table = table.copy()
    features = out_table[input_cols]

    if input_mode == 'original':
        len_features = len(features)
        if key_col is not None:
            data_names = list(out_table[key_col])
        else:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        out_table['name'] = data_names
        Z = linkage(ssd.pdist(features, metric=met), method=link, metric=met)
    elif input_mode == 'matrix':
        len_features = len(input_cols)
        col_index = [out_table.columns.get_loc(column)
                     for column in input_cols]
        if key_col is not None:
            data_names = [out_table[key_col][loc] for loc in col_index]
        else:
            data_names = [out_table.columns[loc] for loc in col_index]
        # NOTE(review): rows are picked by each input column's position in
        # the whole table - presumably row i and column i describe the same
        # item; confirm against callers.
        # Explicit copy: the 'name' column is added below, and writing into
        # an iloc selection triggers pandas' SettingWithCopyWarning.
        dist_matrix = features.iloc[col_index].copy()
        Z = linkage(ssd.squareform(dist_matrix), method=link, metric=met)
        dist_matrix['name'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")

    # Row i of Z is merge step i; it is labelled CL_<n - i> so that the final
    # merge is CL_1.
    range_len_Z = range(len(Z))
    cluster_names = ['CL_%g' % (i + 1) for i in reversed(range_len_Z)]

    def _member_name(node_id):
        """Return the label of a leaf or of a previously formed cluster."""
        # Z stores node ids as floats; cast once so both lookups use ints.
        node_id = int(node_id)
        if node_id < len_features:
            return data_names[node_id]
        return cluster_names[node_id - len_features]

    linkage_matrix = pd.DataFrame([])
    linkage_matrix['linkage step'] = [
        '%g' % (x + 1) for x in reversed(range_len_Z)
    ]
    linkage_matrix['name of clusters'] = cluster_names
    linkage_matrix['joined column1'] = [_member_name(v) for v in Z[:, 0]]
    linkage_matrix['joined column2'] = [_member_name(v) for v in Z[:, 1]]
    linkage_matrix['distance'] = list(Z[:, 2])
    linkage_matrix['number of original'] = [
        int(entities) for entities in Z[:, 3]
    ]
    # Present the last merge first.
    linkage_matrix = linkage_matrix.reindex(index=linkage_matrix.index[::-1])

    # calculate full dendrogram
    plt.figure(figsize=(8.4, figure_height))
    dendrogram(Z,
               truncate_mode='none',
               get_leaves=True,
               orientation=orient,
               labels=data_names,
               leaf_rotation=45,
               leaf_font_size=10.,
               show_contracted=False)
    plt.title('Hierarchical Clustering Dendrogram')
    # Leaves run along the x axis for vertical layouts and along the y axis
    # for horizontal ones.
    if orient in ('top', 'bottom'):
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient in ('left', 'right'):
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt.tight_layout()
    dendrogram_md = plt2MD(plt)
    plt.clf()

    params = {
        'Input Columns': input_cols,
        'Input Mode': input_mode,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Result"""))
    rb.addMD(strip_margin("""
    |### Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    |{display_params}
    |
    |### Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=dendrogram_md,
               display_params=dict2MD(params),
               out_table1=pandasDF2MD(linkage_matrix.head(num_rows),
                                      num_rows=num_rows + 1))))

    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    model['table'] = out_table
    if input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()

    return {'model': model}