def read_from_db(datasource, sql):
    """Execute *sql* against *datasource* and return the result set.

    Returns {'table': DataFrame}; raises a runtime error when *sql* is None.
    """
    if sql is None:
        # sql is mandatory; fail with a user-facing message instead of later.
        raise_runtime_error('sql is required parameter')
    with DbEngine(**datasource) as engine:
        result = pd.read_sql_query(sql, engine)
        util.validate_column_name(result)
        return {'table': result}
def _logistic_regression_train(table, feature_cols, label_col, penalty='l2',
                               dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                               intercept_scaling=1, class_weight=None,
                               random_state=None, solver='liblinear',
                               max_iter=100, multi_class='ovr', verbose=0,
                               warm_start=False, n_jobs=1):
    """Train a scikit-learn LogisticRegression on *table* and build a report.

    Returns {'model': dict} with the fitted estimator, its coefficients and a
    markdown summary. Raises a runtime error when the label is continuous.
    """
    features = table[feature_cols]
    label = table[label_col]
    if sklearn_utils.multiclass.type_of_target(label) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')
    # Keyword arguments instead of a long positional list: the original relied
    # on the exact parameter order of the installed sklearn version.
    lr_model = LogisticRegression(penalty=penalty, dual=dual, tol=tol, C=C,
                                  fit_intercept=fit_intercept,
                                  intercept_scaling=intercept_scaling,
                                  class_weight=class_weight,
                                  random_state=random_state, solver=solver,
                                  max_iter=max_iter, multi_class=multi_class,
                                  verbose=verbose, warm_start=warm_start,
                                  n_jobs=n_jobs)
    lr_model.fit(features, label)
    intercept = lr_model.intercept_
    coefficients = lr_model.coef_
    classes = lr_model.classes_
    is_binary = len(classes) == 2
    # Removed leftover debug print() calls and the unused featureNames /
    # predict_proba computations from the original.
    if fit_intercept:
        summary = pd.DataFrame({'features': ['intercept'] + feature_cols})
        coef_trans = np.concatenate(([intercept], np.transpose(coefficients)), axis=0)
    else:
        summary = pd.DataFrame({'features': feature_cols})
        coef_trans = np.transpose(coefficients)
    # Binary models expose a single coefficient column; multiclass one per
    # class. NOTE(review): the binary column is labelled classes[0], matching
    # the original output — confirm against how consumers read this table.
    summary_columns = [classes[0]] if is_binary else classes
    summary = pd.concat((summary, pd.DataFrame(coef_trans, columns=summary_columns)), axis=1)
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Logistic Regression Result
    | ### Summary
    | {table1}
    """.format(table1=pandasDF2MD(summary))))
    model = dict()
    model['features'] = feature_cols
    model['label'] = label_col
    model['intercept'] = lr_model.intercept_
    model['coefficients'] = lr_model.coef_
    model['class'] = lr_model.classes_
    model['penalty'] = penalty
    model['solver'] = solver
    model['lr_model'] = lr_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _svm_classification_train(table, feature_cols, label_col, c=1.0,
                              kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                              shrinking=True, probability=True, tol=1e-3,
                              max_iter=-1, random_state=None):
    """Fit an SVC classifier on the given feature/label columns.

    Returns {'model': dict} holding the fitted estimator, the feature list
    and a markdown report of the parameters actually used.
    """
    validate(greater_than(c, 0.0, 'c'))
    work_table = table.copy()
    train_features = work_table[feature_cols]
    train_labels = work_table[label_col]
    # Classification only: a continuous target is rejected up front.
    if sklearn_utils.multiclass.type_of_target(train_labels) == 'continuous':
        raise_runtime_error('''Label Column should not be continuous.''')
    classifier = svm.SVC(C=c, kernel=kernel, degree=degree, gamma=gamma,
                         coef0=coef0, shrinking=shrinking,
                         probability=probability, tol=tol, max_iter=max_iter,
                         random_state=random_state)
    fitted = classifier.fit(train_features, train_labels)
    report_params = classifier.get_params()
    report_params['feature_cols'] = feature_cols
    report_params['label_col'] = label_col
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## SVM Classification Result
    | ### Parameters
    | {table_parameter}
    """.format(table_parameter=dict2MD(report_params))))
    out_model = _model_dict('svc_model')
    out_model['svc_model'] = fitted
    out_model['features'] = feature_cols
    out_model['_repr_brtc_'] = rb.get()
    return {'model': out_model}
def _kmeans_predict(table, model, prediction_col='prediction'): if model['_context'] == 'python' and model['_type'] == 'kmeans': k_means = model['model'] input_cols = model['input_cols'] predict = k_means.predict(table[input_cols]) out_table = table.copy() out_table[prediction_col] = predict elif model['_context'] == 'python' and model['_type'] == 'kmeans_silhouette': k_means = model['best_model'] input_cols = model['input_cols'] predict = k_means.predict(table[input_cols]) out_table = table.copy() out_table[prediction_col] = predict else: raise_runtime_error("Unsupported model") # raise Exception("Unsupported model") return {'out_table':out_table}
def _one_hot_encoder(table, input_cols, prefix='list', prefix_list=None,
                     suffix='index', n_values='auto',
                     categorical_features='all', sparse=True,
                     handle_unknown='error'):
    """One-hot encode each column in *input_cols*, appending the indicator
    columns to a copy of *table*.

    Returns {'out_table': ..., 'model': ...}; the model keeps the fitted
    label/one-hot encoders so the transform can be replayed on new data.
    """
    out_table = table.copy()
    # Dense output is required so the encoded matrix can be concatenated onto
    # the DataFrame below; the 'sparse' argument is intentionally overridden.
    sparse = False
    enc_list = []
    le_list = []
    prefix_list_index = 0
    if prefix == 'list':
        len_prefix_list = 0 if prefix_list is None else len(prefix_list)
        if len(input_cols) != len_prefix_list:
            # TODO: make the error message code
            # Fixed typo in the user-facing message ('numnber' -> 'number').
            raise_runtime_error(
                'The number of Input Columns and the number of Prefixes should be equal.'
            )
    for col_name in input_cols:
        enc = OneHotEncoder(n_values=n_values,
                            categorical_features=categorical_features,
                            sparse=sparse, handle_unknown=handle_unknown)
        le = LabelEncoder()
        # New column labels: '<base>_<i>' when suffix == 'index', otherwise
        # '<base>_<category>', where base is the user prefix or the column name.
        base = prefix_list[prefix_list_index] if prefix == 'list' else col_name
        uniques = np.unique(out_table[col_name].values)
        if suffix == 'index':
            new_col_names = [base + '_' + str(i) for i in range(len(uniques))]
        else:
            new_col_names = [base + '_' + stri for stri in uniques]
        out_table = pd.concat([
            out_table.reset_index(drop=True),
            pd.DataFrame(enc.fit_transform(
                le.fit_transform(out_table[col_name]).reshape(-1, 1)),
                columns=new_col_names)
        ], axis=1)
        enc_list.append(enc)
        le_list.append(le)
        prefix_list_index = prefix_list_index + 1
    out_model = _model_dict('one_hot_encoder')
    out_model['one_hot_encoder_list'] = enc_list
    out_model['label_encoder_list'] = le_list
    out_model['input_cols'] = input_cols
    # NOTE(review): the fields below record only the LAST fitted encoder's
    # attributes and assume input_cols is non-empty (enc/le otherwise unbound)
    # — confirm whether per-column attributes were intended.
    out_model['classes'] = le.classes_
    out_model['active_features'] = enc.active_features_
    out_model['feature_indices'] = enc.feature_indices_
    out_model['n_values'] = enc.n_values_
    out_model['prefix'] = prefix
    out_model['prefix_list'] = prefix_list
    out_model['suffix'] = suffix
    return {'out_table': out_table, 'model': out_model}
def write_to_db(table, tableName, datasource, ifExists='fail'):
    """Persist *table* into the database as *tableName*.

    Raises a runtime error unless *table* is a pandas DataFrame; *ifExists*
    is forwarded to DataFrame.to_sql (fail/replace/append).
    """
    if not isinstance(table, pd.DataFrame):
        raise_runtime_error('table is not pandas.DataFrame')
    with DbEngine(**datasource) as engine:
        table.to_sql(tableName, engine, if_exists=ifExists, index=False)
def _glm_train(table, feature_cols, label_col, family="Gaussian", link="ident",
               fit_intercept=True):
    """Fit a statsmodels GLM and return {'model': dict} with a report.

    Raises a runtime error when the label column is also a feature, or when
    *family*/*link* is not one of the supported names.
    """
    features = table[feature_cols]
    label = table[label_col]
    if label_col in feature_cols:
        raise_runtime_error("%s is duplicated." % label_col)
    # Map UI names to statsmodels family / link classes.
    family_map = {
        "Gaussian": sm.families.Gaussian,
        "inv_Gaussian": sm.families.InverseGaussian,
        "binomial": sm.families.Binomial,
        "Poisson": sm.families.Poisson,
        "neg_binomial": sm.families.NegativeBinomial,
        "gamma": sm.families.Gamma,
        "Tweedie": sm.families.Tweedie,
    }
    link_map = {
        "ident": sm.families.links.identity,
        "log": sm.families.links.log,
        "logit": sm.families.links.logit,
        "probit": sm.families.links.probit,
        "cloglog": sm.families.links.cloglog,  # fixed: 'cLogLog' does not exist
        "pow": sm.families.links.Power,
        "nbinom": sm.families.links.nbinom,  # fixed: was links.binom (no such attr)
    }
    if family not in family_map:
        # Previously an unknown family fell through to a NameError.
        raise_runtime_error("Please check 'family'.")
    if link not in link_map:
        raise_runtime_error("Please check 'link'.")
    # The link must be supplied to the family constructor; sm.GLM has no
    # 'link' parameter, so the original link selection was silently ignored.
    sm_family = family_map[family](link=link_map[link])
    if fit_intercept:
        glm_model = sm.GLM(label, sm.add_constant(features), family=sm_family).fit()
    else:
        glm_model = sm.GLM(label, features, family=sm_family).fit()
    summary = glm_model.summary().as_html()
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## GLM Result
    | ### Summary
    |
    """))
    rb.addHTML(summary)
    model = _model_dict('glm_model')
    model['features'] = feature_cols
    model['label'] = label_col
    model['family'] = family
    model['link'] = link
    model['coefficients'] = glm_model.params
    model['aic'] = glm_model.aic
    model['bic'] = glm_model.bic
    model['tvalues'] = glm_model.tvalues
    model['pvalues'] = glm_model.pvalues
    model['fit_intercept'] = fit_intercept
    model['glm_model'] = glm_model
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000,
           idf_weighting_scheme='inverseDocumentFrequency', norm='l2',
           smooth_idf=True, sublinear_tf=False, output_type=False):
    """Compute per-document term frequencies and TF-IDF scores for a text column.

    Builds a long-format table with one row per (document, vocabulary term)
    pair, plus an IDF table; both are stored in the returned model dict along
    with a markdown report. Returns {'model': dict}.

    NOTE(review): corpus is indexed positionally (corpus[doc]) — assumes the
    input table has a default RangeIndex; verify against callers.
    """
    corpus = table[input_col]
    if max_df == None:
        # By default no document-frequency cap: allow terms in every document.
        max_df = len(corpus)
    # Raw term counts (frequency column) ...
    tf_vectorizer = CountVectorizer(stop_words='english', max_df=max_df,
                                    min_df=min_df, max_features=num_voca)
    tf_vectorizer.fit(corpus)
    voca_dict = tf_vectorizer.vocabulary_
    # ... and TF-IDF scores, fitted with the same vocabulary limits.
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=max_df,
                                       min_df=min_df, max_features=num_voca,
                                       norm=norm, use_idf=True,
                                       smooth_idf=smooth_idf,
                                       sublinear_tf=sublinear_tf)
    tfidf_vectorizer.fit(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()
    idf_table = pd.DataFrame()
    idf_table['vocabulary'] = tf_feature_names
    if idf_weighting_scheme == 'inverseDocumentFrequency':
        idf_table['idf weight'] = tfidf_vectorizer.idf_.tolist()
    elif idf_weighting_scheme == 'unary':
        # Unary scheme: every term carries the same (unit) idf weight.
        idf_table['idf weight'] = float(1)
    tfidf_table = pd.DataFrame()
    # One block of rows per document: every vocabulary term with its raw
    # frequency and tf-idf score in that document.
    for doc in range(len(corpus)):
        each_tfidf_table = pd.DataFrame()
        each_tfidf_table[input_col] = [
            str(corpus[doc]) for j in range(len(voca_dict.keys()))
        ]
        each_tfidf_table['vocabulary'] = voca_dict.keys()
        each_tfidf_table['index'] = voca_dict.values()
        each_tfidf_table['frequency'] = [
            np.ravel(tf_vectorizer.transform([corpus[doc]]).toarray())[idx]
            for idx in voca_dict.values()
        ]
        if idf_weighting_scheme == 'inverseDocumentFrequency':
            each_tfidf_table['tfidf score'] = [
                np.ravel(tfidf_vectorizer.transform([corpus[doc]
                                                     ]).toarray())[idx]
                for idx in voca_dict.values()
            ]
        elif idf_weighting_scheme == 'unary':
            # Divide the fitted tf-idf by the learned idf to recover a
            # unary-weighted score.
            each_tfidf_table['tfidf score'] = [
                np.ravel(tfidf_vectorizer.transform([corpus[doc]
                                                     ]).toarray())[idx]
                / float(tfidf_vectorizer.idf_[idx])
                for idx in voca_dict.values()
            ]
        # Order rows by vocabulary index so documents line up consistently.
        each_tfidf_table = each_tfidf_table.sort_values(by=['index'], axis=0)
        tfidf_table = pd.concat([tfidf_table, each_tfidf_table], axis=0)
    if output_type == False:
        pass
    elif output_type == True:
        # output_type=True drops rows where the term never occurs in the doc.
        remain_idx = tfidf_table['frequency'].apply(lambda x: x != 0)
        tfidf_table = tfidf_table[remain_idx.values]
    else:
        raise_runtime_error("Please check 'output_type'.")
    params = {
        'Input Column': input_col,
        'Max DF': max_df,
        'Min DF': min_df,
        'Number of Vocabularies': num_voca,
        'IDF Weighting Scheme': idf_weighting_scheme,
        'Norm': norm,
        'Smooth IDF': smooth_idf,
        'Sublinear TF': sublinear_tf,
        'Remove Zero Counts': output_type
    }
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# TF-IDF Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    |{display_params}
    |
    |### IDF Table
    |
    |{idf_table}
    |
    |### TFIDF Table
    |
    |{tfidf_table}
    |
    """.format(display_params=dict2MD(params),
               idf_table=pandasDF2MD(idf_table,
                                     num_rows=len(tf_feature_names) + 1),
               tfidf_table=pandasDF2MD(
                   tfidf_table,
                   num_rows=len(tf_feature_names) * len(corpus) + 1))))
    model = _model_dict('tfidf')
    model['idf_table'] = idf_table
    model['tfidf_table'] = tfidf_table
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _lda(table, input_col, num_voca=1000, num_topic=3, num_topic_word=3,
         max_iter=20, learning_method='online', learning_offset=10.,
         random_state=None):
    """Run Latent Dirichlet Allocation over a text column.

    Builds a topic table (top *num_topic_word* terms per topic) and a
    per-document top-topic classification. Returns {'model': dict}.
    """
    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=num_voca,
                                    stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()
    # learning_offset only applies to the online variant.
    if learning_method == 'online':
        estimator = LatentDirichletAllocation(n_components=num_topic,
                                              max_iter=max_iter,
                                              learning_method=learning_method,
                                              learning_offset=learning_offset,
                                              random_state=random_state)
    elif learning_method == 'batch':
        estimator = LatentDirichletAllocation(n_components=num_topic,
                                              max_iter=max_iter,
                                              learning_method=learning_method,
                                              random_state=random_state)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    lda_model = estimator.fit(term_count)
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        # Rank vocabulary terms by the magnitude of their topic weight.
        ranked = sorted(
            ((abs(value), tf_feature_names[term_idx])
             for term_idx, value in enumerate(weights)),
            key=lambda pair: pair[0], reverse=True)
        voca_weights_list.append([
            "{}: {}".format(term, weight)
            for weight, term in ranked[:num_topic_word]
        ])
    topic_model = pd.DataFrame([])
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list
    doc_topic = lda_model.transform(term_count)
    doc_classification = pd.DataFrame()
    doc_classification['documents'] = list(corpus)
    doc_classification['top topic'] = [
        "Topic {}".format(doc_topic[i].argmax()) for i in range(len(corpus))
    ]
    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iteration': max_iter,
        'Learning Method': learning_method,
    }
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification,
                                              num_rows=len(corpus) + 1))))
    model = _model_dict('lda')
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _hierarchical_clustering(table, input_cols, input_mode='original',
                             key_col=None, link='complete', met='euclidean',
                             num_rows=20, figure_height=6.4, orient='right'):
    """Run scipy hierarchical clustering and build a dendrogram report.

    input_mode='original' clusters the rows of *input_cols*;
    input_mode='matrix' treats the selected columns as a precomputed distance
    matrix. Returns {'model': dict} with the linkage matrix and report.

    NOTE(review): in matrix mode the rows are selected by the COLUMN positions
    of input_cols (features.iloc[col_index]) — assumes a square block whose
    row order matches the column order; confirm against callers.
    """
    out_table = table.copy()
    features = out_table[input_cols]
    if input_mode == 'original':
        len_features = len(features)
        # Row labels: the key column if given, otherwise synthetic pt_<i>.
        if key_col != None:
            data_names = list(out_table[key_col])
        elif key_col == None:
            data_names = ['pt_' + str(i) for i in range(len_features)]
        out_table['name'] = data_names
        Z = linkage(ssd.pdist(features, metric=met), method=link, metric=met)
    elif input_mode == 'matrix':
        len_features = len(input_cols)
        # Labels come from the key column (indexed by each column's position)
        # or fall back to the column names themselves.
        if key_col != None:
            data_names = []
            for column in input_cols:
                data_names.append(
                    out_table[key_col][out_table.columns.get_loc(column)])
        elif key_col == None:
            data_names = []
            for column in input_cols:
                data_names.append(
                    out_table.columns[out_table.columns.get_loc(column)])
        col_index = []
        for column in input_cols:
            col_index.append(out_table.columns.get_loc(column))
        # Square distance block -> condensed form for linkage.
        dist_matrix = features.iloc[col_index]
        Z = linkage(ssd.squareform(dist_matrix), method=link, metric=met)
        dist_matrix['name'] = data_names
    else:
        raise_runtime_error("Please check 'input_mode'.")
    range_len_Z = range(len(Z))
    # Human-readable linkage matrix: steps are numbered from the last merge
    # backwards, and cluster ids are mapped to 'CL_<n>' names.
    linkage_matrix = pd.DataFrame([])
    linkage_matrix['linkage step'] = [x + 1 for x in reversed(range_len_Z)]
    linkage_matrix['name of clusters'] = [
        'CL_' + str(i + 1) for i in reversed(range_len_Z)
    ]
    # scipy encodes children < len_features as original observations and
    # children >= len_features as previously formed clusters.
    joined_column1 = []
    for i in range_len_Z:
        if Z[:, 0][i] < len_features:
            joined_column1.append(data_names[int(Z[:, 0][i])])
        elif Z[:, 0][i] >= len_features:
            joined_column1.append(
                linkage_matrix['name of clusters'][Z[:, 0][i] - len_features])
    linkage_matrix['joined column1'] = joined_column1
    joined_column2 = []
    for i in range_len_Z:
        if Z[:, 1][i] < len_features:
            joined_column2.append(data_names[int(Z[:, 1][i])])
        elif Z[:, 1][i] >= len_features:
            joined_column2.append(
                linkage_matrix['name of clusters'][Z[:, 1][i] - len_features])
    linkage_matrix['joined column2'] = joined_column2
    linkage_matrix['distance'] = [distance for distance in Z[:, 2]]
    linkage_matrix['number of original'] = [
        int(entities) for entities in Z[:, 3]
    ]
    # Present the merges in reverse (top-down) order.
    linkage_matrix = linkage_matrix.reindex(
        index=linkage_matrix.index[::-1])[0:]
    # calculate full dendrogram
    plt.figure(figsize=(8.4, figure_height))
    dendrogram(
        Z,
        truncate_mode='none',
        get_leaves=True,
        orientation=orient,
        labels=data_names,
        leaf_rotation=45,
        leaf_font_size=10.,
        show_contracted=False
    )
    plt.title('Hierarchical Clustering Dendrogram')
    # Axis labels swap depending on the dendrogram orientation.
    if orient == 'top':
        plt.xlabel('Samples')
        plt.ylabel('Distance')
    elif orient == 'right':
        plt.xlabel('Distance')
        plt.ylabel('Samples')
    plt.tight_layout()
    plt2 = plt2MD(plt)
    plt.clf()
    params = {
        'Input Columns': input_cols,
        'Input Mode': input_mode,
        'Linkage Method': link,
        'Metric': met,
        'Number of Rows in Linkage Matrix': num_rows
    }
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Hierarchical Clustering Result"""))
    rb.addMD(strip_margin("""
    |### Dendrogram
    |
    |{image}
    |
    |### Parameters
    |
    |{display_params}
    |
    |### Linkage Matrix
    |
    |{out_table1}
    |
    """.format(image=plt2, display_params=dict2MD(params),
               out_table1=pandasDF2MD(linkage_matrix.head(num_rows)))))
    model = _model_dict('hierarchical_clustering')
    model['model'] = Z
    model['input_mode'] = input_mode
    model['table'] = out_table
    if input_mode == 'matrix':
        model['dist_matrix'] = dist_matrix
    model['parameters'] = params
    model['linkage_matrix'] = linkage_matrix
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def _penalized_linear_regression_train(table, feature_cols, label_col,
                                       regression_type='ridge', alpha=1.0,
                                       l1_ratio=0.5, fit_intercept=True,
                                       max_iter=1000, tol=0.0001,
                                       random_state=None):
    """Train a Ridge / Lasso / ElasticNet regression and build a report.

    Returns {'model': dict} with the fitted estimator, the coefficient table,
    per-row predictions/residuals, scores and diagnostic plots. Raises a
    runtime error for an unknown *regression_type*.
    """
    out_table = table.copy()
    features = out_table[feature_cols]
    label = out_table[label_col]
    if regression_type == 'ridge':
        regression_model = Ridge(alpha=alpha, fit_intercept=fit_intercept,
                                 max_iter=None, tol=tol, solver='auto',
                                 random_state=random_state)
    elif regression_type == 'lasso':
        regression_model = Lasso(alpha=alpha, fit_intercept=fit_intercept,
                                 max_iter=max_iter, tol=tol,
                                 random_state=random_state,
                                 selection='random')
    elif regression_type == 'elastic_net':
        regression_model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio,
                                      fit_intercept=fit_intercept,
                                      max_iter=max_iter, tol=tol,
                                      random_state=random_state,
                                      selection='random')
    else:
        raise_runtime_error("Please check 'regression_type'.")
    # Fit exactly once. The original refit the model for every attribute
    # access; with selection='random' and random_state=None those refits could
    # report coefficients inconsistent with the predictions below.
    regression_model.fit(features, label)

    out_table1 = pd.DataFrame([])
    out_table1['x_variable_name'] = [variable for variable in feature_cols]
    out_table1['coefficient'] = regression_model.coef_
    if fit_intercept:
        intercept_row = pd.DataFrame(
            [['intercept', regression_model.intercept_]],
            columns=['x_variable_name', 'coefficient'])
        # pd.concat replaces DataFrame.append (removed in pandas >= 2.0).
        out_table1 = pd.concat([out_table1, intercept_row], ignore_index=True)

    predict = regression_model.predict(features)
    residual = label - predict
    out_table['predict'] = predict
    out_table['residual'] = residual

    if regression_type == 'elastic_net':
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'L1 Ratio': l1_ratio,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    else:
        # Fixed display typo ('Maxium') to match the elastic_net branch.
        params = {
            'Feature Columns': feature_cols,
            'Label Column': label_col,
            'Regression Type': regression_type,
            'Regularization (Penalty Weight)': alpha,
            'Fit Intercept': fit_intercept,
            'Maximum Number of Iterations': max_iter,
            'Tolerance': tol
        }
    score = {
        'MSE': mean_squared_error(label, predict),
        'R2': r2_score(label, predict)
    }

    # Predicted vs actual scatter with the identity line for reference.
    plt.figure()
    plt.scatter(predict, label)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Actual values for ' + label_col)
    x = predict
    p1x = np.min(x)
    p2x = np.max(x)
    plt.plot([p1x, p2x], [p1x, p2x], 'r--')
    fig_actual_predict = plt2MD(plt)
    plt.clf()

    # Residuals vs predicted.
    plt.figure()
    plt.scatter(predict, residual)
    plt.xlabel('Predicted values for ' + label_col)
    plt.ylabel('Residuals')
    plt.axhline(y=0, color='r', linestyle='--')
    fig_residual_1 = plt2MD(plt)
    plt.clf()

    # Residual normality Q-Q plot.
    plt.figure()
    sm.qqplot(residual, line='s')
    plt.ylabel('Residuals')
    fig_residual_2 = plt2MD(plt)
    plt.clf()

    # Residual distribution.
    plt.figure()
    sns.distplot(residual)
    plt.xlabel('Residuals')
    fig_residual_3 = plt2MD(plt)
    plt.clf()

    # checking the magnitude of coefficients
    plt.figure()
    predictors = features.columns
    coef = Series(regression_model.coef_, predictors).sort_values()
    coef.plot(kind='bar', title='Model Coefficients')
    plt.tight_layout()
    fig_model_coefficients = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | # Penalized Linear Regression Result
    | ### Selected Parameters:
    | {params}
    |
    | ## Results
    | ### Model Parameters
    | {out_table1}
    |
    | ### Prediction and Residual
    | {out_table2}
    |
    | ### Regression Score
    | {score}
    |
    """.format(params=dict2MD(params),
               out_table1=pandasDF2MD(out_table1),
               out_table2=pandasDF2MD(out_table, num_rows=len(out_table) + 1),
               score=dict2MD(score))))
    rb.addMD(strip_margin("""
    |
    | ### Predicted vs Actual
    | {image1}
    |
    | ### Fit Diagnostics
    | {image2}
    | {image3}
    | {image4}
    |
    | ### Magnitude of Coefficients
    | {image5}
    |
    """.format(image1=fig_actual_predict,
               image2=fig_residual_1,
               image3=fig_residual_2,
               image4=fig_residual_3,
               image5=fig_model_coefficients)))

    model = _model_dict('penalized_linear_regression_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['regression_type'] = regression_type
    model['regression_model'] = regression_model
    model['model_parameters'] = out_table1
    model['prediction_residual'] = out_table
    model['_repr_brtc_'] = rb.get()
    return {'model': model}
def pivot(table, values, aggfunc, index=None, columns=None):
    """Pivot *table*, aggregating *values* with the named functions in
    *aggfunc*, and flatten the resulting columns to '<func>__<value>[__...]'.

    At least one of *index*/*columns* is required. Unknown aggregation names
    are silently skipped (matching the original behavior).
    Returns {'out_table': DataFrame}.
    """
    # TODO
    if index is None and columns is None:
        # TODO: assign an error code.
        raise_runtime_error('Group key value is required: Index or Columns.')

    # Local named wrappers so the pivoted column labels carry exactly these
    # names (pandas uses func.__name__ for the label).
    def count(x):
        return len(x)

    def mean(x):
        return np.mean(x)

    def std(x):
        return np.std(x)

    def var(x):
        return np.var(x)

    def min(x):  # noqa: A001 - the name becomes the output column label
        return np.min(x)

    def _25th(x):
        # Fixed: np.percentile takes a value in [0, 100]; 0.25 computed the
        # 0.25th percentile instead of the 25th.
        return np.percentile(x, 25)

    def median(x):
        return np.median(x)

    def _75th(x):
        # Fixed: 75, not 0.75 (see _25th).
        return np.percentile(x, 75)

    def max(x):  # noqa: A001
        return np.max(x)

    def sum(x):  # noqa: A001
        return np.sum(x)

    def _replace_col(tup):
        # Join MultiIndex levels and strip characters that break downstream
        # column handling. Fixed: str.replace returns a new string; the
        # original discarded the result so nothing was ever stripped.
        col = '__'.join(str(elem) for elem in tup)
        for char in ' ,;{}()\n\t=':
            col = col.replace(char, '')
        return col

    def _mi2index(mi):
        # Iterating a MultiIndex yields its tuples directly; Index.get_values()
        # was removed in modern pandas.
        return pd.Index([_replace_col(tup) for tup in mi])

    dispatch = {'count': count, 'mean': mean, 'std': std, 'var': var,
                'min': min, '_25th': _25th, 'median': median, '_75th': _75th,
                'max': max, 'sum': sum}
    # Unknown names are skipped, as in the original elif chain.
    func_list = [dispatch[name] for name in aggfunc if name in dispatch]
    pivoted = pd.pivot_table(table, values=values, index=index,
                             columns=columns, aggfunc=func_list,
                             fill_value=None, margins=False,
                             margins_name='All')
    pivoted.columns = _mi2index(pivoted.columns)
    out_table = pd.concat([pivoted.index.to_frame(), pivoted], axis=1)
    return {'out_table': out_table}