def svm_classification_train(table, group_by=None, **params):
    check_required_parameters(_svm_classification_train, params, ['table', 'gamma_val'])
    params = get_default_from_parameters_if_required(params, _svm_classification_train)

    if params['gamma'] == 'other':
        if 'gamma_val' not in params:
            raise BFE.from_errors([{'0100': 'Gamma value is mandatory when gamma is other'}])
        if params['gamma_val'] <= 0:
            raise BFE.from_errors([{'0100': 'Gamma value must be greater than 0'}])
    else:
        params['gamma_val'] = None

    param_validation_check = [over_to(params, 0.0, 1.0, 'c'),
                              greater_than_or_equal_to(params, 0, 'degree'),
                              greater_than(params, 0.0, 'tol'),
                              greater_than_or_equal_to_or_equal_to(params, 1, -1, 'max_iter')]
    validate(*param_validation_check)

    if group_by is not None:
        grouped_model = _function_by_group(_svm_classification_train, table, group_by=group_by, **params)
        return grouped_model
    else:
        return _svm_classification_train(table, **params)
def _extend_datetime(table, input_col, impute_unit):
    arr_order = []
    datetime_list = []
    for ind, t_str in enumerate(table[input_col]):
        try:
            if impute_unit == 'year':
                arr_order.append(datetime(year=int(t_str[0:4]), month=1, day=1))
            elif impute_unit == 'month':
                arr_order.append(datetime(year=int(t_str[0:4]), month=int(t_str[4:6]), day=1))
            elif impute_unit == 'day':
                arr_order.append(datetime(year=int(t_str[0:4]), month=int(t_str[4:6]), day=int(t_str[6:8])))
            elif impute_unit == 'hour':
                arr_order.append(datetime(year=int(t_str[0:4]), month=int(t_str[4:6]), day=int(t_str[6:8]),
                                          hour=int(t_str[8:10])))
            elif impute_unit == 'minute':
                arr_order.append(datetime(year=int(t_str[0:4]), month=int(t_str[4:6]), day=int(t_str[6:8]),
                                          hour=int(t_str[8:10]), minute=int(t_str[10:12])))
            datetime_list.append(datetime(year=int(t_str[0:4]), month=int(t_str[4:6]), day=int(t_str[6:8]),
                                          hour=int(t_str[8:10]), minute=int(t_str[10:12]), second=int(t_str[12:14])))
        except:
            raise BFE.from_errors([{'0100': 'Invalid Datetime format at column {}, index {}.'.format(input_col, ind + 1)}])

    # Check that the datetime column is in strictly ascending order; if not, raise an error.
    tmp = check_ascending(arr_order)
    if not tmp[0]:
        log_message = 'Datetime column should be in strictly ascending order with the unit {}. '.format(impute_unit)
        log_message += 'The following are the first five invalid values: {}'.format(
            table[input_col][tmp[1]:tmp[1] + 5].tolist())
        raise BFE.from_errors([{'0100': log_message}])

    out_table = insert_datetime(table.copy(), input_col, arr_order, datetime_list, impute_unit)
    return {'out_table': out_table}
def _search(table, user_dict=pd.DataFrame(), input_cols=[], search_words=[], synonym_dict=[], main_operator='and'):
    if len(search_words) == 0:
        raise BrighticsFunctionException('0033', 'Search Words')
    for search_word in search_words:
        if search_word is None:
            raise BrighticsFunctionException('0033', 'Search Words')

    _table = table.copy()

    filter_list = []
    if len(input_cols) == 0:
        validate(require_param('input_cols'))
    for _list in product(input_cols, search_words):
        c, od = _list
        filter_list.append([c, od.strip('\'')])

    _out_table = _table
    filtered_set = set(_out_table.index)

    cond = np.full(len(_table), True).tolist()
    for _filter in filter_list:
        cond = (cond) & (_table[_filter[0]].str.contains(_filter[1]))
    _out_table = _table.loc[list(filtered_set.intersection(set(_table[cond].index)))]

    if len(user_dict.index) != 0:
        filter_list = []
        search_words = [user_dict['value'][i] for i, key in enumerate(user_dict['key']) if key in search_words]
        print(search_words)
        for _list in product(input_cols, search_words):
            c, od = _list
            filter_list.append([c, od.strip('\'')])

        filtered_set = set()
        syno_cond = np.full(len(_table), False).tolist()
        for _filter in filter_list:
            syno_cond = (syno_cond) | (_table[_filter[0]].str.contains(_filter[1]))
        syno_cond = syno_cond | cond
        _out_table = _table.loc[list(filtered_set.union(set(_table[syno_cond].index)))]

    return {'out_table': _out_table}
def _datetime_formatter(table, input_cols, display_mode='replace', in_format="%Y%m%d%H%M%S",
                        out_format="%Y-%m-%d %H:%M:%S", in_language="en_US", out_language="en_US"):
    import platform
    if platform.system()[:3].lower() == 'win':
        in_language = linux_window_change(in_language)
        out_language = linux_window_change(out_language)
    _in_format = format_dict[in_format]
    _out_format = format_dict[out_format]
    out_table = table.copy()
    v_str_to_datetime = np.vectorize(str_to_datetime)
    v_datetime_to_str = np.vectorize(datetime_to_str)
    for col in input_cols:
        locale.setlocale(locale.LC_ALL, in_language)
        try:
            tmp = v_str_to_datetime(table[col], _in_format)
        except:
            raise BrighticsFunctionException.from_errors([{'0100': col + ' does not follow ' + _in_format + ' format.'}])
        locale.setlocale(locale.LC_ALL, out_language)
        if display_mode == 'replace':
            out_table[col] = v_datetime_to_str(tmp, _out_format)
        else:
            out_table['reformat_' + col] = v_datetime_to_str(tmp, _out_format)
    return {'out_table': out_table}
def kernel_density_estimation(table, group_by=None, **params):
    check_required_parameters(_kernel_density_estimation, params, ['table'])
    params = get_default_from_parameters_if_required(params, _kernel_density_estimation)
    param_validation_check = [greater_than(params, 0, 'bandwidth')]
    validate(*param_validation_check)

    # 'points' may be a single number, a comma-separated list, or a range of the
    # form '<from> to <to> by <step>'.
    try:
        points = [np.float64(params['points'])]
    except:
        try:
            points_str = params['points'].split(',')
            points = [np.float64(point) for point in points_str]
        except:
            try:
                p0 = params['points'].split(' to ')
                _from = np.float64(p0[0])
                p1 = p0[1].split(' by ')
                _to = np.float64(p1[0])
                _step = np.float64(p1[1])
                points = np.arange(_from, _to, _step)
            except:
                raise BrighticsFunctionException.from_errors([{'0100': 'Points is not of Array[Double] type.'}])
    params['points'] = points

    if group_by is not None:
        grouped_model = _function_by_group(_kernel_density_estimation, table, group_by=group_by, **params)
        return grouped_model
    else:
        return _kernel_density_estimation(table, **params)
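# Illustrative sketch (not part of the original module): the 'points' parameter above accepts a
# single number ("0.5"), a comma-separated list ("0.1, 0.5, 0.9"), or a range ("0 to 10 by 0.5").
# The hypothetical helper below mirrors that parsing logic in isolation.
def _demo_parse_points(points_str):
    import numpy as np
    try:
        return [np.float64(points_str)]                          # single value
    except ValueError:
        pass
    try:
        return [np.float64(p) for p in points_str.split(',')]    # comma-separated list
    except ValueError:
        start, rest = points_str.split(' to ')
        stop, step = rest.split(' by ')
        return np.arange(np.float64(start), np.float64(stop), np.float64(step))  # range expression
# Example: _demo_parse_points("0 to 1 by 0.25") -> array([0., 0.25, 0.5, 0.75])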
def _shift_datetime(table, input_cols, interval, shift_unit):
    out_table = table.copy()
    if shift_unit == 'year':
        time_leap = pd.DateOffset(years=interval)
    elif shift_unit == 'month':
        time_leap = pd.DateOffset(months=interval)
    elif shift_unit == 'day':
        time_leap = pd.DateOffset(days=interval)
    elif shift_unit == 'hour':
        time_leap = pd.DateOffset(hours=interval)
    elif shift_unit == 'minute':
        time_leap = pd.DateOffset(minutes=interval)
    elif shift_unit == 'second':
        time_leap = pd.DateOffset(seconds=interval)
    for col in input_cols:
        out_columns = []
        for ind, t_str in enumerate(table[col]):
            try:
                current_date = datetime(year=int(t_str[0:4]), month=int(t_str[4:6]), day=int(t_str[6:8]),
                                        hour=int(t_str[8:10]), minute=int(t_str[10:12]), second=int(t_str[12:14]))
            except:
                raise BFE.from_errors([{'0100': 'Invalid Datetime format at column {}, index {}.'.format(col, ind + 1)}])
            next_time = current_date + time_leap
            tmp_string = format_time(next_time)
            out_columns.append(tmp_string)
        out_table[col + '_timeshift_result'] = out_columns
    return {'out_table': out_table}
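# Illustrative sketch (not part of the original module): how the yyyyMMddHHmmss strings handled by
# _shift_datetime map to datetime objects and how pd.DateOffset performs a calendar-aware shift.
def _demo_shift_one_timestamp():
    from datetime import datetime
    import pandas as pd
    t_str = '20200131235959'
    current = datetime(year=int(t_str[0:4]), month=int(t_str[4:6]), day=int(t_str[6:8]),
                       hour=int(t_str[8:10]), minute=int(t_str[10:12]), second=int(t_str[12:14]))
    shifted = current + pd.DateOffset(months=1)   # 2020-01-31 -> 2020-02-29 (clipped to month end)
    return shifted.strftime('%Y%m%d%H%M%S')       # '20200229235959'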
def check_required_parameters(func, params, excluded_paramkeys=[]):
    required_params = get_required_parameters(func)
    required_params_error = []
    for rp in required_params:
        if rp not in params and rp not in excluded_paramkeys:
            required_params_error.append({'0033': [rp]})
    if required_params_error:
        raise BrighticsFunctionException.from_errors(required_params_error)
def _timeseries_distance(table, input_col_1, input_col_2, distance_type, alphabet=26, hold_cols=[]):
    temp_table = table.copy()
    if len(hold_cols) > 0:
        out_table = temp_table[hold_cols]
    else:
        out_table = pd.DataFrame()
    if table[input_col_1].dtype != table[input_col_2].dtype:
        raise BFE.from_errors([{'0100': 'Data types of two input timeseries must be the same.'}])
    if distance_type == 'Sax':
        if alphabet < 3 or alphabet > 26:
            raise BFE.from_errors([{'0100': 'Alphabet must be between 3 and 26 if distance_type is Sax.'}])
        if not isinstance(table[input_col_1].loc[0], str):
            raise BFE.from_errors([{'0100': 'Data types of input timeseries must be String if distance_type is Sax.'}])
        sax_obj = SAX(alphabetSize=alphabet)
    else:
        sax_obj = None
        if isinstance(table[input_col_1].loc[0], str):
            raise BFE.from_errors([{'0100': 'Data types of input timeseries must be Array (Double) if distance_type is NOT Sax.'}])
    func = lambda x: ast.literal_eval(x)
    try:
        temp_table[input_col_1] = temp_table[input_col_1].apply(func)
        temp_table[input_col_2] = temp_table[input_col_2].apply(func)
    except:
        pass
    arr_1 = temp_table[input_col_1].values
    arr_2 = temp_table[input_col_2].values
    distance_list = compute_distance(arr_1, arr_2, distance_type, sax_obj)
    out_table['distance'] = distance_list
    return {'out_table': out_table}
def validate(*bfe):
    elist = []
    for e in bfe:
        if e is not None and type(e) is tuple and len(e) == 2:
            elist.append({e[0]: e[1]})
    if len(elist) > 0:
        print(elist)
        raise BrighticsFunctionException.from_errors(elist)
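# Illustrative sketch (not part of the original module): validate() above collects any 2-tuples of
# (error_code, message_params) returned by checker functions and raises them together. The hypothetical
# checker below only shows the shape of such a return value; the real checkers (greater_than,
# greater_than_or_equal_to, over_to, ...) live elsewhere in the package.
def _demo_positive_check(params, param_name):
    if params.get(param_name) is not None and params[param_name] <= 0:
        return '0100', ['{} must be greater than 0.'.format(param_name)]
    return None
# validate(_demo_positive_check({'bandwidth': -1.0}, 'bandwidth')) would raise a BrighticsFunctionException.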
def read_csv(path, engine='python', delimiter=',', na_filter=False, strip_col=False, quoting=3, encoding='utf-8'):
    if quoting == True:
        quoting = 0
    elif quoting == False:
        quoting = 3
    dir_data = os.getcwd() + '/data'
    path = os.path.join(dir_data, path)
    res = pd.DataFrame()
    if os.path.isfile(path):
        res = table_reader.read_csv(path, engine, delimiter, na_filter, strip_col, quoting, encoding)
    elif os.path.isdir(path):
        for f in os.listdir(path):
            if f.endswith('.csv'):
                f_path = os.path.join(path, f)
                try:
                    tmp = table_reader.read_csv(f_path, engine, delimiter, na_filter, strip_col, quoting, encoding)
                except:
                    raise BFE.from_errors([{'0100': 'Cannot read ' + f_path + '.'}])
                if res.shape != (0, 0) and not res.columns.equals(tmp.columns):
                    raise BFE.from_errors([{'0100': 'Files under ' + path + ' do not have the same schema.'}])
                res = pd.concat([res, tmp])
    else:
        raise BFE.from_errors([{'0100': 'Path ' + path + ' is incorrect.'}])
    for i, col in enumerate(res.columns):
        res = res.rename(columns={'Unnamed: {i}'.format(i=i): 'Unnamed_{i}'.format(i=i)})
    return {'table': res}
def check_required_parameters(func, params, excluded_param_keys=None):
    if excluded_param_keys is None:
        excluded_param_keys = []
    required_params = get_required_parameters(func)
    params_to_check = [param for param in required_params if param not in excluded_param_keys]
    for rp in params_to_check:
        if (rp not in params) or is_empty(params[rp]):
            raise BrighticsFunctionException.from_errors([{'0033': [rp]}])
def lda(table, group_by=None, **params):
    check_required_parameters(_lda, params, ['table'])
    params = get_default_from_parameters_if_required(params, _lda)
    if params['solver'] == 'svd':
        if params['shrinkage'] == 'float':
            param_validation_check = [greater_than_or_equal_to(params, 0, 'tol'),
                                      greater_than_or_equal_to(params, 1, 'n_components'),
                                      greater_than_or_equal_to(params, 0, 'shrinkage_value'),
                                      less_than_or_equal_to(params, 1, 'shrinkage_value')]
        else:
            param_validation_check = [greater_than_or_equal_to(params, 0, 'tol'),
                                      greater_than_or_equal_to(params, 1, 'n_components')]
    else:
        if params['shrinkage'] == 'float':
            param_validation_check = [greater_than_or_equal_to(params, 1, 'n_components'),
                                      greater_than_or_equal_to(params, 0, 'shrinkage_value'),
                                      less_than_or_equal_to(params, 1, 'shrinkage_value')]
        else:
            param_validation_check = [greater_than_or_equal_to(params, 1, 'n_components')]
    validate(*param_validation_check)

    if group_by is not None:
        label_col = ""
        for param in params:
            if param == "label_col":
                label_col = params[param]
        for group in group_by:
            if group == label_col:
                elist = []
                elist.append({'0100': "Group by column should be different from label column"})
                print(elist)
                raise BrighticsFunctionException.from_errors(elist)
        grouped_model = _function_by_group(_lda, table, group_by=group_by, **params)
        return grouped_model
    else:
        return _lda(table, **params)
def _tukeys_range_test(table, response_cols, factor_col, alpha=0.05):
    if alpha < 0.001 or alpha >= 0.9:
        raise BrighticsFunctionException("0006", ['alpha', 0.001, 0.9])

    rb = BrtcReprBuilder()
    rb.addMD("""## Tukey's range test Result""")

    for response_col in response_cols:
        data = table[response_col]
        posthoc = pairwise_tukeyhsd(data, table[factor_col], alpha=alpha)
        posthoc_html = posthoc._results_table.as_html()
        posthoc.plot_simultaneous()

        rb.addMD("""### {response_col}""".format(response_col=response_col))
        rb.addHTML(posthoc_html)
        rb.addPlt(plt)
        plt.clf()

    return {'result': {'_repr_brtc_': rb.get()}}
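# Illustrative sketch (not part of the original module): a minimal standalone use of statsmodels'
# pairwise_tukeyhsd, which _tukeys_range_test wraps for each response column, on toy data.
def _demo_tukey_hsd():
    import numpy as np
    from statsmodels.stats.multicomp import pairwise_tukeyhsd
    values = np.concatenate([np.random.normal(0, 1, 30),
                             np.random.normal(1, 1, 30),
                             np.random.normal(2, 1, 30)])
    groups = ['a'] * 30 + ['b'] * 30 + ['c'] * 30
    posthoc = pairwise_tukeyhsd(values, groups, alpha=0.05)
    return posthoc.summary()   # pairwise mean differences with adjusted p-values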
def _isotonic_regression_train(table, feature_col, label_col, increasing=True):
    if feature_col == label_col:
        raise BFE.from_errors([{'0100': '{} is used as both Feature column and Label column.'.format(feature_col)}])
    features = table[feature_col]
    label = table[label_col]
    isotonic_model = IsotonicRegression(increasing=increasing)
    isotonic_model.fit(features, label)
    predict = isotonic_model.predict(features)

    plt.figure()
    plt.plot(label, 'r.-')
    plt.plot(predict, 'b.-')
    plt.xlabel('Samples')
    plt.legend(['True label', 'Predicted'])
    fig_actual_predict = plt2MD(plt)
    get_param = isotonic_model.get_params()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Isotonic Regression Result
    | ### Param
    | {param}
    | ### Predicted vs Actual
    | {image1}
    """.format(image1=fig_actual_predict, param=get_param)))

    model = _model_dict('isotonic_regression_model')
    model['_repr_brtc_'] = rb.get()
    model['feature_col'] = feature_col
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['regressor'] = isotonic_model
    return {"model": model}
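# Illustrative sketch (not part of the original module): the sklearn IsotonicRegression call used
# above, applied to toy data without the Brightics report/model wrapping.
def _demo_isotonic_fit():
    import numpy as np
    from sklearn.isotonic import IsotonicRegression
    x = np.arange(20)
    y = np.sort(np.random.rand(20)) + np.random.normal(0, 0.05, 20)  # roughly increasing target
    model = IsotonicRegression(increasing=True)
    model.fit(x, y)
    return model.predict(x)   # non-decreasing fitted values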
def _lda4(table, input_col, topic_name='topic', num_voca=1000, num_topic=5,
          num_topic_word=10, max_iter=20, learning_method='online',
          learning_offset=10., random_state=None):
    # generate model
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join, stop_words='english',
                                        max_df=0.95, min_df=2, max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                        max_features=num_voca, stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              learning_offset=learning_offset,
                                              random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        pairs = []
        for term_idx, value in enumerate(vector):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        vocab_weights = []
        vocab = []
        weights = []
        for pair in pairs[:num_topic_word]:
            vocab_weights.append("{}: {}".format(pair[1], pair[0]))
            vocab.append(pair[1])
            weights.append(pair[0])
        vocab_weights_list.append(vocab_weights)
        vocab_list.append(vocab)
        weights_list.append(weights)
    topic_table = pd.DataFrame({'vocabularies_weights': vocab_weights_list,
                                'vocabularies': vocab_list,
                                'weights': weights_list})
    topic_table['index'] = [idx + 1 for idx in topic_table.index]
    topic_table = topic_table[['index', 'vocabularies_weights', 'vocabularies', 'weights']]

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])
    out_table[topic_name] = [doc_topic[i].argmax() + 1 for i in range(len(corpus))]
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Learning method': learning_method,
              'Learning offset': learning_offset,
              'Seed': random_state}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood, perplexity=perplexity, params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
def _lda3(table, input_col, topic_name='topic', num_voca=1000, num_topic=3,
          num_topic_word=3, max_iter=20, learning_method='online',
          learning_offset=10., random_state=None):
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join, stop_words='english',
                                        max_df=0.95, min_df=2, max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                        max_features=num_voca, stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              learning_offset=learning_offset,
                                              random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(n_components=num_topic, max_iter=max_iter,
                                              learning_method=learning_method,
                                              random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    voca_weights_list = []
    for weights in lda_model.components_:
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)

    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])
    out_table[topic_name] = [doc_topic[i].argmax() for i in range(len(corpus))]
    weight_list = []
    for ind in out_table[topic_name]:
        weight_list.append(voca_weights_list[ind])
    out_table['topic_vocabularies'] = weight_list

    return {'out_table': out_table}
def raise_runtime_error(error_message, true_condition=False):
    if not true_condition:
        raise BrighticsFunctionException('0100', [error_message])
def raise_error(error_code, error_message_params, true_condition=False):
    if not true_condition:
        raise BrighticsFunctionException(error_code, error_message_params)
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10, max_iter=20,
         time_slice=None, coherence='u_mass', vis_time=0, seed=None):
    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        if is_os_64bit:
            dtm_filename = 'dtm-linux64'
        else:
            dtm_filename = 'dtm-linux32'
    elif running_os == 'Windows':
        if is_os_64bit:
            dtm_filename = 'dtm-win64.exe'
        else:
            dtm_filename = 'dtm-win32.exe'
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)
    if running_os != 'Windows':
        bash_command = "chmod +x {}".format(dtm_path)
        os.system(bash_command)

    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))

    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    topic_time = [[dtm_model.show_topic(topicid=id, time=t, topn=num_topic_word)
                   for id in range(num_topic)] for t in range(len(time_slice))]
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic] for topic in time]
                  for time in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    prop_arr = dtm_model.gamma_
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    coherence_topic_arr = [dtm_model.dtm_coherence(time) for time in range(len(time_slice))]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus,
                                  coherence='u_mass').get_coherence() for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus,
                                  texts=tokenized_doc, coherence='c_v').get_coherence()
                   for item in coherence_topic_arr]

    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab, term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
def _split_sentences2(table, input_col, language='kor', doc_id_col_name='doc_id',
                      sentence_id_col_name='sentence_id', sentence_col_name='sentence',
                      duplicate_original=False):
    if doc_id_col_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100': "Document ID column name {} already exists in the input table. Please choose another one."
            .format(doc_id_col_name)}])
    if sentence_id_col_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100': "Sentence ID column name {} already exists in the input table. Please choose another one."
            .format(sentence_id_col_name)}])
    if sentence_col_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100': "Sentence column name {} already exists in the input table. Please choose another one."
            .format(sentence_col_name)}])

    doc_col = table[input_col].values

    running_os = platform.system()
    if running_os == 'Linux':
        import kss
        sent_tokenizer_kor = kss.split_sentences
    else:  # running_os == 'Windows'
        from . import split_sentences_kss as kss2
        sent_tokenizer_kor = kss2.kss.pykss.split_sentences
    sent_tokenizer_eng = tokenize.sent_tokenize

    if language == 'kor':
        sent_tokenizer = sent_tokenizer_kor
    elif language == 'eng':
        sent_tokenizer = sent_tokenizer_eng
    else:  # language == 'mixed'
        def sent_tokenizer(text):
            kor_sents = sent_tokenizer_kor(text)
            sents = [sent_tokenizer_eng(sent) for sent in kor_sents]
            return [y for x in sents for y in x]  # flattened

    num_doc = len(doc_col)
    doc_id_col = list(range(1, num_doc + 1))
    sent_list_col = [sent_tokenizer(text) for text in doc_col]

    table[doc_id_col_name] = doc_id_col
    column_list = table.columns.tolist()
    table[sentence_col_name] = sent_list_col
    num_sent_col = [len(sent_list) for sent_list in sent_list_col]

    # to be shortened when pandas explode is available
    values = np.array(sent_list_col)
    values_flattened = np.concatenate(values).ravel()
    col = table[sentence_col_name]
    col_exploded = pd.Series(values_flattened, index=col.index.repeat(num_sent_col), name=col.name)
    out_table = table.drop([sentence_col_name], axis=1).join(col_exploded).reindex(
        columns=table.columns, copy=False)

    sent_id_col = sum([list(range(1, num_sent + 1)) for num_sent in num_sent_col], [])
    out_table[sentence_id_col_name] = sent_id_col

    if not duplicate_original:
        column_list_original = column_list.copy()
        column_list_original.remove(doc_id_col_name)
        out_table[column_list_original] = out_table[column_list_original].where(
            out_table[sentence_id_col_name] == 1, None)

    column_list_new = column_list + [sentence_id_col_name, sentence_col_name]
    out_table = out_table[column_list_new]

    return {'out_table': out_table}
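# Illustrative sketch (not part of the original module): the English sentence tokenizer that
# _split_sentences2 uses for language='eng' (NLTK's sent_tokenize; assumes the 'punkt' data is installed).
def _demo_split_sentences_eng():
    from nltk import tokenize
    text = 'Sentence one. Sentence two? Sentence three!'
    return tokenize.sent_tokenize(text)   # ['Sentence one.', 'Sentence two?', 'Sentence three!']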
def _gsdmm(table, input_col, topic_name='topic', K=10, alpha=0.1, beta=0.1, max_iter=50, num_topic_words=3):
    docs = np.array(table[input_col])
    docs_set = [set(doc) for doc in docs]
    docs_preprocessed = [list(doc_set) for doc_set in docs_set]
    vocab_set = list(set.union(*docs_set))
    vocab_size = len(vocab_set)

    # initialize and train a GSDMM model
    mgp = gsdmm_rwalk.MovieGroupProcess(K=K, alpha=alpha, beta=beta, n_iters=max_iter)
    topics = mgp.fit(docs_preprocessed, vocab_size)

    # generate topic table
    topic_word_count = mgp.cluster_word_distribution
    topic_words_raw = [[ind, _count_to_ratio_raw(word_count)]
                       for ind, word_count in enumerate(topic_word_count) if word_count]
    topic_words = [[item[0]] + _gen_table(item[1], num_topic_words) for item in topic_words_raw]

    # reset topic ids
    nonempty_topic_indices = [item[0] for item in topic_words]
    reset_topic_ind = {old_ind: (new_ind + 1) for new_ind, old_ind in enumerate(nonempty_topic_indices)}
    topics = [reset_topic_ind[old_ind] for old_ind in topics]
    topic_words = [[reset_topic_ind[old_item[0]]] + old_item[1:] for old_item in topic_words]

    # generate output dataframes
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains the topic column name. Please choose another name."}])
    out_table[topic_name] = topics
    columns = ['index', 'vocabularies_weights', 'vocabularies', 'weights']
    topic_table = pd.DataFrame(topic_words, columns=columns)
    topic_table['weights'] = topic_table['weights'].apply(pd.to_numeric)

    # pyLDAvis
    if len(topic_words) == 1:
        html_result = None
    else:
        topic_words_dicts = [item[1] for item in topic_words_raw]
        topic_term_dists = [[topic_words_dict.get(word, 0) for word in vocab_set]
                            for topic_words_dict in topic_words_dicts]
        num_docs = len(topics)
        num_topics = len(topic_words_raw)
        doc_topic_dists = np.zeros((num_docs, num_topics))
        for doc_id, topic_id in enumerate(topics):
            doc_topic_dists[doc_id][topic_id - 1] = 1.0
        doc_lengths = [len(doc) for doc in docs_preprocessed]
        vocab_count = functools.reduce(
            lambda dict_1, dict_2: {word: dict_1.get(word, 0) + dict_2.get(word, 0)
                                    for word in set(dict_1).union(dict_2)},
            topic_word_count)
        term_frequency = [vocab_count.get(word) for word in vocab_set]
        prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists, doc_lengths,
                                         vocab_set, term_frequency)
        html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'K': K,
              'Alpha': alpha,
              'Beta': beta,
              'Maximum number of iterations': max_iter,
              'Number of words for each topic': num_topic_words}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## GSDMM Result
    | ### Summary
    |
    """))
    if html_result is not None:
        rb.addHTML(html_result)
        rb.addMD(strip_margin("""
        |
        """))
    rb.addMD(strip_margin("""
    | ### Final Number of Topics
    | {num_topics}
    |
    | ### Parameters
    | {params}
    """.format(num_topics=len(topic_words_raw), params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['gsdmm_model'] = mgp
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
def CorrelationThreshold(X, threshold, kind):
    """Select features whose pairwise correlation is below a threshold.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training set to compute correlations.
    threshold : float
        Correlation threshold in [0.0, 1.0].
    kind : str
        Correlation type, 'pearson' or 'spearmanr'.

    Returns
    -------
    support_mask : Boolean array for feature selection
    """
    if not (0.0 <= threshold <= 1.0):
        raise BFE.from_errors([{'0100': 'Threshold value must be in [0.0, 1.0]'}])
    if kind not in ('pearson', 'spearmanr'):
        raise BFE.from_errors([{'0100': "Kind must be 'pearson' or 'spearmanr'"}])
    if issparse(X) and kind != 'pearson':
        raise BFE.from_errors([{'0100': "Only pearson correlation is supported with sparse matrices"}])

    X = check_array(X, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32])
    n_features = X.shape[1]

    if threshold == 1 or (1 in X.shape):
        support_mask = np.ones(n_features, dtype=bool)
        return support_mask

    # get constant features
    if issparse(X):
        mins, maxes = min_max_axis(X, axis=0)
        peak_to_peaks = maxes - mins
        constant_mask = np.isclose(peak_to_peaks, 0.0)
        # sparse correlation
        mu, sparse_var = mean_variance_axis(X, 0)
        X_corr = sparse_correlation(X, mu, ~constant_mask)
    else:
        peak_to_peaks = np.ptp(X, axis=0)
        constant_mask = np.isclose(peak_to_peaks, 0.0)
        if kind == 'pearson':
            X_corr = np.corrcoef(X, rowvar=False)
        else:  # spearmanr
            X_corr, _ = spearmanr(X)
            # spearmanr returns a scalar when comparing two columns
            if isinstance(X_corr, float):
                X_corr = np.array([[1, X_corr], [X_corr, 1]])
    np.fabs(X_corr, out=X_corr)

    # Remove constant features from support_mask
    support_mask = np.ones(n_features, dtype=bool)
    upper_idx = np.triu_indices(n_features, 1)

    non_constant_features = n_features
    for i in np.flatnonzero(constant_mask):
        feat_remove_mask = np.logical_and(upper_idx[0] != i, upper_idx[1] != i)
        upper_idx = (upper_idx[0][feat_remove_mask], upper_idx[1][feat_remove_mask])
        support_mask[i] = False
        non_constant_features -= 1

    for _ in range(non_constant_features - 1):
        max_idx = np.argmax(X_corr[upper_idx])
        feat1, feat2 = upper_idx[0][max_idx], upper_idx[1][max_idx]
        cur_corr = X_corr[feat1, feat2]

        # max correlation is lower than threshold
        if cur_corr < threshold:
            break

        # Temporarily remove both features to calculate the mean with other
        # features. One of the features will be selected.
        support_mask[[feat1, feat2]] = False

        # if there are no other features to compare, keep the feature with the
        # most variance
        if np.all(~support_mask):
            if issparse(X):
                # sparse precalculates variance for all features
                var = sparse_var[[feat1, feat2]]
            else:
                var = np.var(X[:, [feat1, feat2]], axis=0)
            print(feat1, feat2)
            if var[0] < var[1]:
                support_mask[feat2] = True
            else:
                support_mask[feat1] = True
            break

        # mean with other features
        feat1_mean = np.mean(X_corr[feat1, support_mask])
        feat2_mean = np.mean(X_corr[feat2, support_mask])

        # feature with lower mean is kept
        if feat1_mean < feat2_mean:
            support_mask[feat1] = True
            feat_to_remove = feat2
        else:
            support_mask[feat2] = True
            feat_to_remove = feat1

        # remove the dropped feature from consideration
        upper_idx_to_keep = np.logical_and(upper_idx[0] != feat_to_remove,
                                           upper_idx[1] != feat_to_remove)
        upper_idx = (upper_idx[0][upper_idx_to_keep], upper_idx[1][upper_idx_to_keep])

    return support_mask
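# Illustrative sketch (not part of the original module): applying the boolean mask returned by
# CorrelationThreshold to drop one of a pair of highly correlated columns. Assumes the module-level
# imports used by CorrelationThreshold (numpy, scipy's issparse/spearmanr, sklearn's check_array) are in scope.
def _demo_correlation_threshold():
    import numpy as np
    rng = np.random.RandomState(0)
    X = rng.rand(200, 4)
    X[:, 3] = X[:, 0] + rng.normal(0, 0.01, 200)       # column 3 nearly duplicates column 0
    support_mask = CorrelationThreshold(X, threshold=0.9, kind='pearson')
    return X[:, support_mask]                          # one of the correlated duplicates removed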
def _regex(table, input_cols, transformation_mode='extract', find_mode='all', pattern='',
           user_dict_pattern='', custom_pattern='', replacement_string='', user_dict=None):
    out_table = table.copy()
    pattern_dict = regex_format_dict.pattern_dict

    user_pattern_dict = {}
    if user_dict is not None:
        user_patterns = user_dict.values
        for user_pattern in user_patterns:
            user_pattern_name = user_pattern[0]
            user_pattern_content = user_pattern[1]
            user_pattern_dict[user_pattern_name] = user_pattern_dict.get(user_pattern_name, []) + [user_pattern_content]
        user_pattern_dict = {key: r'|'.join(value) for key, value in user_pattern_dict.items()}

    if pattern == '':
        raise BrighticsFunctionException.from_errors([{'0100': "Please choose a pattern."}])
    if pattern == 'custom':
        raw_pattern = custom_pattern
    elif pattern == 'user_dictionary':
        raw_pattern = user_pattern_dict.get(user_dict_pattern)
        if raw_pattern is None:
            raise BrighticsFunctionException.from_errors(
                [{'0100': user_dict_pattern + " is not a valid pattern name in the user dictionary."}])
    else:
        raw_pattern = pattern_dict.get(pattern)
    regex_pattern = re.compile(raw_pattern)

    def transformation(text):
        if transformation_mode == 'extract':
            if find_mode == 'first':
                result = regex_pattern.search(text)
                if result is None:
                    return ""
                else:
                    return result.group()
            else:  # find_mode == 'all'
                return regex_pattern.findall(text)
        elif transformation_mode == 'replace':
            if find_mode == 'first':
                return regex_pattern.sub(replacement_string, text, 1)
            else:  # find_mode == 'all'
                return regex_pattern.sub(replacement_string, text)
        elif transformation_mode == 'remove':
            if find_mode == 'first':
                return regex_pattern.sub("", text, 1)
            else:  # find_mode == 'all'
                return regex_pattern.sub("", text)
        else:  # transformation_mode == 'split'
            if find_mode == 'first':
                return regex_pattern.split(text, 1)
            else:  # find_mode == 'all'
                return regex_pattern.split(text)

    for col in input_cols:
        result_col = table[col].apply(transformation)
        out_table['regex_' + col] = result_col

    return {'out_table': out_table}
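# Illustrative sketch (not part of the original module): the transformation modes applied by _regex,
# shown directly with the re module on a single string and a digit pattern.
def _demo_regex_modes():
    import re
    regex_pattern = re.compile(r'\d+')
    text = 'room 12, floor 3'
    return {
        'extract_first': regex_pattern.search(text).group(),   # '12'
        'extract_all': regex_pattern.findall(text),             # ['12', '3']
        'replace_first': regex_pattern.sub('#', text, 1),       # 'room #, floor 3'
        'remove_all': regex_pattern.sub('', text),               # 'room , floor '
        'split_all': regex_pattern.split(text),                  # ['room ', ', floor ', '']
    }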