Example #1
def svm_classification_train(table, group_by=None, **params):
    check_required_parameters(_svm_classification_train, params,
                              ['table', 'gamma_val'])
    params = get_default_from_parameters_if_required(
        params, _svm_classification_train)

    if params['gamma'] == 'other':
        if 'gamma_val' not in params:
            raise BFE.from_errors([{
                '0100':
                'Gamma value is mandatory when gamma is other'
            }])
        if params['gamma_val'] <= 0:
            raise BFE.from_errors([{
                '0100': 'Gamma value must be greater than 0'
            }])
    else:
        params['gamma_val'] = None

    param_validation_check = [
        over_to(params, 0.0, 1.0, 'c'),
        greater_than_or_equal_to(params, 0, 'degree'),
        greater_than(params, 0.0, 'tol'),
        greater_than_or_equal_to_or_equal_to(params, 1, -1, 'max_iter')
    ]
    validate(*param_validation_check)

    if group_by is not None:
        grouped_model = _function_by_group(_svm_classification_train,
                                           table,
                                           group_by=group_by,
                                           **params)
        return grouped_model
    else:
        return _svm_classification_train(table, **params)
Example #2
def _extend_datetime(table, input_col, impute_unit):
    arr_order = []
    datetime_list = []
    for ind, t_str in enumerate(table[input_col]):
        try:
            if impute_unit == 'year':
                arr_order.append(datetime(year=int(t_str[0:4]), month=1,
                                          day=1))
            elif impute_unit == 'month':
                arr_order.append(
                    datetime(year=int(t_str[0:4]),
                             month=int(t_str[4:6]),
                             day=1))
            elif impute_unit == 'day':
                arr_order.append(
                    datetime(year=int(t_str[0:4]),
                             month=int(t_str[4:6]),
                             day=int(t_str[6:8])))
            elif impute_unit == 'hour':
                arr_order.append(
                    datetime(year=int(t_str[0:4]),
                             month=int(t_str[4:6]),
                             day=int(t_str[6:8]),
                             hour=int(t_str[8:10])))
            elif impute_unit == 'minute':
                arr_order.append(
                    datetime(year=int(t_str[0:4]),
                             month=int(t_str[4:6]),
                             day=int(t_str[6:8]),
                             hour=int(t_str[8:10]),
                             minute=int(t_str[10:12])))
            datetime_list.append(
                datetime(year=int(t_str[0:4]),
                         month=int(t_str[4:6]),
                         day=int(t_str[6:8]),
                         hour=int(t_str[8:10]),
                         minute=int(t_str[10:12]),
                         second=int(t_str[12:14])))
        except:
            raise BFE.from_errors([{
                '0100':
                'Invalid Datetime format at column {}, index {}.'.format(
                    input_col, ind + 1)
            }])
    # check for ascending order
    # If not -> log message error.
    tmp = check_ascending(arr_order)
    if not tmp[0]:
        log_message = 'Datetime column should be in strictly ascending order with the unit {}. '.format(
            impute_unit)
        log_message += 'The following are the first five invalid values: {}'.format(
            table[input_col][tmp[1]:tmp[1] + 5].tolist())
        raise BFE.from_errors([{'0100': log_message}])
    out_table = insert_datetime(table.copy(), input_col, arr_order,
                                datetime_list, impute_unit)
    return {'out_table': out_table}
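The slicing in _extend_datetime assumes every value is a fixed-width 'yyyyMMddHHmmss' string. A minimal standalone illustration of that layout (the sample value below is made up, not from the original data):

from datetime import datetime

t_str = '20240115103045'  # made-up sample in the assumed yyyyMMddHHmmss layout
parsed = datetime(year=int(t_str[0:4]), month=int(t_str[4:6]), day=int(t_str[6:8]),
                  hour=int(t_str[8:10]), minute=int(t_str[10:12]), second=int(t_str[12:14]))
print(parsed)  # 2024-01-15 10:30:45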
Example #3
File: search.py Project: steelblu/studio
def _search(table,
            user_dict=pd.DataFrame(),
            input_cols=[],
            search_words=[],
            synonym_dict=[],
            main_operator='and'):

    if len(search_words) == 0:
        raise BrighticsFunctionException('0033', 'Search Words')

    for search_word in search_words:
        if search_word is None:
            raise BrighticsFunctionException('0033', 'Search Words')

    _table = table.copy()

    filter_list = []
    if len(input_cols) == 0:
        validate(require_param('input_cols'))
    for _list in product(input_cols, search_words):
        c, od = _list
        filter_list.append([c, od.strip('\'')])
    _out_table = _table

    filtered_set = set(_out_table.index)

    cond = np.full(len(_table), True).tolist()
    for _filter in filter_list:
        cond = (cond) & (_table[_filter[0]].str.contains(_filter[1]))
    _out_table = _table.loc[list(
        filtered_set.intersection(set(_table[cond].index)))]

    if len(user_dict.index) != 0:
        filter_list = []
        search_words = [
            user_dict['value'][i] for i, key in enumerate(user_dict['key'])
            if key in search_words
        ]
        print(search_words)
        for _list in product(input_cols, search_words):
            c, od = _list
            filter_list.append([c, od.strip('\'')])

        filtered_set = set()

        syno_cond = np.full(len(_table), False).tolist()
        for _filter in filter_list:
            syno_cond = (syno_cond) | (_table[_filter[0]].str.contains(
                _filter[1]))

        syno_cond = syno_cond | cond
        _out_table = _table.loc[list(
            filtered_set.union(set(_table[syno_cond].index)))]

    return {'out_table': _out_table}
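The filter above ANDs one str.contains check per (column, search word) pair. A tiny self-contained sketch of that pattern with a made-up frame (not part of the original function):

from itertools import product

import numpy as np
import pandas as pd

df = pd.DataFrame({'title': ['red apple', 'green pear'], 'body': ['fresh red', 'ripe']})
cond = np.full(len(df), True)
for col, word in product(['title', 'body'], ['red']):
    cond = cond & df[col].str.contains(word)
print(df[cond])  # keeps only rows matching every (column, word) pair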
Example #4
def _datetime_formatter(table,
                        input_cols,
                        display_mode='replace',
                        in_format="%Y%m%d%H%M%S",
                        out_format="%Y-%m-%d %H:%M:%S",
                        in_language="en_US",
                        out_language="en_US"):
    import platform
    if platform.system()[:3].lower() == 'win':
        in_language = linux_window_change(in_language)
        out_language = linux_window_change(out_language)
    _in_format = format_dict[in_format]
    _out_format = format_dict[out_format]
    out_table = table.copy()
    v_str_to_datetime = np.vectorize(str_to_datetime)
    v_datetime_to_str = np.vectorize(datetime_to_str)
    for col in input_cols:
        locale.setlocale(locale.LC_ALL, in_language)
        try:
            tmp = v_str_to_datetime(table[col], _in_format)
        except:
            raise BrighticsFunctionException.from_errors([{
                '0100':
                col + ' does not follow ' + _in_format + ' format.'
            }])
        locale.setlocale(locale.LC_ALL, out_language)
        if display_mode == 'replace':
            out_table[col] = v_datetime_to_str(tmp, _out_format)
        else:
            out_table['reformat_' + col] = v_datetime_to_str(tmp, _out_format)

    return {'out_table': out_table}
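str_to_datetime and datetime_to_str come from the surrounding module; a hedged guess is that they wrap the standard strptime/strftime round trip shown below (the format strings here are illustrative, not the entries of format_dict):

from datetime import datetime

dt = datetime.strptime('20240115103045', '%Y%m%d%H%M%S')
print(dt.strftime('%Y-%m-%d %H:%M:%S'))  # 2024-01-15 10:30:45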
Example #5
def kernel_density_estimation(table, group_by=None, **params):
    check_required_parameters(_kernel_density_estimation, params, ['table'])
    params = get_default_from_parameters_if_required(
        params, _kernel_density_estimation)
    param_validation_check = [greater_than(params, 0, 'bandwidth')]
    validate(*param_validation_check)
    try:
        points = [np.float64(params['points'])]
    except:
        try:
            points_str = params['points'].split(',')
            points = [np.float64(point) for point in points_str]
        except:
            try:
                p0 = params['points'].split(' to ')
                _from = np.float64(p0[0])
                p1 = p0[1].split(' by ')
                _to = np.float64(p1[0])
                _step = np.float64(p1[1])
                points = np.arange(_from, _to, _step)
            except:
                raise BrighticsFunctionException.from_errors([{
                    '0100':
                    'Points is not of Array[Double] type.'
                }])
    params['points'] = points
    if group_by is not None:
        grouped_model = _function_by_group(_kernel_density_estimation,
                                           table,
                                           group_by=group_by,
                                           **params)
        return grouped_model
    else:
        return _kernel_density_estimation(table, **params)
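The nested try/except accepts three spellings of 'points': a single number, a comma-separated list, or a '<from> to <to> by <step>' range. A quick sketch of the three parses with made-up inputs:

import numpy as np

print([np.float64('2.5')])                          # single value -> [2.5]
print([np.float64(p) for p in '1,2,3'.split(',')])  # comma list -> [1.0, 2.0, 3.0]
_from, rest = '0 to 1 by 0.25'.split(' to ')
_to, _step = rest.split(' by ')
print(np.arange(np.float64(_from), np.float64(_to), np.float64(_step)))  # [0.  0.25 0.5  0.75]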
Example #6
def _shift_datetime(table, input_cols, interval, shift_unit):
    out_table = table.copy()
    if shift_unit == 'year':
        time_leap = pd.DateOffset(years=interval)
    elif shift_unit == 'month':
        time_leap = pd.DateOffset(months=interval)
    elif shift_unit == 'day':
        time_leap = pd.DateOffset(days=interval)
    elif shift_unit == 'hour':
        time_leap = pd.DateOffset(hours=interval)
    elif shift_unit == 'minute':
        time_leap = pd.DateOffset(minutes=interval)
    elif shift_unit == 'second':
        time_leap = pd.DateOffset(seconds=interval)
    for col in input_cols:
        out_columns = []
        for ind, t_str in enumerate(table[col]):
            try:
                current_date = datetime(year=int(t_str[0:4]),
                                        month=int(t_str[4:6]),
                                        day=int(t_str[6:8]),
                                        hour=int(t_str[8:10]),
                                        minute=int(t_str[10:12]),
                                        second=int(t_str[12:14]))
            except:
                raise BFE.from_errors([{
                    '0100':
                    'Invalid Datetime format at column {}, index {}.'.format(
                        col, ind + 1)
                }])
            next_time = current_date + time_leap
            tmp_string = format_time(next_time)
            out_columns.append(tmp_string)
        out_table[col + '_timeshift_result'] = out_columns
    return {'out_table': out_table}
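pd.DateOffset makes the shift calendar-aware, which matters for month and year units; a one-line illustration with a made-up date:

from datetime import datetime

import pandas as pd

print(datetime(2024, 1, 31) + pd.DateOffset(months=1))  # 2024-02-29, clamped to month end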
Example #7
def check_required_parameters(func, params, excluded_paramkeys=[]):
    required_params = get_required_parameters(func)
    required_params_error = []
    for rp in required_params:
        if rp not in params and rp not in excluded_paramkeys:
            required_params_error.append({'0033': [rp]})
    if required_params_error:
        raise BrighticsFunctionException.from_errors(required_params_error)
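get_required_parameters is defined elsewhere in the project; a plausible sketch (an assumption for illustration, not the project's code) is that it lists the parameters of func that have no default value:

import inspect

def get_required_parameters_sketch(func):
    # Hypothetical stand-in: required parameters are those without defaults,
    # excluding *args/**kwargs.
    sig = inspect.signature(func)
    return [name for name, p in sig.parameters.items()
            if p.default is inspect.Parameter.empty
            and p.kind not in (inspect.Parameter.VAR_POSITIONAL,
                               inspect.Parameter.VAR_KEYWORD)]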
Example #8
def _timeseries_distance(table,
                         input_col_1,
                         input_col_2,
                         distance_type,
                         alphabet=26,
                         hold_cols=[]):
    temp_table = table.copy()
    if len(hold_cols) > 0:
        out_table = temp_table[hold_cols]
    else:
        out_table = pd.DataFrame()
    if table[input_col_1].dtype != table[input_col_2].dtype:
        raise BFE.from_errors([{
            '0100':
            'Data types of two input timeseries must be the same.'
        }])
    if distance_type == 'Sax':
        if alphabet < 3 or alphabet > 26:
            raise BFE.from_errors([{
                '0100':
                'Alphabet must be between 3 and 26 if distance_type is Sax.'
            }])
        if not isinstance(table[input_col_1].loc[0], str):
            raise BFE.from_errors([{
                '0100':
                'Data types of input timeseries must be String if distance_type is Sax.'
            }])
        sax_obj = SAX(alphabetSize=alphabet)
    else:
        sax_obj = None
        if isinstance(table[input_col_1].loc[0], str):
            raise BFE.from_errors([{
                '0100':
                'Data types of input timeseries must be Array (Double) if distance_type is NOT Sax.'
            }])
    func = lambda x: ast.literal_eval(x)
    try:
        temp_table[input_col_1] = temp_table[input_col_1].apply(func)
        temp_table[input_col_2] = temp_table[input_col_2].apply(func)
    except:
        pass
    arr_1 = temp_table[input_col_1].values
    arr_2 = temp_table[input_col_2].values
    distance_list = compute_distance(arr_1, arr_2, distance_type, sax_obj)
    out_table['distance'] = distance_list
    return {'out_table': out_table}
Example #9
File: validation.py Project: shovsj/studio
def validate(*bfe):
    elist = []
    for e in bfe:
        if e is not None and type(e) is tuple and len(e) == 2:
            elist.append({e[0]: e[1]})
    if len(elist) > 0:
        print(elist)
        raise BrighticsFunctionException.from_errors(elist)
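validate() expects each validator to return either None (check passed) or a two-element (error_code, message_params) tuple. A hypothetical validator following that contract (not the project's greater_than implementation):

def greater_than_sketch(params, value, param_key):
    # Hypothetical example of the (error_code, message_params) contract.
    if param_key in params and not params[param_key] > value:
        return '0100', ['{} must be greater than {}.'.format(param_key, value)]
    return None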
Example #10
File: load.py Project: yemode2k/studio
def read_csv(path,
             engine='python',
             delimiter=',',
             na_filter=False,
             strip_col=False,
             quoting=3,
             encoding='utf-8'):
    if quoting == True:
        quoting = 0
    elif quoting == False:
        quoting = 3
    dir_data = os.getcwd() + '/data'
    path = os.path.join(dir_data, path)
    res = pd.DataFrame()
    if os.path.isfile(path):
        res = table_reader.read_csv(path, engine, delimiter, na_filter,
                                    strip_col, quoting, encoding)
    elif os.path.isdir(path):
        for f in os.listdir(path):
            if f.endswith('.csv'):
                f_path = os.path.join(path, f)
                try:
                    tmp = table_reader.read_csv(f_path, engine, delimiter,
                                                na_filter, strip_col, quoting,
                                                encoding)
                except:
                    raise BFE.from_errors([{
                        '0100':
                        'Can not read ' + f_path + '.'
                    }])
                if res.shape != (0, 0) and not res.columns.equals(tmp.columns):
                    raise BFE.from_errors([{
                        '0100':
                        'Files under ' + path + ' do not have same schema.'
                    }])
                res = pd.concat([res, tmp])
    else:
        raise BFE.from_errors([{'0100': 'Path ' + path + ' is incorrect.'}])
    for i, col in enumerate(res.columns):
        res = res.rename(
            columns={'Unnamed: {i}'.format(i=i): 'Unnamed_{i}'.format(i=i)})

    return {'table': res}
Example #11
def check_required_parameters(func, params, excluded_param_keys=None):
    if excluded_param_keys is None:
        excluded_param_keys = []

    required_params = get_required_parameters(func)
    params_to_check = [param for param in required_params if param not in excluded_param_keys]

    for rp in params_to_check:
        if (rp not in params) or is_empty(params[rp]):
            raise BrighticsFunctionException.from_errors([{'0033': [rp]}])
Example #12
def lda(table, group_by=None, **params):
    check_required_parameters(_lda, params, ['table'])

    params = get_default_from_parameters_if_required(params, _lda)
    if (params['solver'] == 'svd'):
        if (params['shrinkage'] == 'float'):
            param_validation_check = [
                greater_than_or_equal_to(params, 0, 'tol'),
                greater_than_or_equal_to(params, 1, 'n_components'),
                greater_than_or_equal_to(params, 0, 'shrinkage_value'),
                less_than_or_equal_to(params, 1, 'shrinkage_value')
            ]
        else:
            param_validation_check = [
                greater_than_or_equal_to(params, 0, 'tol'),
                greater_than_or_equal_to(params, 1, 'n_components')
            ]
    else:
        if (params['shrinkage'] == 'float'):
            param_validation_check = [
                greater_than_or_equal_to(params, 1, 'n_components'),
                greater_than_or_equal_to(params, 0, 'shrinkage_value'),
                less_than_or_equal_to(params, 1, 'shrinkage_value')
            ]
        else:
            param_validation_check = [
                greater_than_or_equal_to(params, 1, 'n_components')
            ]
    validate(*param_validation_check)

    if group_by is not None:
        label_col = ""
        for param in params:
            if param == "label_col":
                label_col = params[param]
        for group in group_by:
            if group == label_col:
                elist = []
                elist.append({
                    '0100':
                    "Group by column should be different from label column"
                })
                print(elist)
                raise BrighticsFunctionException.from_errors(elist)
        grouped_model = _function_by_group(_lda,
                                           table,
                                           group_by=group_by,
                                           **params)
        return grouped_model
    else:
        return _lda(table, **params)
Example #13
File: anova.py Project: shovsj/studio
def _tukeys_range_test(table, response_cols, factor_col, alpha=0.05):
    if alpha < 0.001 or alpha >= 0.9:
        raise BrighticsFunctionException("0006", ['alpha', 0.001, 0.9])

    rb = BrtcReprBuilder()
    rb.addMD("""## Tukey's range test Result""")

    for response_col in response_cols:
        data = table[response_col]
        posthoc = pairwise_tukeyhsd(data, table[factor_col], alpha=alpha)
        posthoc_html = posthoc._results_table.as_html()
        posthoc.plot_simultaneous()

        rb.addMD("""### {response_col}""".format(response_col=response_col))
        rb.addHTML(posthoc_html)
        rb.addPlt(plt)
        plt.clf()

    return {'result': {'_repr_brtc_': rb.get()}}
Example #14
def _isotonic_regression_train(table, feature_col, label_col, increasing=True):
    if feature_col == label_col:
        raise BFE.from_errors([{
            '0100':
            '{} is used for both Feature column and Label column.'.format(
                feature_col)
        }])
    features = table[feature_col]
    label = table[label_col]
    isotonic_model = IsotonicRegression(increasing=increasing)
    isotonic_model.fit(features, label)
    predict = isotonic_model.predict(features)

    plt.figure()
    plt.plot(label, 'r.-')
    plt.plot(predict, 'b.-')
    plt.xlabel('Samples')
    plt.legend(['True label', 'Predicted'])
    fig_actual_predict = plt2MD(plt)
    get_param = isotonic_model.get_params()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Isotonic Regression Result
    | ### Param
    | {param}
    | ### Predicted vs Actual
    | {image1}
    """.format(image1=fig_actual_predict, param=get_param)))
    model = _model_dict('isotonic_regression_model')
    model['_repr_brtc_'] = rb.get()
    model['feature_col'] = feature_col
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['regressor'] = isotonic_model
    return {"model": model}
Example #15
def _lda4(table,
          input_col,
          topic_name='topic',
          num_voca=1000,
          num_topic=5,
          num_topic_word=10,
          max_iter=20,
          learning_method='online',
          learning_offset=10.,
          random_state=None):
    # generate model
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join,
                                        stop_words='english',
                                        max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()
    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        pairs = []
        for term_idx, value in enumerate(vector):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        vocab_weights = []
        vocab = []
        weights = []
        for pair in pairs[:num_topic_word]:
            vocab_weights.append("{}: {}".format(pair[1], pair[0]))
            vocab.append(pair[1])
            weights.append(pair[0])
        vocab_weights_list.append(vocab_weights)
        vocab_list.append(vocab)
        weights_list.append(weights)
    topic_table = pd.DataFrame({
        'vocabularies_weights': vocab_weights_list,
        'vocabularies': vocab_list,
        'weights': weights_list
    })
    topic_table['index'] = [idx + 1 for idx in topic_table.index]
    topic_table = topic_table[[
        'index', 'vocabularies_weights', 'vocabularies', 'weights'
    ]]

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [
        doc_topic[i].argmax() + 1 for i in range(len(corpus))
    ]
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'Number of topics': num_topic,
        'Number of words for each topic': num_topic_word,
        'Maximum number of iterations': max_iter,
        'Learning method': learning_method,
        'Learning offset': learning_offset,
        'Seed': random_state
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(
        strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood,
               perplexity=perplexity,
               params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
Example #16
def _lda3(table,
          input_col,
          topic_name='topic',
          num_voca=1000,
          num_topic=3,
          num_topic_word=3,
          max_iter=20,
          learning_method='online',
          learning_offset=10.,
          random_state=None):
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join,
                                        stop_words='english',
                                        max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    voca_weights_list = []
    for weights in lda_model.components_:
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)

    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [doc_topic[i].argmax() for i in range(len(corpus))]
    weight_list = []
    for ind in out_table[topic_name]:
        weight_list.append(voca_weights_list[ind])
    out_table['topic_vocabularies'] = weight_list
    return {'out_table': out_table}
Example #17
File: validation.py Project: shovsj/studio
def raise_runtime_error(error_message, true_condition=False):
    if not true_condition:
        raise BrighticsFunctionException('0100', [error_message])
Example #18
File: validation.py Project: shovsj/studio
def raise_error(error_code, error_message_params, true_condition=False):
    if not true_condition:
        raise BrighticsFunctionException(error_code, error_message_params)
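Both helpers invert the flag: the exception is raised only when true_condition is falsy, so callers pass the condition that should hold. A made-up usage line, assuming raise_runtime_error above is importable:

raise_runtime_error("time_slice must sum to the number of documents",
                    true_condition=(sum([2, 3]) == 5))  # condition holds, so nothing is raised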
Example #19
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10, max_iter=20, time_slice=None,
         coherence='u_mass', vis_time=0, seed=None):
    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        if is_os_64bit:
            dtm_filename = 'dtm-linux64'
        else:
            dtm_filename = 'dtm-linux32'
    elif running_os == 'Windows':
        if is_os_64bit:
            dtm_filename = 'dtm-win64.exe'
        else:
            dtm_filename = 'dtm-win32.exe'
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)
    if running_os != 'Windows':
        bash_command = "chmod +x {}".format(dtm_path)
        os.system(bash_command)
    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))
    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    topic_time = [[dtm_model.show_topic(topicid=id, time=t, topn=num_topic_word) for id in range(num_topic)]
                  for t in range(len(time_slice))]
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic] for topic in time] for time in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    prop_arr = dtm_model.gamma_
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    coherence_topic_arr = [dtm_model.dtm_coherence(time) for time in range(len(time_slice))]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, texts=tokenized_doc,
                                  coherence='c_v').get_coherence() for item in coherence_topic_arr]

    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab, term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
Example #20
def _split_sentences2(table,
                      input_col,
                      language='kor',
                      doc_id_col_name='doc_id',
                      sentence_id_col_name='sentence_id',
                      sentence_col_name='sentence',
                      duplicate_original=False):
    if doc_id_col_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Document ID column name {} already exists in the input table. Please choose another one."
            .format(doc_id_col_name)
        }])
    if sentence_id_col_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Sentence ID column name {} already exists in the input table. Please choose another one."
            .format(sentence_id_col_name)
        }])
    if sentence_col_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Sentence column name {} already exists in the input table. Please choose another one."
            .format(sentence_col_name)
        }])

    doc_col = table[input_col].values
    running_os = platform.system()
    if running_os == 'Linux':
        import kss
        sent_tokenizer_kor = kss.split_sentences
    else:  # running_os == 'Windows'
        from . import split_sentences_kss as kss2
        sent_tokenizer_kor = kss2.kss.pykss.split_sentences
    sent_tokenizer_eng = tokenize.sent_tokenize

    if language == 'kor':
        sent_tokenizer = sent_tokenizer_kor
    elif language == 'eng':
        sent_tokenizer = sent_tokenizer_eng
    else:  # language == 'mixed'

        def sent_tokenizer(text):
            kor_sents = sent_tokenizer_kor(text)
            sents = [sent_tokenizer_eng(sent) for sent in kor_sents]
            return [y for x in sents for y in x]  # flattened

    num_doc = len(doc_col)
    doc_id_col = list(range(1, num_doc + 1))
    sent_list_col = [sent_tokenizer(text) for text in doc_col]
    table[doc_id_col_name] = doc_id_col
    column_list = table.columns.tolist()
    table[sentence_col_name] = sent_list_col
    num_sent_col = [len(sent_list) for sent_list in sent_list_col]

    # to be shortened when pandas explode is available
    values = np.array(sent_list_col)
    values_flattened = np.concatenate(values).ravel()
    col = table[sentence_col_name]
    col_exploded = pd.Series(values_flattened,
                             index=col.index.repeat(num_sent_col),
                             name=col.name)
    out_table = table.drop([sentence_col_name],
                           axis=1).join(col_exploded).reindex(
                               columns=table.columns, copy=False)

    sent_id_col = sum(
        [list(range(1, num_sent + 1)) for num_sent in num_sent_col], [])
    out_table[sentence_id_col_name] = sent_id_col

    if not duplicate_original:
        column_list_original = column_list.copy()
        column_list_original.remove(doc_id_col_name)
        out_table[column_list_original] = out_table[
            column_list_original].where(out_table[sentence_id_col_name] == 1,
                                        None)

    column_list_new = column_list + [sentence_id_col_name, sentence_col_name]
    out_table = out_table[column_list_new]

    return {'out_table': out_table}
Example #21
File: gsdmm.py Project: yemode2k/studio
def _gsdmm(table,
           input_col,
           topic_name='topic',
           K=10,
           alpha=0.1,
           beta=0.1,
           max_iter=50,
           num_topic_words=3):
    docs = np.array(table[input_col])
    docs_set = [set(doc) for doc in docs]
    docs_preprocessed = [list(doc_set) for doc_set in docs_set]
    vocab_set = list(set.union(*docs_set))
    vocab_size = len(vocab_set)

    # initialize and train a GSDMM model
    mgp = gsdmm_rwalk.MovieGroupProcess(K=K,
                                        alpha=alpha,
                                        beta=beta,
                                        n_iters=max_iter)
    topics = mgp.fit(docs_preprocessed, vocab_size)

    # generate topic table
    topic_word_count = mgp.cluster_word_distribution
    topic_words_raw = [[ind, _count_to_ratio_raw(word_count)]
                       for ind, word_count in enumerate(topic_word_count)
                       if word_count]
    topic_words = [[item[0]] + _gen_table(item[1], num_topic_words)
                   for item in topic_words_raw]

    # reset topic ids
    nonempty_topic_indices = [item[0] for item in topic_words]
    reset_topic_ind = {
        old_ind: (new_ind + 1)
        for new_ind, old_ind in enumerate(nonempty_topic_indices)
    }
    topics = [reset_topic_ind[old_ind] for old_ind in topics]
    topic_words = [[reset_topic_ind[old_item[0]]] + old_item[1:]
                   for old_item in topic_words]

    # generate output dataframes
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains the topic column name. Please choose another name."
        }])
    out_table[topic_name] = topics
    columns = ['index', 'vocabularies_weights', 'vocabularies', 'weights']
    topic_table = pd.DataFrame(topic_words, columns=columns)
    topic_table['weights'] = topic_table['weights'].apply(pd.to_numeric)

    # pyLDAvis
    if len(topic_words) == 1:
        html_result = None
    else:
        topic_words_dicts = [item[1] for item in topic_words_raw]
        topic_term_dists = [[
            topic_words_dict.get(word, 0) for word in vocab_set
        ] for topic_words_dict in topic_words_dicts]
        num_docs = len(topics)
        num_topics = len(topic_words_raw)
        doc_topic_dists = np.zeros((num_docs, num_topics))
        for doc_id, topic_id in enumerate(topics):
            doc_topic_dists[doc_id][topic_id - 1] = 1.0
        doc_lengths = [len(doc) for doc in docs_preprocessed]
        vocab_count = functools.reduce(
            lambda dict_1, dict_2: {
                word: dict_1.get(word, 0) + dict_2.get(word, 0)
                for word in set(dict_1).union(dict_2)
            }, topic_word_count)
        term_frequency = [vocab_count.get(word) for word in vocab_set]

        prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
                                         doc_lengths, vocab_set,
                                         term_frequency)
        html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {
        'Input column': input_col,
        'Topic column name': topic_name,
        'K': K,
        'Alpha': alpha,
        'Beta': beta,
        'Maximum number of iterations': max_iter,
        'Number of words for each topic': num_topic_words
    }
    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## GSDMM Result
    | ### Summary
    |
    """))
    if html_result is not None:
        rb.addHTML(html_result)
        rb.addMD(strip_margin("""
        |
        """))
    rb.addMD(
        strip_margin("""
    | ### Final Number of Topics
    | {num_topics}
    |
    | ### Parameters
    | {params}
    """.format(num_topics=len(topic_words_raw), params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['gsdmm_model'] = mgp
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
Example #22
def CorrelationThreshold(X, threshold, kind):
    """Learn empirical variances from X.
    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Training set to compute correlations.
    y : ignored
        Not used, present here for API consistency by convention.
    Returns
    -------
    support_mask : Boolean array for feature selection
    """
    
    if not (0.0 <= threshold <= 1.0):
        raise BFE.from_errors([{'0100': 'Threshold value must be in [0.0, 1.0]'}])
    
    if kind not in ('pearson', 'spearmanr'):
        raise BFE.from_errors([{'0100': "Kind must be 'pearson' or 'spearmanr"}])
        
    if issparse(X) and kind != 'pearson':
        raise BFE.from_errors([{'0100': "Only pearson correlation is supported with 'sparse matrices'"}])

    X = check_array(X, accept_sparse=['csc', 'csr'], dtype=[np.float64, np.float32])
    
    n_features = X.shape[1]
    if threshold == 1 or (1 in X.shape):
        support_mask = np.ones(n_features, dtype=bool)
        return support_mask
    
    # get constant features
    if issparse(X):
        mins, maxes = min_max_axis(X, axis=0)
        peak_to_peaks = maxes - mins
        constant_mask = np.isclose(peak_to_peaks, 0.0)
        
        # sparse correlation
        mu, sparse_var = mean_variance_axis(X, 0)
        X_corr = sparse_correlation(X, mu, ~constant_mask)
    else:
        peak_to_peaks = np.ptp(X, axis=0)
        constant_mask = np.isclose(peak_to_peaks, 0.0)
        
        if kind == 'pearson':
            X_corr = np.corrcoef(X, rowvar=False)
        else: # spearmanr
            X_corr, _ = spearmanr(X)
            # spearmanr returns a scalar when comparing two columns
            if isinstance(X_corr, float):
                X_corr = np.array([[1, X_corr], [X_corr, 1]])
    
    np.fabs(X_corr, out=X_corr)
    
    # Removes constant features from support_mask
    support_mask = np.ones(n_features, dtype=bool)
    upper_idx = np.triu_indices(n_features, 1)
    
    non_constant_features = n_features
    for i in np.flatnonzero(constant_mask):
        feat_remove_mask = np.logical_and(upper_idx[0] != i,
                                          upper_idx[1] != i)
        upper_idx = (upper_idx[0][feat_remove_mask],
                     upper_idx[1][feat_remove_mask])
        support_mask[i] = False
        non_constant_features -= 1
    
    for _ in range(non_constant_features - 1):
        max_idx = np.argmax(X_corr[upper_idx])
        feat1, feat2 = upper_idx[0][max_idx], upper_idx[1][max_idx]
        cur_corr = X_corr[feat1, feat2]
        
        # max correlation is lower than threshold
        if cur_corr < threshold:
            break
        
        # Temporarily remove both features to calculate the mean with other
        # features. One of the features will be selected.
        support_mask[[feat1, feat2]] = False
        
        # if there are no other features to compare, keep the feature with the most
        # variance
        if np.all(~support_mask):
            if issparse(X):
                # sparse precalculates variance for all features
                var = sparse_var[[feat1, feat2]]
            else:
                var = np.var(X[:, [feat1, feat2]], axis=0)

            print(feat1, feat2)
            if var[0] < var[1]:
                support_mask[feat2] = True
            else:
                support_mask[feat1] = True
            break
            
        # mean with other features
        feat1_mean = np.mean(X_corr[feat1, support_mask])
        feat2_mean = np.mean(X_corr[feat2, support_mask])
        
        # feature with lower mean is kept
        if feat1_mean < feat2_mean:
            support_mask[feat1] = True
            feat_to_remove = feat2
        else:
            support_mask[feat2] = True
            feat_to_remove = feat1
        
        # remove the removed feature from consideration
        upper_idx_to_keep = np.logical_and(upper_idx[0] != feat_to_remove,
                                           upper_idx[1] != feat_to_remove)
        upper_idx = (upper_idx[0][upper_idx_to_keep],
                     upper_idx[1][upper_idx_to_keep])

    return support_mask
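A minimal usage sketch for CorrelationThreshold, assuming the helpers it relies on (check_array, issparse, spearmanr, BFE, and so on) are importable as in the original module; the random data below is made up:

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 4))
X[:, 3] = X[:, 0] + 0.01 * rng.normal(size=200)  # near-duplicate of column 0
mask = CorrelationThreshold(X, threshold=0.9, kind='pearson')
print(mask)  # one of the two highly correlated columns is dropped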
Example #23
def _regex(table,
           input_cols,
           transformation_mode='extract',
           find_mode='all',
           pattern='',
           user_dict_pattern='',
           custom_pattern='',
           replacement_string='',
           user_dict=None):
    out_table = table.copy()
    pattern_dict = regex_format_dict.pattern_dict
    user_pattern_dict = {}
    if user_dict is not None:
        user_patterns = user_dict.values
        for user_pattern in user_patterns:
            user_pattern_name = user_pattern[0]
            user_pattern_content = user_pattern[1]
            user_pattern_dict[user_pattern_name] = user_pattern_dict.get(
                user_pattern_name, []) + [user_pattern_content]
    user_pattern_dict = {
        key: r'|'.join(value)
        for key, value in user_pattern_dict.items()
    }

    if pattern == '':
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Please choose a pattern."
        }])
    if pattern == 'custom':
        raw_pattern = custom_pattern
    elif pattern == 'user_dictionary':
        raw_pattern = user_pattern_dict.get(user_dict_pattern)
        if raw_pattern is None:
            raise BrighticsFunctionException.from_errors([{
                '0100':
                user_dict_pattern +
                " is not a valid pattern name in the user dictionary."
            }])
    else:
        raw_pattern = pattern_dict.get(pattern)
    regex_pattern = re.compile(raw_pattern)

    def transformation(text):
        if transformation_mode == 'extract':
            if find_mode == 'first':
                result = regex_pattern.search(text)
                if result is None:
                    return ""
                else:
                    return result.group()
            else:  # find_mode == 'all'
                return regex_pattern.findall(text)
        elif transformation_mode == 'replace':
            if find_mode == 'first':
                return regex_pattern.sub(replacement_string, text, 1)
            else:  # find_mode == 'all'
                return regex_pattern.sub(replacement_string, text)
        elif transformation_mode == 'remove':
            if find_mode == 'first':
                return regex_pattern.sub("", text, 1)
            else:  # find_mode == 'all'
                return regex_pattern.sub("", text)
        else:  # transformation_mode == 'split'
            if find_mode == 'first':
                return regex_pattern.split(text, 1)
            else:  # find_mode == 'all'
                return regex_pattern.split(text)

    for col in input_cols:
        result_col = table[col].apply(transformation)
        out_table['regex_' + col] = result_col

    return {'out_table': out_table}
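For reference, the extract/replace/split behavior the transformation closure dispatches to maps directly onto the standard re module; a throwaway pattern (not from the original function) shows each mode:

import re

p = re.compile(r'\d+')
print(p.findall('a1b22c'))          # extract, all   -> ['1', '22']
print(p.search('a1b22c').group())   # extract, first -> '1'
print(p.sub('#', 'a1b22c', 1))      # replace, first -> 'a#b22c'
print(p.split('a1b22c'))            # split, all     -> ['a', 'b', 'c']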