def compute_lang(column, feature):
    """
    Detect the language(s) used in a series (column) and record them in (feature).

    Missing values are dropped first; when nothing remains, (feature) is left
    untouched. Otherwise feature["natural_language_of_feature"] is set to a
    list of {"name": <detected language>, "count": <number of cells>} dicts,
    ordered from the most to the least frequent language (the list stays empty
    when no cell yields a language). Numeric cells are skipped.

    Known limitations:
    1. detection is not accurate when a string contains digits
    2. detection is not accurate when a string is too short
    Special handling for those cases may be needed.
    """
    non_missing = column.dropna()
    if not non_missing.size:  # nothing left to inspect
        return

    feature["natural_language_of_feature"] = []
    tally = {}  # detected language -> number of cells

    for text in non_missing:
        # numbers carry no language information
        if text.isdigit() or HelperFunction.is_Decimal_Number(text):
            continue
        try:
            detected = detect(text)
            tally[detected] = tally.get(detected, 0) + 1
        except Exception:
            # best effort: report the cell that could not be classified and go on
            print(
                "there is something may not be any language nor number: {}"
                .format(text))

    # stable sort: languages with equal counts keep first-seen order
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    feature["natural_language_of_feature"].extend(
        {'name': name, 'count': count} for name, count in ranked)
def compute_punctuation(column, feature, weight_outlier):
    """
    Compute punctuation statistics for a series (column) and store them in (feature).

    For the output format, see the format section of README.

    Numeric cells do not contribute punctuation counts (eg: for number 1.23,
    "." does not count as a punctuation), but their length still counts toward
    the character totals.

    column: series of strings; missing values are dropped first.
    feature: dict that receives the key "most_common_punctuations". The key is
        only created when at least one punctuation occurs in the column; its
        value is a list of dicts (one per punctuation that occurs), sorted by
        "count" in descending order. Each dict holds "punctuation", "count",
        "ratio" (count / total number of chars in the column),
        "punctuation_density_aggregate" ({"mean": ...}) and
        "punctuation_density_outliers" ([{"n": weight_outlier, "count": ...}]).
    weight_outlier: passed through to "helper_outlier_calcu"
        (= number_of_sigma in that function).

    Returns None; the result is written into (feature).
    """
    column = column.dropna()  # get rid of all missing value
    if column.size == 0:  # if the column is empty, do nothing
        return

    num_cells = column.size
    num_chars_cell = np.zeros(num_cells)  # number of chars for each cell
    # (number_of_cell * number_of_puncs) sized array
    puncs_cell = np.zeros([num_cells, len(string.punctuation)], dtype=int)

    # step 1: pre-calculations
    for cell_id, cell in enumerate(column):
        num_chars_cell[cell_id] = len(cell)
        # only counts puncs for non-number cell
        if cell.isdigit() or HelperFunction.is_Decimal_Number(cell):
            continue
        puncs_cell[cell_id] = [cell.count(c) for c in string.punctuation]

    # number of occurrences of each possible punc in this column
    counts_column_punc = puncs_cell.sum(axis=0)

    # only create this feature when punctuations exist; returning here also
    # avoids sorting a key that was never created
    if counts_column_punc.sum() == 0:
        return

    number_of_chars = float(num_chars_cell.sum())  # all chars in column

    # per-cell density of each punc; empty-string cells get density 0 instead
    # of 0/0 = NaN, which would otherwise poison the column means
    chars_per_cell = num_chars_cell.reshape([num_cells, 1])
    cell_density_array = np.divide(puncs_cell,
                                   chars_per_cell,
                                   out=np.zeros(puncs_cell.shape),
                                   where=chars_per_cell > 0)
    puncs_density_average = cell_density_array.sum(axis=0) / num_cells

    # step 2: extract from pre-calculated data, for each punctuation
    punctuations = []  # list of dict
    for i, punc in enumerate(string.punctuation):
        if counts_column_punc[i] == 0:
            # this punctuation never occurs in the whole column, ignore
            continue
        # calculate outlier
        # NOTE(review): the result is summed below, so it is presumably a
        # per-cell 0/1 (or boolean) mask -- confirm against helper_outlier_calcu
        outlier_array = helper_outlier_calcu(cell_density_array[:, i],
                                             weight_outlier)
        punctuations.append({
            "punctuation": punc,
            "count": counts_column_punc[i],
            "ratio": counts_column_punc[i] / number_of_chars,
            "punctuation_density_aggregate": {
                "mean": puncs_density_average[i]
            },
            # only one element in outlier
            "punctuation_density_outliers": [{
                "n": weight_outlier,
                "count": sum(outlier_array)
            }],
        })

    # step 3: sort, most frequent punctuation first
    feature["most_common_punctuations"] = sorted(punctuations,
                                                 key=lambda k: k['count'],
                                                 reverse=True)