def compute_lang(column, feature):
    """
    Detect which natural language(s) a given series (column) uses and store
    the result into feature["natural_language_of_feature"].

    Not applied to numbers: cells that look like integers or decimals are
    skipped entirely.

    Known limitations (from the original author):
    1. not accurate when a string contains digits
    2. not accurate when a string is too short

    Parameters
    ----------
    column : pandas.Series
        The column whose cells are language-detected.
    feature : dict
        Output dict; gains a "natural_language_of_feature" key holding a
        list of {"name": <lang>, "count": <n>} dicts, sorted by count
        descending. Created even when no language is detected (empty list),
        as in the original.
    """
    column = column.dropna()  # ignore all missing values
    if column.size == 0:  # nothing to detect in an empty column
        return

    feature["natural_language_of_feature"] = list()
    language_count = {}
    for cell in column:
        # numbers carry no natural-language signal; skip them
        if cell.isdigit() or HelperFunction.is_Decimal_Number(cell):
            continue
        try:
            language = detect(cell)
            # counting idiom: one lookup instead of an if/else branch
            language_count[language] = language_count.get(language, 0) + 1
        except Exception:
            # detect() raises when it finds no usable language features;
            # best-effort: report the cell and keep scanning the column
            print(
                "there is something may not be any language nor number: {}"
                .format(cell))

    # most frequent language first; sorted() is stable, so ties keep
    # first-seen order
    languages_ordered = sorted(
        language_count, key=language_count.get, reverse=True)
    for lang in languages_ordered:
        lang_obj = {}
        lang_obj['name'] = lang
        lang_obj['count'] = language_count[lang]
        feature["natural_language_of_feature"].append(lang_obj)
def compute_punctuation(column, feature, weight_outlier):
    """
    Compute the statistical values related to punctuation; for details see
    the format section of the README.

    Not applied to numbers (e.g. for the number 1.23, "." does not count as
    a punctuation mark).

    Parameters
    ----------
    column : pandas.Series
        The column whose cells are scanned for punctuation.
    feature : dict
        Output dict; gains a "most_common_punctuations" key (list of dicts,
        sorted by count descending) only when punctuation actually occurs.
    weight_outlier : number
        number_of_sigma, passed through to helper_outlier_calcu.
    """
    column = column.dropna()  # get rid of all missing values
    if column.size == 0:  # nothing to measure in an empty column
        return

    number_of_chars = sum(column.apply(len))  # total chars in the column
    num_chars_cell = np.zeros(column.size)  # number of chars per cell
    # (number_of_cells x number_of_puncs) count matrix
    puncs_cell = np.zeros([column.size, len(string.punctuation)], dtype=int)

    # step 1: pre-calculations
    for cell_id, cell in enumerate(column):
        num_chars_cell[cell_id] = len(cell)
        # only count punctuation for non-number cells
        if cell.isdigit() or HelperFunction.is_Decimal_Number(cell):
            continue
        puncs_cell[cell_id] = np.asarray(
            [cell.count(c) for c in string.punctuation])

    # per-punctuation totals over the whole column
    counts_column_punc = puncs_cell.sum(axis=0)
    # BUGFIX: an empty-string cell has length 0, so dividing by
    # num_chars_cell would produce 0/0 = NaN and poison the density
    # averages. Its punctuation counts are all zero anyway, so dividing
    # that row by 1 still yields the correct density of 0.
    safe_lengths = np.maximum(num_chars_cell, 1).reshape([column.size, 1])
    cell_density_array = puncs_cell / safe_lengths
    puncs_density_average = cell_density_array.sum(axis=0) / column.size

    # step 2: extract from pre-calculated data;
    # only create this feature when punctuation exists
    if sum(counts_column_punc) > 0:
        feature["most_common_punctuations"] = list()  # list of dict

        for i in range(len(string.punctuation)):
            # skip punctuation that never occurs in the whole column
            if counts_column_punc[i] == 0:
                continue
            punc_obj = {}
            punc_obj["punctuation"] = string.punctuation[i]
            punc_obj["count"] = counts_column_punc[i]
            punc_obj["ratio"] = counts_column_punc[i] / float(
                number_of_chars)
            punc_obj["punctuation_density_aggregate"] = {
                "mean": puncs_density_average[i]
            }
            # calculate outliers; only one element (one sigma weight)
            outlier_array = helper_outlier_calcu(
                cell_density_array[:, i], weight_outlier)
            punc_obj["punctuation_density_outliers"] = [{
                "n": weight_outlier,
                "count": sum(outlier_array)
            }]
            feature["most_common_punctuations"].append(punc_obj)

        # step 3: sort by occurrence count, most frequent first
        # (kept inside the guard: the key only exists when punctuation
        # occurred, so sorting unconditionally would raise KeyError)
        feature["most_common_punctuations"] = sorted(
            feature["most_common_punctuations"],
            key=lambda k: k['count'],
            reverse=True)