def _is_valid_cell(cell):
    """Return True when *cell* passes every per-cell validity rule.

    A cell is rejected when any of the following holds:

    * it splits into more than ``TABLE_CELL_MAX_TOKEN_NUM`` coarse tokens;
    * ``has_invalid_tokens`` flags it;
    * it fails the ASCII rule: it has tokens but none of them is ASCII, or
      its non-ASCII tokens outnumber its ASCII tokens, or it contains two or
      more characters with code point >= 128 that are not listed in
      ``ALLOWED_SPECIAL_SYMBOLS``.
    """
    cell_tokens = split_token_coarse(cell)
    if len(cell_tokens) > TABLE_CELL_MAX_TOKEN_NUM:
        return False
    if has_invalid_tokens(cell):
        return False

    ascii_token_count = sum(is_ascii(w) for w in cell_tokens)
    non_ascii_char_count = sum(
        ord(c) >= 128 and c not in ALLOWED_SPECIAL_SYMBOLS for c in cell
    )
    non_ascii_token_count = len(cell_tokens) - ascii_token_count

    # Parenthesised to make the (pre-existing) precedence explicit:
    # ``and`` binds tighter than ``or``.
    fails_ascii_rule = (
        (len(cell_tokens) > 0 and ascii_token_count == 0)
        or non_ascii_token_count > ascii_token_count
        or non_ascii_char_count >= 2
    )
    if fails_ascii_rule:
        if __DEBUG__:
            print('invalid cell for ascii rule: ', cell)
        return False
    return True


def remove_invalid_columns(orig_cols):
    """Return the columns of *orig_cols* whose cells are all valid.

    Args:
        orig_cols: Iterable of columns; each column is an iterable of cells.
            Cells are iterated character by character and passed to
            ``ord``, so they are expected to be text strings.

    Returns:
        A new list holding, in their original order, the column objects in
        which every cell passes ``_is_valid_cell``. Checking a column stops
        at its first invalid cell.

    An empty column has no invalid cell and is therefore kept. Previously
    such a column raised ``ZeroDivisionError`` in a ``num_invalid / len(col)``
    ratio check. That check never rejected anything, because the counter
    feeding it was never incremented (the non-alpha-token rule that did so
    was commented out), so it has been dropped.
    """
    return [
        col
        for col in orig_cols
        if all(_is_valid_cell(cell) for cell in col)
    ]
def is_valid_column_name(header_name):
    """Decide whether *header_name* is acceptable as a table column header.

    The header is rejected when any of these holds:

    * ``has_invalid_tokens`` flags it;
    * it splits into zero coarse tokens, or into more than
      ``TABLE_HEADER_MAX_TOKEN_NUM`` of them;
    * fewer than ``TABLE_HEADER_MIN_ALPHA_WORD_NUM`` tokens are purely
      alphabetic (``str.isalpha``);
    * more than ``TABLE_HEADER_MAX_NON_ALPHA_WORD_NUM`` tokens are not
      purely alphabetic.

    Returns:
        True when the header passes every check, otherwise False.
    """
    if has_invalid_tokens(header_name):
        return False

    tokens = split_token_coarse(header_name)
    total = len(tokens)
    if not 0 < total <= TABLE_HEADER_MAX_TOKEN_NUM:
        return False

    alpha_total = sum(1 for token in tokens if token.isalpha())
    if alpha_total < TABLE_HEADER_MIN_ALPHA_WORD_NUM:
        return False

    # Every token is either alphabetic or not, so the remainder is the
    # non-alphabetic count.
    non_alpha_total = total - alpha_total
    return not non_alpha_total > TABLE_HEADER_MAX_NON_ALPHA_WORD_NUM