コード例 #1
0
def _is_valid_table_cell(cell):
    """Return True when ``cell`` passes every per-cell rule.

    A cell is rejected when any of the following holds, checked in order:

    1. ``split_token_coarse(cell)`` yields more than
       ``TABLE_CELL_MAX_TOKEN_NUM`` tokens.
    2. ``has_invalid_tokens(cell)`` is truthy.
    3. The ASCII rule fails: the cell has tokens but none satisfy
       ``is_ascii``, or the tokens failing ``is_ascii`` outnumber those
       passing it, or the cell contains two or more characters with code
       point >= 128 that are not in ``ALLOWED_SPECIAL_SYMBOLS``.
    """
    cell_tokens = split_token_coarse(cell)
    if len(cell_tokens) > TABLE_CELL_MAX_TOKEN_NUM:
        return False

    if has_invalid_tokens(cell):
        return False

    ascii_token_count = sum(is_ascii(w) for w in cell_tokens)
    non_ascii_char_count = sum(
        ord(c) >= 128 and c not in ALLOWED_SPECIAL_SYMBOLS for c in cell
    )
    non_ascii_token_count = len(cell_tokens) - ascii_token_count

    # Parenthesised to make the grouping explicit: `and` binds tighter than
    # `or`, so this is exactly how the unparenthesised form was evaluated.
    fails_ascii_rule = (
        (len(cell_tokens) > 0 and ascii_token_count == 0)
        or non_ascii_token_count > ascii_token_count
        or non_ascii_char_count >= 2
    )
    if fails_ascii_rule:
        if __DEBUG__:
            print('invalid cell for ascii rule: ', cell)
        return False

    return True


def remove_invalid_columns(orig_cols):
    """Return the columns of ``orig_cols`` whose cells are all valid.

    Args:
        orig_cols: An iterable of columns; each column is an iterable of
            cells. Cells are passed to ``split_token_coarse`` and
            ``has_invalid_tokens`` and iterated character by character, so
            they are presumably strings -- confirm against the caller.

    Returns:
        A new list holding the original column objects, in order, for which
        every cell passes ``_is_valid_table_cell``. Checking stops at the
        first failing cell of a column. An empty column has no failing cell
        and is therefore kept (previously it raised ``ZeroDivisionError``).

    NOTE: an earlier tolerance-based rule (drop a column once 40% of its
    cells had too many non-alphabetic tokens) was disabled and its counter
    was never incremented, so it is not applied here.
    """
    return [
        col
        for col in orig_cols
        if all(_is_valid_table_cell(cell) for cell in col)
    ]
コード例 #2
0
def is_valid_column_name(header_name):
    """Decide whether ``header_name`` is acceptable as a table column header.

    The header is rejected when ``has_invalid_tokens`` flags it, when its
    coarse token count is zero or above ``TABLE_HEADER_MAX_TOKEN_NUM``, when
    fewer than ``TABLE_HEADER_MIN_ALPHA_WORD_NUM`` tokens are purely
    alphabetic, or when more than ``TABLE_HEADER_MAX_NON_ALPHA_WORD_NUM``
    tokens are not purely alphabetic.

    Args:
        header_name: The header text to check.

    Returns:
        True if the header passes every rule, otherwise False.
    """
    if has_invalid_tokens(header_name):
        return False

    tokens = split_token_coarse(header_name)
    if not 1 <= len(tokens) <= TABLE_HEADER_MAX_TOKEN_NUM:
        return False

    # Classify each token once; both counts derive from the same flags.
    alpha_flags = [tok.isalpha() for tok in tokens]
    alpha_count = sum(alpha_flags)
    non_alpha_count = len(alpha_flags) - alpha_count

    return (
        alpha_count >= TABLE_HEADER_MIN_ALPHA_WORD_NUM
        and non_alpha_count <= TABLE_HEADER_MAX_NON_ALPHA_WORD_NUM
    )