def test_type_inference_frame():
    """Check ``infer_type`` on a frame of string columns in many scripts.

    Every column holds three strings drawn from one script or character
    class. Only the all-digit column is expected to be inferred as
    ``Integer``; every other column is expected to be ``String``.
    """
    samples = {
        "latin": ["orange", "apple", "pear"],
        "cyrillic": ["Кириллица", "гласность", "демократија"],
        "mixed": ["Кириллица", "soep", "демократија"],
        "burmese": ["ရေကြီးခြင်း", "စက်သင်ယူမှု", "ဉာဏ်ရည်တု"],
        "digits": ["1234", "121223", "12312"],
        "specials": ["$", "%^&*(", "!!!~``"],
        "whitespace": ["\t", "\n", " "],
        "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"],
        "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"],
        "playing_cards": ["🂶", "🃁", "🂻"],
    }
    frame = pd.DataFrame(samples)

    # Start from "everything is String", then override the one exception.
    expected = dict.fromkeys(samples, String)
    expected["digits"] = Integer

    inferred = infer_type(frame, CompleteSet())
    assert inferred == expected
def test_type_inference_series():
    """Check that a series of complex-number strings is inferred as ``Complex``."""
    complex_literals = ["(12.0+10.0j)", "(-4.0+6.2j)", "(8.0+2.0j)"]
    series = pd.Series(complex_literals)

    # Inference runs against the standard typeset here, not the complete one.
    assert infer_type(series, StandardSet()) == Complex
Exemple #3
0
def df_get_data_types_task(df: pd.DataFrame) -> dict:
    """Return the inferred type of each entry of ``df`` as a string.

    Types are inferred with ``infer_type`` against a fresh ``CompleteSet``;
    each inferred type is converted with ``str()`` and returned under the
    same key that ``infer_type`` reported it under.
    """
    inferred = infer_type(df, CompleteSet())
    return {column: str(vtype) for column, vtype in inferred.items()}
Exemple #4
0
    "url": [
        "http://www.cwi.nl:80/%7Eguido/Python.html",
        "https://numpy.org/",
        "https://github.com/pandas-profiling/pandas-profiling",
    ],
    "uuid": [
        "0b8a22ca-80ad-4df5-85ac-fa49c44b7ede",
        "aaa381d6-8442-4f63-88c8-7c900e9a23c6",
        "00000000-0000-0000-0000-000000000000",
    ],
})

# The complete typeset is used here because it includes URLs
typeset = CompleteSet()

# Show the detected types; no casting is applied at this step
detected_types = detect_type(df, typeset)
print(detected_types)
# {'numbers_with_nan': Float, 'url': String, 'uuid': String}

# Produce a copy of the dataframe cast to the inferred types
cast_df = cast_to_inferred(df, typeset)
print(cast_df.to_string())
#    numbers_with_nan                                                url                                  uuid
# 0                 3  (http, www.cwi.nl:80, /%7Eguido/Python.html, ,...  0b8a22ca-80ad-4df5-85ac-fa49c44b7ede
# 1                 7                        (https, numpy.org, /, , , )  aaa381d6-8442-4f63-88c8-7c900e9a23c6
# 2               NaN  (https, github.com, /pandas-profiling/pandas-p...  00000000-0000-0000-0000-000000000000

# Show the types that inference arrives at for the original dataframe
inferred_types = infer_type(df, typeset)
print(inferred_types)
# {'numbers_with_nan': Integer, 'url': URL, 'uuid': UUID}