def test_type_detect_frame():
    """Every text column, whatever its script, is detected as ``String``."""
    # One entry per kind of text content; the keys become the column names.
    samples = {
        "latin": ["orange", "apple", "pear"],
        "cyrillic": ["Кириллица", "гласность", "демократија"],
        "mixed": ["Кириллица", "soep", "демократија"],
        "burmese": ["ရေကြီးခြင်း", "စက်သင်ယူမှု", "ဉာဏ်ရည်တု"],
        "digits": ["01234", "121223", "12312"],
        "specials": ["$", "%^&*(", "!!!~``"],
        "whitespace": ["\t", "\n", " "],
        "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"],
        "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"],
        "playing_cards": ["🂶", "🃁", "🂻"],
    }
    frame = pd.DataFrame(samples)

    # Detect (not infer) the type of each column against the complete typeset.
    detected = detect_type(frame, CompleteSet())

    # Each column name is expected to map to String.
    expected = {column: String for column in samples}
    assert detected == expected
def test_type_detect_series():
    """A datetime series containing a NaT entry is detected as ``DateTime``."""
    # Three concrete timestamps followed by a missing value (NaT).
    timestamps = [
        datetime.datetime(2010, 1, 1),
        datetime.datetime(2010, 8, 2),
        datetime.datetime(2011, 2, 1),
        np.datetime64("NaT"),
    ]
    series = pd.Series(timestamps)

    result = detect_type(series, StandardSet())
    assert result == DateTime
Exemple #3
0
    "url": [
        "http://www.cwi.nl:80/%7Eguido/Python.html",
        "https://numpy.org/",
        "https://github.com/pandas-profiling/pandas-profiling",
    ],
    "uuid": [
        "0b8a22ca-80ad-4df5-85ac-fa49c44b7ede",
        "aaa381d6-8442-4f63-88c8-7c900e9a23c6",
        "00000000-0000-0000-0000-000000000000",
    ],
})

# Choose the complete typeset, which includes URLs
# (the expected output at the end of this snippet also shows a UUID type).
typeset = CompleteSet()

# Detect the current type of each column (without casting any values).
print(detect_type(df, typeset))
# Expected output:
# {'numbers_with_nan': Float, 'url': String, 'uuid': String}

# Cast the dataframe to the inferred types and print the full frame.
cast_df = cast_to_inferred(df, typeset)
print(cast_df.to_string())
# Expected output:
#    numbers_with_nan                                                url                                  uuid
# 0                 3  (http, www.cwi.nl:80, /%7Eguido/Python.html, ,...  0b8a22ca-80ad-4df5-85ac-fa49c44b7ede
# 1                 7                        (https, numpy.org, /, , , )  aaa381d6-8442-4f63-88c8-7c900e9a23c6
# 2               NaN  (https, github.com, /pandas-profiling/pandas-p...  00000000-0000-0000-0000-000000000000

# Print the inferred type of each column of the original dataframe.
print(infer_type(df, typeset))
# Expected output:
# {'numbers_with_nan': Integer, 'url': URL, 'uuid': UUID}
Exemple #4
0
    "cyrillic": ["Кириллица", "гласность", "демократија"],
    "mixed": ["Кириллица", "soep", "демократија"],
    "burmese": ["ရေကြီးခြင်း", "စက်သင်ယူမှု", "ဉာဏ်ရည်တု"],
    "digits": ["01234", "121223", "123123"],
    "specials": ["$", "%^&*(", "!!!~``"],
    "whitespace": ["\t", "\n", " "],
    "jiddisch": ["רעכט צו לינקס", "שאָסיי 61", "פּיצאַ איז אָנגענעם"],
    "arabic": ["بوب ديلان", "باتي فالنتين", "السيد الدف الرجل"],
    "playing_cards": ["🂶", "🃁", "🂻"],
})

# Initialize the typeset
typeset = CompleteSet()

# Detect the type of each column (detect_type, so no inference step here)
types = detect_type(df, typeset)

# Generate a summary of the dataframe using the detected types
summarizer = CompleteSummary()
summary = summarizer.summarize(df, types)

# Print the table header: four left-aligned, fixed-width columns.
print("| {h1: <15}| {h2: <17}| {h3: <84}| {h4: <25}|".format(h1="Column",
                                                             h2="Scripts",
                                                             h3="Categories",
                                                             h4="Blocks"))
# Print the separator row; the dash runs are sized to sit under the header cells.
print("{e:-<17}+{e:-<18}+{e:-<85}+{e:-<26}+".format(e=""))
# For every summarized column, collapse the script / category-alias / block
# values into comma-separated strings of distinct entries. Going through a set
# removes duplicates, so the order of the joined entries is not guaranteed.
# NOTE(review): none of the values computed here are used in the visible
# lines; the snippet appears to be cut off before the table row is printed.
for column, variable_summary in summary["series"].items():
    scripts = ", ".join(set(variable_summary["script_values"].values()))
    categories = ", ".join(
        set(variable_summary["category_alias_values"].values()))
    blocks = ", ".join(set(variable_summary["block_values"].values()))
Exemple #5
0
import pandas as pd

from examples.data_analysis.categorical import Category
from visions.functional import detect_type
from visions.types import Boolean, Categorical
from visions.typesets import StandardSet

# Start from the standard typeset, remove Boolean and Categorical, then add
# the example Category type.
ts = StandardSet()
ts -= Boolean
ts -= Categorical
ts += Category

# Two sample series: one with pandas "category" dtype, one of plain booleans.
s1 = pd.Series(["A", "B", "C"] * 1000, dtype="category")
s2 = pd.Series([True, False] * 1000)

# For each series, print whether it belongs to Category, then the type
# detected for it by the customized typeset.
for series in (s1, s2):
    print(series in Category)
    print(detect_type(series, ts))