def read_names():
    """Load the names table from ``names_input``, indexed by ``nconst``.

    Nullable ``Int32`` dtypes keep missing birth/death years as ``<NA>``
    rather than forcing the columns to float.
    """
    # Column -> dtype mapping handed to the project's read_csv helper.
    schema = {
        'nconst': str,
        'primaryName': str,
        'birthYear': Int32Dtype(),
        'deathYear': Int32Dtype(),
        'primaryProfession': str,
        'knownForTitles': str,
    }
    return read_csv(names_input, schema, 'nconst')
def read_titles():
    """Load the titles table from ``titles_input``, indexed by ``tconst``.

    Year and runtime columns use nullable ``Int32`` so absent values stay
    as ``<NA>`` instead of NaN-coerced floats.
    """
    schema = {
        'tconst': str,
        'titleType': str,
        'primaryTitle': str,
        'originalTitle': str,
        'isAdult': str,
        'startYear': Int32Dtype(),
        'endYear': Int32Dtype(),
        'runtimeMinutes': Int32Dtype(),
        'genres': str,
    }
    return read_csv(titles_input, schema, 'tconst')
def read_ratings():
    """Load the ratings table from ``ratings_input``, indexed by ``tconst``."""
    # numVotes is a nullable Int32; averageRating stays a plain float.
    schema = {
        'tconst': str,
        'averageRating': float,
        'numVotes': Int32Dtype(),
    }
    return read_csv(ratings_input, schema, 'tconst')
# Example no. 4 (scraped-snippet separator; votes: 0)
 def integral_extension_dtypes(self):
     """Identifiers for pandas nullable integer dtypes.

     Returns the four integral extension dtypes both as string aliases
     and as dtype instances, or an empty list when extension dtypes are
     not available in the installed pandas.
     """
     if not extension_dtypes_available:
         return []
     aliases = ["Int8", "Int16", "Int32", "Int64"]
     instances = [Int8Dtype(), Int16Dtype(), Int32Dtype(), Int64Dtype()]
     return aliases + instances
def read_principals():
    """Load the principals table from ``principals_input``.

    Unlike the other readers, no index column is supplied — presumably
    because (tconst, ordering) pairs, not a single column, identify rows.
    """
    schema = {
        'tconst': str,
        'ordering': Int32Dtype(),
        'nconst': str,
        'category': str,
        'job': str,
        'characters': str,
    }
    return read_csv(principals_input, schema)
# Example no. 6 (scraped-snippet separator; votes: 0)
    def test_as_spark_type_extension_dtypes(self):
        """as_spark_type maps pandas nullable ints to the matching Spark integral types."""
        from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype

        cases = [
            (Int8Dtype(), ByteType()),
            (Int16Dtype(), ShortType()),
            (Int32Dtype(), IntegerType()),
            (Int64Dtype(), LongType()),
        ]
        for pandas_dtype, expected_spark_type in cases:
            self.assertEqual(as_spark_type(pandas_dtype), expected_spark_type)
# Example no. 7 (scraped-snippet separator; votes: 0)
def spark_type_to_pandas_dtype(
    spark_type: types.DataType, *, use_extension_dtypes: bool = False
) -> Dtype:
    """Return the pandas dtype corresponding to the given Spark DataType.

    When ``use_extension_dtypes`` is set and the installed pandas supports
    them, integral/boolean/string/fractional Spark types map to pandas
    extension dtypes; otherwise the mapping falls through to numpy dtypes
    (via pyarrow for anything not handled explicitly).
    """
    if use_extension_dtypes and extension_dtypes_available:
        # Ordered (Spark type, dtype factory) pairs checked with isinstance,
        # so subclass matching and check order stay identical to an if/elif
        # chain.
        integral_pairs = (
            (types.ByteType, Int8Dtype),
            (types.ShortType, Int16Dtype),
            (types.IntegerType, Int32Dtype),
            (types.LongType, Int64Dtype),
        )
        for spark_cls, make_dtype in integral_pairs:
            if isinstance(spark_type, spark_cls):
                return make_dtype()

        # Boolean/string extension dtypes are gated behind a separate flag.
        if extension_object_dtypes_available:
            if isinstance(spark_type, types.BooleanType):
                return BooleanDtype()
            if isinstance(spark_type, types.StringType):
                return StringDtype()

        # Fractional extension dtypes likewise have their own availability flag.
        if extension_float_dtypes_available:
            if isinstance(spark_type, types.FloatType):
                return Float32Dtype()
            if isinstance(spark_type, types.DoubleType):
                return Float64Dtype()

    # Composite / date / null / user-defined types all land as object dtype.
    object_backed = (
        types.DateType,
        types.NullType,
        types.ArrayType,
        types.MapType,
        types.StructType,
        types.UserDefinedType,
    )
    if isinstance(spark_type, object_backed):
        return np.dtype("object")
    if isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    # Everything else goes through pyarrow's type mapping.
    return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
# Example no. 8 (scraped-snippet separator; votes: 0)
def convert_dtype(schema: Dict[str, str], data: DataFrame) -> DataFrame:
    """Convert all columns in `data` to the appropriate dtype according to `schema`.

    Columns present in ``schema`` but absent from ``data`` are skipped;
    columns present in ``data`` but not in ``schema`` are dropped from the
    result. Unknown dtype names raise ``TypeError``.
    """
    converted = DataFrame(index=data.index)
    for name, dtype in schema.items():
        if name not in data.columns:
            continue
        column = data[name]
        if dtype == "str":
            # Strings are copied through unchanged.
            converted[name] = column
        elif dtype == "float":
            # nullable_method_call tolerates unparseable values (no traceback).
            to_float = partial(nullable_method_call, float, print_exc=False)
            converted[name] = column.apply(to_float).astype(float)
        elif dtype == "int":
            # Nullable Int32 keeps missing values as <NA>.
            to_int = partial(nullable_method_call, int, print_exc=False)
            converted[name] = column.apply(to_int).astype(Int32Dtype())
        else:
            raise TypeError(f"Unknown dtype {dtype}")
    return converted
# Example no. 9 (scraped-snippet separator; votes: 0)
"""
pybbda data module

some data, and blah
"""

from pandas import Int32Dtype

from pybbda.data.sources.lahman.data import LahmanData
from pybbda.data.sources.baseball_reference.data import BaseballReferenceData
from pybbda.data.sources.retrosheet.data import RetrosheetData
from pybbda.data.sources.fangraphs.data import FangraphsData
from pybbda.data.sources.statcast.data import StatcastData

# Shared nullable integer dtype — lets integer columns hold missing values
# (<NA>) without being coerced to float.
nullable_int = Int32Dtype()

# Public API of this module.
# NOTE(review): nullable_int is defined here but not listed in __all__ —
# confirm whether that omission is intentional.
__all__ = [
    "LahmanData",
    "BaseballReferenceData",
    "RetrosheetData",
    "FangraphsData",
    "StatcastData",
]