Code example #1
    def links(self):
        """
        The movie link table, connecting movie IDs to external identifiers.  It is indexed
        by movie ID.

        >>> mlsmall = MovieLens('data/ml-latest-small')
        >>> mlsmall.links
                 imdbId  tmdbId
        item
        1        114709     862
        2        113497    8844
        3        113228   15602
        4        114885   31357
        5        113041   11862
        ...
        [9125 rows x 2 columns]
        """

        fn = self.path / 'links.csv'
        links = pd.read_csv(fn,
                            dtype={
                                'movieId': np.int32,
                                'imdbId': np.int64,
                                'tmdbId': pd.Int64Dtype()
                            })
        links.rename(columns={'movieId': 'item'}, inplace=True)
        links.set_index('item', inplace=True)
        _log.debug('loaded %s, takes %d bytes', fn, links.memory_usage().sum())
        return links
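
A note on the dtype choice above: imdbId is read as a plain np.int64, while tmdbId gets pd.Int64Dtype(), the nullable integer extension type, presumably because some rows lack a TMDB identifier. A minimal sketch of the difference (the values here are illustrative):

import pandas as pd

tmdb = pd.Series([862, None])
print(tmdb.dtype)                    # float64: the missing value forced an upcast
print(tmdb.astype(pd.Int64Dtype()))  # Int64: 862 stays an integer, None becomes <NA>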
Code example #2
File: test_dtypes.py (project: baskervilski/pandera)
def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    test_params = [
        (pd.CategoricalDtype(),
         pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"), None),
        (pd.DatetimeTZDtype(tz='UTC'),
         pd.Series(pd.date_range(start="20200101", end="20200301"),
                   dtype="datetime64[ns, utc]"), None),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (pd.StringDtype(), pd.Series(["foo", "bar", "baz"],
                                     dtype="string"), None),
        (pd.PeriodDtype(freq='D'),
         pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')), None),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100)).where(lambda s: s < 5,
                                        other=np.nan).astype("Sparse[float]"),
            {
                "nullable": True
            },
        ),
        (pd.BooleanDtype(), pd.Series([1, 0, 0, 1, 1], dtype="boolean"), None),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        )
    ]
    for dtype, data, series_kwargs in test_params:
        series_kwargs = {} if series_kwargs is None else series_kwargs
        series_schema = SeriesSchema(pandas_dtype=dtype, **series_kwargs)
        assert isinstance(series_schema.validate(data), pd.Series)
Code example #3
def test_reductions_frame_dtypes():
    df = pd.DataFrame({
        'int': [1, 2, 3, 4, 5, 6, 7, 8],
        'float': [1., 2., 3., 4., np.nan, 6., 7., 8.],
        'dt': [pd.NaT] + [datetime(2011, i, 1) for i in range(1, 8)],
        'str':
        list('abcdefgh')
    })

    if HAS_INT_NA:
        df['intna'] = pd.array([1, 2, 3, 4, None, 6, 7, 8],
                               dtype=pd.Int64Dtype())

    ddf = dd.from_pandas(df, 3)
    assert_eq(df.sum(), ddf.sum())
    assert_eq(df.prod(), ddf.prod())
    assert_eq(df.min(), ddf.min())
    assert_eq(df.max(), ddf.max())
    assert_eq(df.count(), ddf.count())
    assert_eq(df.std(), ddf.std())
    assert_eq(df.var(), ddf.var())
    assert_eq(df.sem(), ddf.sem())
    assert_eq(df.std(ddof=0), ddf.std(ddof=0))
    assert_eq(df.var(ddof=0), ddf.var(ddof=0))
    assert_eq(df.sem(ddof=0), ddf.sem(ddof=0))

    result = ddf.mean()
    expected = df.mean()
    assert_eq(expected, result)

    assert_eq(df._get_numeric_data(), ddf._get_numeric_data())

    numerics = ddf[['int', 'float']]
    assert numerics._get_numeric_data().dask == numerics.dask
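
The extra 'intna' column exercises reductions over the nullable dtype: like NaN in float columns, pd.NA is skipped by default. A quick illustration in plain pandas:

import pandas as pd

s = pd.Series([1, 2, None, 4], dtype=pd.Int64Dtype())
print(s.sum())   # 7: pd.NA is skipped by default (skipna=True)
print(s.mean())  # 2.333...: the mean over the three present values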
Code example #4
def download_rivm_data():
    verify_dataset()

    print("Connecting...")
    data = pd.read_json(URL)

    TIME = datetime.datetime.combine(DATE, datetime.time(10, 0))

    if f'{TIME}' in list(data["Date_of_report"]):
        df = data[data["Date_of_report"] == f'{TIME}'].sum()

        df = pd.DataFrame({
            'Datum': [f'{DATE}'] * 3,
            'Type': ['Totaal', 'Ziekenhuisopname', 'Overleden'],
            'Aantal':
            [df['Total_reported'], df['Hospital_admission'], df['Deceased']],
        })

        df_reported = pd.read_csv(Path("data", "rivm_NL_covid19_national.csv"))
        df_reported['Aantal'] = df_reported["Aantal"].astype(pd.Int64Dtype())

        if f'{DATE}' not in str(df_reported['Datum']):
            df_reported = df_reported.append(df, ignore_index=True)
            df_reported = df_reported.reset_index(drop=True)

            export_path = Path("data", "rivm_NL_covid19_national.csv")
            print(f"Export {export_path}")
            df_reported.to_csv(export_path, index=False)
        else:
            print('RIVM file is already up to date')
    else:
        print(f"Data for {DATE} not (yet) available")
Code example #5
    def mapTaxIDs(idmapping: os.path,
                  df: pd.DataFrame,
                  onColumns: list,
                  dropUnmatched: bool = True):
        """ Maps the taxIds for each uniprotId on the given columns of the dataframe.

        Args:
            idmapping (os.path): path to the UniProt ID-mapping file
            df (pd.DataFrame): dataframe to map
            onColumns (List[str]): columns of the dataframe where the uniprotids are
            dropUnmatched (bool): whether or not to drop the rows that did not have a match (no taxon assigned)
        Returns:
            df: same dataframe as passed in but with the taxIDs columns added with 
                the following format <onColumn>_taxID
        """
        all_uniprotids = []
        for column in onColumns:
            all_uniprotids.extend(df[column].unique())
        logger.info("Starting taxID assignment...")
        uniproid2taxid = TaxaMapping.mapUniprot2Taxid_Uniprot(
            set(all_uniprotids), idmapping)
        for column in onColumns:
            df[f"{column}_taxID"] = df[column].apply(
                lambda x: uniproid2taxid.get(x, pd.NA)).astype(pd.Int64Dtype())
        if dropUnmatched:
            df.dropna(inplace=True)
        return df
Code example #6
File: test_types.py (project: nantoniou/cjworkbench)
    def test_coerce_validate_nullable_int_unsupported(self):
        dataframe = pd.DataFrame({
            # We don't support nullable integer columns ... yet
            'A': pd.Series([1, np.nan], dtype=pd.Int64Dtype()),
        })
        with self.assertRaisesRegex(ValueError, 'unsupported dtype'):
            ProcessResult.coerce(dataframe)
Code example #7
    def load_toi_dispositions_in_project_format(self) -> pd.DataFrame:
        """
        Loads the ExoFOP TOI table information from CSV to a data frame using a project consistent naming scheme.

        :return: The data frame of the TOI dispositions table.
        """
        columns_to_use = [
            'TIC ID', 'TFOPWG Disposition', 'Planet Num', 'Epoch (BJD)',
            'Period (days)', 'Duration (hours)', 'Sectors'
        ]
        dispositions = pd.read_csv(self.dispositions_path,
                                   usecols=columns_to_use)
        dispositions.rename(columns={
            'TFOPWG Disposition': ToiColumns.disposition.value,
            'Planet Num': ToiColumns.planet_number.value,
            'Epoch (BJD)': ToiColumns.transit_epoch__bjd.value,
            'Period (days)': ToiColumns.transit_period__days.value,
            'Duration (hours)': ToiColumns.transit_duration.value,
            'Sectors': ToiColumns.sector.value
        },
                            inplace=True)
        dispositions[ToiColumns.disposition.value] = dispositions[
            ToiColumns.disposition.value].fillna('')
        dispositions = dispositions[dispositions[
            ToiColumns.sector.value].notna()]
        dispositions[ToiColumns.sector.value] = dispositions[
            ToiColumns.sector.value].str.split(',')
        dispositions = dispositions.explode(ToiColumns.sector.value)
        dispositions[ToiColumns.sector.value] = pd.to_numeric(
            dispositions[ToiColumns.sector.value]).astype(pd.Int64Dtype())
        return dispositions
Code example #8
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day
    """
    #Load data into data frame
    df = pd.read_csv(CITY_DATA[city])
    #change column names for easier typing and access
    newnames = list(map(lambda s: s.replace(' ', '').lower(), df.columns))
    namedict = dict(zip(df.columns, newnames))
    df.rename(columns=namedict, inplace=True)
    if 'birthyear' in df.columns:  #birthyear not available for all cities
        #Birth year is a float because it contains NAs.
        #Cast to pandas nullable integer type.
        df.birthyear = df.birthyear.astype(pd.Int64Dtype())
    #Convert the start time column to datetime
    df.starttime = pd.to_datetime(df.starttime)
    #extract month and day of the week from start time to new columns
    df['month'] = df.starttime.dt.month
    df['dayofweek'] = df.starttime.dt.day_name()  # dt.weekday_name was removed in pandas 1.0
    #filter by month if applicable
    if month != 'all':
        #index the months to get the corresponding int
        month = months.index(month) + 1
        #filter by month
        df = df[df.month == month]
    if day != 'all':
        df = df[df.dayofweek.str.match(day, case=False)]
    return df
Code example #9
def main_infectious():

    df_normalized = _get_data(
        "infectious_people_count_normalized", {
            "infectious_low_normalized": "Minimum",
            "infectious_high_normalized": "Maximum",
            "infectious_avg_normalized": "Geschat aantal besmettelijke mensen",
        }, "date_of_report_unix")

    df = _get_data(
        "infectious_people_count", {
            "infectious_low": "Minimum",
            "infectious_high": "Maximum",
            "infectious_avg": "Geschat aantal besmettelijke mensen",
        }, "date_of_report_unix")
    df["Waarde"] = df["Waarde"].astype(pd.Int64Dtype())

    Path(DATA_FOLDER,
         "data-contagious/data-contagious_estimates").mkdir(exist_ok=True)

    export_date(df_normalized,
                "data-contagious",
                "RIVM_NL_contagious_estimate_normalized",
                data_date=None,
                label=None)

    export_date(df,
                "data-contagious",
                "RIVM_NL_contagious_estimate",
                data_date=None,
                label=None)
Code example #10
File: base.py (project: robertcv/resolwe-bio-py)
    def _get_descriptors(self) -> pd.DataFrame:
        descriptors = []
        for sample in self._samples:
            sample.descriptor["sample_id"] = sample.id
            descriptors.append(sample.descriptor)

        df = pd.json_normalize(descriptors).set_index("sample_id")

        # Keep only numeric / string types:
        column_types = {}
        prefix = "XXX"
        for (schema, _,
             path) in iterate_schema(sample.descriptor,
                                     sample.descriptor_schema.schema,
                                     path=prefix):
            field_type = schema["type"]
            field_name = path[len(prefix) + 1:]

            # This can happen if this field has a None value in all descriptors
            if field_name not in df:
                continue

            if field_type == "basic:string:":
                column_types[field_name] = str
            elif field_type == "basic:integer:":
                # Pandas cannot cast NaN's to int, but it can cast them
                # to pd.Int64Dtype
                column_types[field_name] = pd.Int64Dtype()
            elif field_type == "basic:decimal:":
                column_types[field_name] = float

        df = df[column_types.keys()].astype(column_types)

        return df
Code example #11
def convert_int64(df, all_numeric=False):
    """Convert all int64 (numpy) to Int64 (pandas) to better handle null values"""
    for col, dtype in df.dtypes.items():
        if dtype == 'int64' or (all_numeric and dtype == 'float64'):
            df[col] = df[col].astype(pd.Int64Dtype())

    return df
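
A possible use of this helper, with made-up data, assuming the convert_int64 defined above is in scope:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [1.0, np.nan, 3.0]})
df = convert_int64(df)
print(df.dtypes)  # a becomes Int64; b stays float64
df = convert_int64(df, all_numeric=True)
print(df.dtypes)  # now b is Int64 too, with the NaN preserved as <NA>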
Code example #12
    def importSudoku(self, sudokuName):
        sudoku = pd.read_csv(SUDOKUPATH + sudokuName.lower() + ".csv",
                             header=None,
                             dtype=pd.Int64Dtype()).fillna(0).values
        if self.checkValidity(sudoku):
            return sudoku
        return None
Code example #13
    def load_toi_dispositions_in_project_format(self) -> pd.DataFrame:
        """
        Loads the ExoFOP TOI table information from CSV to a data frame using a project consistent naming scheme.

        :return: The data frame of the TOI dispositions table.
        """
        columns_to_use = [
            'TIC ID', 'TFOPWG Disposition', 'Planet Num', 'Epoch (BJD)',
            'Period (days)', 'Duration (hours)', 'Sectors'
        ]
        dispositions = pd.read_csv(self.toi_dispositions_path,
                                   usecols=columns_to_use)
        dispositions.rename(columns={
            'TFOPWG Disposition': 'disposition',
            'Planet Num': 'planet_number',
            'Epoch (BJD)': 'transit_epoch',
            'Period (days)': 'transit_period',
            'Duration (hours)': 'transit_duration',
            'Sectors': 'Sector'
        },
                            inplace=True)
        dispositions['disposition'] = dispositions['disposition'].fillna('')
        dispositions = dispositions[dispositions['Sector'].notna()]
        dispositions['Sector'] = dispositions['Sector'].str.split(',')
        dispositions = dispositions.explode('Sector')
        dispositions['Sector'] = pd.to_numeric(dispositions['Sector']).astype(
            pd.Int64Dtype())
        return dispositions
Code example #14
    def retrieve_toi_dispositions_from_exofop() -> pd.DataFrame:
        """
        Downloads and loads the ExoFOP TOI table information from CSV to a data frame using a project consistent format.
        The source for the dispositions is from the `ExoFOP TOI table
        <https://exofop.ipac.caltech.edu/tess/download_toi.php?sort=toi&output=csv>`_.

        :return: The data frame containing the dispositions.
        """
        toi_csv_url = 'https://exofop.ipac.caltech.edu/tess/download_toi.php?sort=toi&output=csv'
        columns_to_use = [
            'TIC ID', 'TFOPWG Disposition', 'Planet Num', 'Epoch (BJD)',
            'Period (days)', 'Duration (hours)', 'Sectors'
        ]
        dispositions = pd.read_csv(toi_csv_url, usecols=columns_to_use)
        dispositions['Sectors'] = dispositions['Sectors'].astype(str)
        dispositions.rename(columns={
            'Planet Num': 'Planet number',
            'Epoch (BJD)': 'Transit epoch (BJD)',
            'Period (days)': 'Transit period (days)',
            'Duration (hours)': 'Transit duration (hours)',
            'Sectors': 'Sector',
            'TFOPWG Disposition': 'TFOPWG disposition'
        },
                            inplace=True)
        dispositions['Sector'] = dispositions['Sector'].str.split(',')
        dispositions = dispositions.explode('Sector')
        dispositions['Sector'] = pd.to_numeric(dispositions['Sector'],
                                               errors='coerce').astype(
                                                   pd.Int64Dtype())
        return dispositions
Code example #15
def user_stats(df):
    """Displays statistics on bikeshare users."""

    print('\nCalculating User Stats...\n')
    start_time = time.time()

    # Display counts of user types
    print(df['User Type'].value_counts())
    df['User Type'].value_counts().plot(ylabel='# of users', kind='bar')
    
    print('\n\n')

    # Display counts of gender
    if 'Gender' in df.columns:
        print(df['Gender'].value_counts())
        df['Gender'].value_counts().plot(ylabel='# of users', kind='bar')
        
        print('\n\n')


    # Display earliest, most recent, and most common year of birth
    if 'Birth Year' in df.columns:
        df['Birth Year'] = pd.to_numeric(df['Birth Year'], errors='coerce').astype(pd.Int64Dtype())
        year = df['Birth Year']
        print(f'Earliest birth year is: {year.min()}\nmost recent is: {year.max()}\nand most common birth year is: {year.mode()[0]}')
        df.groupby('Birth Year')['User Type'].count().plot(xlabel='# of users', kind = 'barh', figsize=(20,60))
        

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
Code example #16
def matches_json_to_df(matches_json):
    """Convert a matches JSON format into a Pandas data frame."""
    matches_df = pd.DataFrame([_flatten_match_json(x) for x in matches_json])
    matches_df['seriesId'] = matches_df['seriesId'].astype(pd.Int64Dtype())
    matches_df['startTimestamp'] = matches_df['startDate'].values
    matches_df['startDate'] = pd.to_datetime(matches_df.startDate, unit='ms')
    matches_df.set_index("matchId", inplace=True)
    col_names = [
        'startDate', 'league_name', 'radiant_name', 'dire_name',
        'radiantVictory', 'radiant_nicknames', 'dire_nicknames'
    ]
    for colname in [
            'radiant_nicknames', 'dire_nicknames', 'radiant_players',
            'dire_players'
    ]:
        matches_df[colname] = matches_df[colname].apply(lambda x: tuple(x))
    cols = np.concatenate(
        [col_names, matches_df.columns[~matches_df.columns.isin(col_names)]])
    matches_df = matches_df.loc[:, cols]
    matches_df.sort_values('startDate', inplace=True)

    # Strip all string columns.
    for colname in ['league_name', 'radiant_name', 'dire_name']:
        matches_df[colname] = matches_df[colname].str.strip()
    return matches_df
Code example #17
def test_pandas_parser_nan_with_field_type_information_1143():
    descriptor = {
        "dialect": {
            "delimiter": ","
        },
        "name": "issue-1109",
        "path": "data/issue-1109.csv",
        "schema": {
            "fields": [
                {
                    "name": "int",
                    "type": "integer"
                },
                {
                    "name": "number",
                    "type": "number"
                },
                {
                    "name": "string",
                    "type": "string"
                },
            ]
        },
    }
    res = Resource(descriptor)
    df = res.to_pandas()
    assert all(df.dtypes.values == pd.array([pd.Int64Dtype(), float, object]))
Code example #18
File: test_int_null.py (project: wwjiang007/Alink)
    def test_null_values(self):
        data = np.array([["dadf", 10.9, -2, False], ["c", 100.9, 1, True],
                         [None, None, None, None]])

        df = pd.DataFrame({
            "col1": data[:, 0],
            "col2": data[:, 1],
            "col3": data[:, 2],
            "col4": data[:, 3]
        })
        inOp = dataframeToOperator(
            df,
            schemaStr='col1 string, col2 double, col3 long, col4 boolean',
            op_type='batch')
        res = inOp.collectToDataframe()
        print(res.dtypes)
        print(res)
        self.assertEqual(res.dtypes[0], pd.StringDtype())
        self.assertEqual(res.dtypes[1], np.float64)
        self.assertEqual(res.dtypes[2], pd.Int64Dtype())
        self.assertEqual(res.dtypes[3], pd.BooleanDtype())
        self.assertTrue(pd.isna(res["col1"][2]))
        self.assertTrue(pd.isna(res["col2"][2]))
        self.assertTrue(pd.isna(res["col3"][2]))
        self.assertTrue(pd.isna(res["col4"][2]))
Code example #19
def convert_csv_to_json_municipality(fn, export_fn):

    df = pd.read_csv(fn)

    df_wide = df \
        .pivot_table(
            index=['Datum', "Provincienaam", "Provinciecode", "Gemeentenaam", "Gemeentecode"],
            columns='Type',
            values=['Aantal', 'AantalCumulatief']
        )
    df_wide.columns = ["{}{}".format(v.lower(), k) for k, v in df_wide]
    df_wide.reset_index(inplace=True)

    # convert types
    for col in [
            "Provinciecode", "Gemeentecode", "totaalAantal",
            "ziekenhuisopnameAantal", "overledenAantal",
            "totaalAantalCumulatief", "ziekenhuisopnameAantalCumulatief",
            "overledenAantalCumulatief"
    ]:
        df_wide[col] = df_wide[col].astype(pd.Int64Dtype())

    json_str = df_wide.to_json(orient="table", indent=2, index=False)

    json_dict = json.loads(json_str)
    del json_dict["schema"]["pandas_version"]
    json_dict["apiVersion"] = "0.1"

    Path(DATA_FOLDER, "data-municipal").mkdir(exist_ok=True, parents=True)

    with open(Path(DATA_FOLDER, "data-municipal", export_fn), "w") as f:
        json.dump(json_dict, f, indent=2)
Code example #20
def test_extension_types(df):
    df["c"] = pd.Series(np.arange(100.0))
    df["d"] = pd.Series(np.arange(100), dtype=pd.Int64Dtype())
    df.loc[df.index[::2], "c"] = np.nan
    df.loc[df.index[::2], "d"] = pd.NA
    res = Description(df)
    np.testing.assert_allclose(res.frame.c, res.frame.d)
Code example #21
def test_convert_to_nullable_types():
    x = [i for i in range(10)]

    df = pd.DataFrame({
        "int32": pd.Series(x, dtype="int32"),
        "int64": pd.Series(x, dtype="int64"),
        "float": pd.Series(x, dtype="float64"),
        "bool": pd.Series(x, dtype=bool),
    })

    assert list(df.dtypes) == [
        dtype("int32"),
        dtype("int64"),
        dtype("float64"),
        dtype("bool"),
    ]

    df_converted = _convert_to_nullable_types(df)

    assert list(df_converted.dtypes) == [
        pd.Int32Dtype(),
        pd.Int64Dtype(),
        dtype("float64"),
        dtype("bool"),
    ]
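
For comparison, pandas ships a built-in with similar intent: DataFrame.convert_dtypes() (pandas >= 1.0) infers nullable extension types, though unlike the _convert_to_nullable_types helper tested above it also converts bool and object columns, and it will turn integer-valued floats into Int64. A quick sketch:

import pandas as pd

df = pd.DataFrame({"i": [1, 2], "f": [1.0, 2.0], "b": [True, False]})
print(df.convert_dtypes().dtypes)  # i: Int64, f: Int64, b: boolean (details vary by pandas version)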
Code example #22
def get_gem():
    """Grab the GEM data from the inputs directory and do light cleaning."""
    # grab the data
    gem = pd.read_excel(PATH_GEM)
    gem = pudl.helpers.simplify_columns(gem).dropna(how='all')
    # lets only look at the columns we need.
    gem = (gem[[
        'plant_name', 'unit_name', 'other_ids_location', 'other_ids_unit'
    ]].add_suffix('_gem'))
    # extract the plant IDs and generator IDS from these two unit columns.
    # assign types! force the plant id into an float before a nullable int
    # hopefully pandas will let you jump straight to ints soon...
    # https://github.com/pandas-dev/pandas/issues/25472
    gem.loc[:, 'plant_id_eia'] = gem.other_ids_location_gem.apply(
        lambda x: literal_eval(x)['EIA']).astype('float').astype('Int64')
    gem.loc[:, 'generator_ids_eia'] = gem.other_ids_unit_gem.apply(
        lambda x: literal_eval(x)['EIA'])
    # set a GEM specific unit id so we can make a table with
    # plants, "units" and generator IDs
    gem = (gem.reset_index().rename(columns={
        'index': 'unit_id_gem'
    }).astype({
        'unit_id_gem': pd.Int64Dtype()
    }).drop(columns=['other_ids_location_gem', 'other_ids_unit_gem']))
    return gem
Code example #23
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    if use_extension_types:
        return {
            IntegerType.INT8: pd.Int8Dtype(),
            IntegerType.UINT8: pd.UInt8Dtype(),
            IntegerType.INT16: pd.Int16Dtype(),
            IntegerType.UINT16: pd.UInt16Dtype(),
            IntegerType.INT24: pd.Int32Dtype(),
            IntegerType.UINT24: pd.Int32Dtype(),
            IntegerType.INT32: pd.Int32Dtype(),
            IntegerType.UINT32: pd.UInt32Dtype(),
            IntegerType.INT64: pd.Int64Dtype(),
            IntegerType.UINT64: pd.UInt64Dtype(),
        }
    else:
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }
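
Pandas has no 24-bit dtypes, so the 24-bit cases above are widened: the extension branch maps both INT24 and UINT24 to pd.Int32Dtype() (safe, since 2**24 - 1 < 2**31 - 1), while the numpy branch keeps UINT24 unsigned as np.uint32. A hypothetical call, assuming the project's IntegerType enum is importable:

mapping = integer_type_mapping(use_extension_types=True)
print(mapping[IntegerType.UINT24])  # Int32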
Code example #24
def get_nems_year(year):
    """
    Get NEMS data for a single year.

    Args:
        year (integer): four-digit year. Must be a key in NEMS_FILE_NAMES.

    Returns:
        pandas.DataFrame:
    """
    if year not in NEMS_FILE_NAMES:
        raise AssertionError(
            f"{year} not in NEMS_FILE_NAMES map. Add new year and file to "
            "NEMS_FILE_NAMES or try different year. Current years are: "
            f"{NEMS_FILE_NAMES.keys}")
    nems_df = (
        pd.read_csv(
            PATH_INPUTS / NEMS_FILE_NAMES[year],
            delimiter=":",
            names=get_nems_headers(),
            dtype={
                'Plant ID': pd.Int64Dtype(),
                'EFD Fuel Codes.1': 'string',  # this is for memory
                'EFD Fuel Codes.2': 'string',  # mixed string/int cols
            }).assign(report_year=year, report_date=f'{year}-01-01'))

    return nems_df
Code example #25
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
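
A function with this exact shape can be handed to pyarrow's Table.to_pandas via its types_mapper argument, which expects a callable taking a pa.DataType and returning an extension dtype or None. A sketch, assuming pyarrow is installed and pyarrow2pandas_extension is in scope:

import pyarrow as pa

table = pa.table({"n": pa.array([1, None, 3], type=pa.int64())})
df = table.to_pandas(types_mapper=pyarrow2pandas_extension)
print(df["n"].dtype)  # Int64: the null survives as <NA> instead of forcing float64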
Code example #26
def get_nems():
    """Grab NEMS and perform basic column cleaning."""
    # first grab each year of NEMS data
    nems_dfs = []
    for year in NEMS_FILE_NAMES.keys():
        nems_dfs.append(get_nems_year(year))
    # then squish them together and apply common cleaning
    nems_df = (pd.concat(nems_dfs).rename(
        columns={
            'Plant ID': 'plant_id_eia',
            'Unit ID': 'generator_id',
            'Name Plate Capacity (shared if multiple owners) (MW)':
            'capacity_mw',
            'Average Capacity Factor': 'capacity_factor',
            'Annual Investment in Capital Additions (87$/kW)':
            'capex_annual_per_kw_87',
            'Variable O&M Cost (87$/MWH)': 'variable_om_mwh_87',
            'Fixed O&M Cost (87$/kW)': 'fixed_om_kw_87',
        }).astype({
            'report_date': 'datetime64[ns]',
            'report_year': pd.Int64Dtype(),
            'fixed_om_kw_87': 'float32',
            'variable_om_mwh_87': 'float32',
        }))
    # there's a ton of trailing zeros on the generator_id column which gotta go
    # this is the only string column we need rn, but beware if others are used
    nems_df.loc[nems_df['generator_id'].notnull(), 'generator_id'] = (
        nems_df.loc[nems_df['generator_id'].notnull(),
                    'generator_id'].astype(str).str.strip())
    return nems_df
Code example #27
def create_informative_trials_column(
    df: pd.DataFrame,
    as_column: str = "Wait Index",
    group_by: List[str] = ("Subject", "Stimulus Vocalizer",
                           "Stimulus Call Type", "Stimulus Class"),
    in_place: bool = False,
):
    """Create a column showing the informative trial index of each stim relative to the start of the experiment

    By default, groups trials by Subject, Stimulus Vocalizer, Stimulus Call Type, and Stimulus Class (rewarded/nonrewarded)

    To do informative trial analysis for a given block (e.g. the informative trials since the start of the last day of testing)
    these need to be shifted by subtracting off the "Wait Index" of the first trial prior to the requested range.

    The normal procedure to do this would be to create a slice of the dataframe you would like to analyze and then
    call create_informative_trials_column() on it.
    """
    group_by = list(group_by)

    if not in_place:
        df = df.copy()

    df[as_column] = np.nan
    df[as_column] = df[as_column].astype(pd.Int64Dtype())
    # Loop over each stimulus, and count the number of non-interrupted trials that have been seen previously
    for _, sub_df in df.groupby(group_by):
        waits_seen = 0
        for idx in sub_df.index:
            df.loc[idx, as_column] = waits_seen
            if sub_df.loc[idx, "Interrupt"] == False:
                waits_seen += 1

    return df
Code example #28
    def fix_colnames(df):
        '''
        Standardise column names and typing.
        '''
        cols = {
            colname: colname.casefold().replace('?', '').replace('  ', ' ')
            for colname in df.columns
        }
        df = df.rename(columns=cols)
        # This is not the most elegant solution but workable for now
        df['phone number'] = df['phone number'].astype(
            str).apply(lambda x: x.replace(' ', '')).apply(
                lambda x: f'0{x.replace(".0", "")}'
            )  #if not x.startswith('0') else x.replace(".0", ""))
        for col in [
                'age', 'number of adults',
                'how many of these adults are over 70', 'number of children',
                'how many of these children are over 12'
        ]:
            df[col] = df[col].astype(pd.Int64Dtype())
        for col in [
                'vegetarian, halal or kosher', 'do they have a cooker',
                'do they have a hob', 'do they have a kettle',
                'do they have a microwave'
        ]:  # boolean conversion
            df[col] = df[col].apply(lambda x: True
                                    if x.casefold() == 'yes' else False)
        return df
Code example #29
def convert_data(df, column, type):
    if type == int:
        try:
            casted_vals = []
            for val in df[column]:
                if pd.notna(val):
                    casted_vals.append(int(val))
                else:
                    casted_vals.append(val)
            df[column] = pd.Series(casted_vals, dtype=pd.Int64Dtype())
        except:
            try:
                df = df.astype({column: float})
            except:
                pass
    elif type == float:
        try:
            df = df.astype({column: float})
        except:
            pass
    elif type == bool:
        df[column] = df[column].map(BOOLEAN_VALUES)
    else:
        return df
    return df
Code example #30
def test_issue502(summarizer, typeset):
    series = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                       dtype=pd.Int64Dtype())

    result = describe_1d(series, summarizer, typeset)
    assert result["min"] == 1
    assert result["max"] == 11