def links(self):
    """
    The movie link table, connecting movie IDs to external identifiers.  It is
    indexed by movie ID.

    >>> mlsmall = MovieLens('data/ml-latest-small')
    >>> mlsmall.links
          imdbId  tmdbId
    item
    1     114709     862
    2     113497    8844
    3     113228   15602
    4     114885   31357
    5     113041   11862
    ...
    [9125 rows x 2 columns]
    """
    csv_path = self.path / 'links.csv'
    column_types = {
        'movieId': np.int32,
        'imdbId': np.int64,
        # Read through the nullable integer dtype rather than a numpy int.
        'tmdbId': pd.Int64Dtype(),
    }
    table = (
        pd.read_csv(csv_path, dtype=column_types)
        .rename(columns={'movieId': 'item'})
        .set_index('item')
    )
    _log.debug('loaded %s, takes %d bytes', csv_path,
               table.memory_usage().sum())
    return table
def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    sparse_floats = (
        pd.Series(range(100))
        .where(lambda s: s < 5, other=np.nan)
        .astype("Sparse[float]")
    )
    # Each case: (dtype under test, sample data, extra SeriesSchema kwargs).
    cases = [
        (pd.CategoricalDtype(),
         pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"),
         None),
        (pd.DatetimeTZDtype(tz='UTC'),
         pd.Series(pd.date_range(start="20200101", end="20200301"),
                   dtype="datetime64[ns, utc]"),
         None),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (pd.StringDtype(),
         pd.Series(["foo", "bar", "baz"], dtype="string"),
         None),
        (pd.PeriodDtype(freq='D'),
         pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')),
         None),
        (pd.SparseDtype("float"), sparse_floats, {"nullable": True}),
        (pd.BooleanDtype(),
         pd.Series([1, 0, 0, 1, 1], dtype="boolean"),
         None),
        (pd.IntervalDtype(subtype="int64"),
         pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
         None),
    ]

    for dtype, data, extra_kwargs in cases:
        schema = SeriesSchema(pandas_dtype=dtype, **(extra_kwargs or {}))
        assert isinstance(schema.validate(data), pd.Series)
def test_reductions_frame_dtypes():
    """Compare dask reductions with pandas on a frame of mixed dtypes."""
    df = pd.DataFrame({
        'int': [1, 2, 3, 4, 5, 6, 7, 8],
        'float': [1., 2., 3., 4., np.nan, 6., 7., 8.],
        'dt': [pd.NaT] + [datetime(2011, i, 1) for i in range(1, 8)],
        'str': list('abcdefgh'),
    })
    if HAS_INT_NA:
        df['intna'] = pd.array([1, 2, 3, 4, None, 6, 7, 8],
                               dtype=pd.Int64Dtype())

    ddf = dd.from_pandas(df, 3)

    # Reductions called with default arguments.
    for reduction in ('sum', 'prod', 'min', 'max', 'count',
                      'std', 'var', 'sem'):
        assert_eq(getattr(df, reduction)(), getattr(ddf, reduction)())

    # Dispersion statistics again, without the degrees-of-freedom correction.
    for reduction in ('std', 'var', 'sem'):
        assert_eq(getattr(df, reduction)(ddof=0),
                  getattr(ddf, reduction)(ddof=0))

    dask_mean = ddf.mean()
    pandas_mean = df.mean()
    assert_eq(pandas_mean, dask_mean)

    assert_eq(df._get_numeric_data(), ddf._get_numeric_data())

    # Selecting numeric data from an all-numeric frame leaves the graph as is.
    numerics = ddf[['int', 'float']]
    assert numerics._get_numeric_data().dask == numerics.dask
def download_rivm_data():
    """Add the RIVM totals reported for ``DATE`` to the national CSV.

    Reads the JSON feed at ``URL``, sums the rows whose ``Date_of_report``
    equals ``DATE`` at 10:00, and appends three rows (``Totaal``,
    ``Ziekenhuisopname``, ``Overleden``) to
    ``data/rivm_NL_covid19_national.csv`` unless that file already holds rows
    for ``DATE``.  Progress is reported with ``print``.
    """
    verify_dataset()

    print("Connecting...")
    data = pd.read_json(URL)

    report_time = f'{datetime.datetime.combine(DATE, datetime.time(10, 0))}'
    report_date = f'{DATE}'

    if report_time not in list(data["Date_of_report"]):
        print(f"Data for {DATE} not (yet) available")
        return

    totals = data[data["Date_of_report"] == report_time].sum()
    new_rows = pd.DataFrame({
        'Datum': [report_date] * 3,
        'Type': ['Totaal', 'Ziekenhuisopname', 'Overleden'],
        'Aantal': [
            totals['Total_reported'],
            totals['Hospital_admission'],
            totals['Deceased'],
        ],
    })

    export_path = Path("data", "rivm_NL_covid19_national.csv")
    df_reported = pd.read_csv(export_path)
    df_reported['Aantal'] = df_reported["Aantal"].astype(pd.Int64Dtype())

    # Compare against the column values themselves: str(Series) is truncated
    # for long series, so a substring search on it can miss existing dates.
    if df_reported['Datum'].astype(str).eq(report_date).any():
        print('RIVM file is already up to date')
        return

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and ignore_index=True already yields a fresh 0..n-1 index.
    df_reported = pd.concat([df_reported, new_rows], ignore_index=True)

    print(f"Export {export_path}")
    df_reported.to_csv(export_path, index=False)
def mapTaxIDs(idmapping: os.path,
              df: pd.DataFrame,
              onColumns: list,
              dropUnmatched: bool = True):
    """
    Maps the taxIds for each uniprotId on the given columns of the dataframe.

    Args:
        idmapping: id-mapping resource forwarded unchanged to
            ``TaxaMapping.mapUniprot2Taxid_Uniprot``
        df (pd.DataFrame): dataframe to map (modified in place)
        onColumns (List[str]): columns of the dataframe where the uniprotids are
        dropUnmatched (bool): whether to drop or not those rows that did not
            have a match (no taxon assigned)

    Returns:
        df: same dataframe as passed in but with the taxIDs columns added
            with the following format <onColumn>_taxID
    """
    all_uniprotids = set()
    for column in onColumns:
        all_uniprotids.update(df[column].unique())

    logger.info("Starting taxID assignment...")
    uniproid2taxid = TaxaMapping.mapUniprot2Taxid_Uniprot(
        all_uniprotids, idmapping)

    taxid_columns = []
    for column in onColumns:
        taxid_column = f"{column}_taxID"
        # Unmatched ids become pd.NA, hence the nullable integer dtype.
        df[taxid_column] = df[column].apply(
            lambda x: uniproid2taxid.get(x, pd.NA)).astype(pd.Int64Dtype())
        taxid_columns.append(taxid_column)

    if dropUnmatched:
        # Restrict the check to the taxID columns so rows are dropped only
        # when no taxon was assigned, not for NaNs in unrelated columns.
        df.dropna(subset=taxid_columns, inplace=True)
    return df
def test_coerce_validate_nullable_int_unsupported(self):
    """ProcessResult.coerce() raises ValueError for a nullable-int column."""
    # We don't support nullable integer columns ... yet
    nullable_ints = pd.Series([1, np.nan], dtype=pd.Int64Dtype())
    dataframe = pd.DataFrame({'A': nullable_ints})
    with self.assertRaisesRegex(ValueError, 'unsupported dtype'):
        ProcessResult.coerce(dataframe)
def load_toi_dispositions_in_project_format(self) -> pd.DataFrame:
    """
    Loads the ExoFOP TOI table information from CSV to a data frame using a
    project consistent naming scheme.

    Rows without sector information are dropped, and the comma-separated
    sector list is expanded to one row per sector.

    :return: The data frame of the TOI dispositions table.
    """
    columns_to_use = [
        'TIC ID', 'TFOPWG Disposition', 'Planet Num', 'Epoch (BJD)',
        'Period (days)', 'Duration (hours)', 'Sectors'
    ]
    disposition = ToiColumns.disposition.value
    sector = ToiColumns.sector.value

    dispositions = pd.read_csv(self.dispositions_path, usecols=columns_to_use)
    dispositions.rename(columns={
        'TFOPWG Disposition': disposition,
        'Planet Num': ToiColumns.planet_number.value,
        'Epoch (BJD)': ToiColumns.transit_epoch__bjd.value,
        'Period (days)': ToiColumns.transit_period__days.value,
        'Duration (hours)': ToiColumns.transit_duration.value,
        'Sectors': sector
    }, inplace=True)
    dispositions[disposition] = dispositions[disposition].fillna('')

    # Copy the filtered rows so the column assignments below act on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    dispositions = dispositions[dispositions[sector].notna()].copy()
    dispositions[sector] = dispositions[sector].str.split(',')
    dispositions = dispositions.explode(sector)
    dispositions[sector] = pd.to_numeric(
        dispositions[sector]).astype(pd.Int64Dtype())
    return dispositions
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.

    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day
    """
    # Load data into data frame
    df = pd.read_csv(CITY_DATA[city])

    # Change column names for easier typing and access:
    # spaces removed, everything lower-cased.
    df.rename(
        columns={name: name.replace(' ', '').lower() for name in df.columns},
        inplace=True)

    if 'birthyear' in df.columns:  # birthyear not available for all cities
        # Birth year is a float because it contains NAs.
        # Cast to pandas nullable integer type.
        df['birthyear'] = df['birthyear'].astype(pd.Int64Dtype())

    # Convert the start time column to datetime
    df['starttime'] = pd.to_datetime(df['starttime'])

    # Extract month and day of the week from start time to new columns.
    # Series.dt.weekday_name was removed in pandas 1.0; day_name() returns
    # the same weekday names.
    df['month'] = df['starttime'].dt.month
    df['dayofweek'] = df['starttime'].dt.day_name()

    # Filter by month if applicable
    if month != 'all':
        # Index the months to get the corresponding int
        month = months.index(month) + 1
        df = df[df['month'] == month]

    if day != 'all':
        df = df[df['dayofweek'].str.match(day, case=False)]

    return df
def main_infectious():
    """Export the infectious-people estimates, normalized and absolute.

    Both tables are fetched with ``_get_data`` and written with
    ``export_date``; the absolute table's ``Waarde`` column is cast to the
    nullable integer dtype first.
    """
    df_normalized = _get_data(
        "infectious_people_count_normalized",
        {
            "infectious_low_normalized": "Minimum",
            "infectious_high_normalized": "Maximum",
            "infectious_avg_normalized": "Geschat aantal besmettelijke mensen",
        },
        "date_of_report_unix")

    df = _get_data(
        "infectious_people_count",
        {
            "infectious_low": "Minimum",
            "infectious_high": "Maximum",
            "infectious_avg": "Geschat aantal besmettelijke mensen",
        },
        "date_of_report_unix")
    df["Waarde"] = df["Waarde"].astype(pd.Int64Dtype())

    # The target is nested two levels below DATA_FOLDER, so the intermediate
    # directory has to be created as well when it does not exist yet.
    Path(DATA_FOLDER, "data-contagious/data-contagious_estimates").mkdir(
        parents=True, exist_ok=True)

    export_date(df_normalized,
                "data-contagious",
                "RIVM_NL_contagious_estimate_normalized",
                data_date=None,
                label=None)
    export_date(df,
                "data-contagious",
                "RIVM_NL_contagious_estimate",
                data_date=None,
                label=None)
def _get_descriptors(self) -> pd.DataFrame:
    """Return the sample descriptors as a typed frame indexed by sample id.

    Each sample's descriptor is tagged with its ``sample_id`` (the descriptor
    dict itself is updated), flattened with ``pd.json_normalize``, and reduced
    to the string, integer and decimal fields found in the schema.
    """
    descriptors = []
    for sample in self._samples:
        sample.descriptor["sample_id"] = sample.id
        descriptors.append(sample.descriptor)

    df = pd.json_normalize(descriptors).set_index("sample_id")

    # Keep only numeric / string types.
    dtype_for_field_type = {
        "basic:string:": str,
        # Pandas cannot cast NaN's to int, but it can cast them
        # to pd.Int64Dtype
        "basic:integer:": pd.Int64Dtype(),
        "basic:decimal:": float,
    }

    prefix = "XXX"
    column_types = {}
    # The schema walk uses the last sample visited by the loop above.
    schema_walk = iterate_schema(sample.descriptor,
                                 sample.descriptor_schema.schema,
                                 path=prefix)
    for (schema, _, path) in schema_walk:
        field_type = schema["type"]
        field_name = path[len(prefix) + 1:]
        # A field can be absent from the frame when it has a None value in
        # all descriptors.
        if field_name in df and field_type in dtype_for_field_type:
            column_types[field_name] = dtype_for_field_type[field_type]

    df = df[column_types.keys()].astype(column_types)
    return df
def convert_int64(df, all_numeric=False):
    """Convert all int64 (numpy) to Int64 (pandas) to better handle null values.

    With ``all_numeric=True`` float64 columns are converted as well.  The
    frame is modified in place and also returned.
    """
    wanted = ('int64', 'float64') if all_numeric else ('int64',)
    nullable_int = pd.Int64Dtype()
    for col, dtype in df.dtypes.items():
        if any(dtype == name for name in wanted):
            df[col] = df[col].astype(nullable_int)
    return df
def importSudoku(self, sudokuName):
    """Load ``<SUDOKUPATH><name>.csv`` as a grid of values.

    The name is lower-cased to build the file name and empty cells are read
    as 0.  Returns the grid when ``self.checkValidity`` accepts it, otherwise
    ``None``.
    """
    csv_file = SUDOKUPATH + sudokuName.lower() + ".csv"
    frame = pd.read_csv(csv_file, header=None, dtype=pd.Int64Dtype())
    grid = frame.fillna(0).values
    return grid if self.checkValidity(grid) else None
def load_toi_dispositions_in_project_format(self) -> pd.DataFrame:
    """
    Loads the ExoFOP TOI table information from CSV to a data frame using a
    project consistent naming scheme.

    Rows without sector information are dropped, and the comma-separated
    sector list is expanded to one row per sector.

    :return: The data frame of the TOI dispositions, one row per sector.
    """
    columns_to_use = [
        'TIC ID', 'TFOPWG Disposition', 'Planet Num', 'Epoch (BJD)',
        'Period (days)', 'Duration (hours)', 'Sectors'
    ]
    dispositions = pd.read_csv(self.toi_dispositions_path,
                               usecols=columns_to_use)
    dispositions.rename(columns={
        'TFOPWG Disposition': 'disposition',
        'Planet Num': 'planet_number',
        'Epoch (BJD)': 'transit_epoch',
        'Period (days)': 'transit_period',
        'Duration (hours)': 'transit_duration',
        'Sectors': 'Sector'
    }, inplace=True)
    dispositions['disposition'] = dispositions['disposition'].fillna('')

    # Copy the filtered rows so the column assignments below act on an
    # independent frame instead of a slice (avoids SettingWithCopyWarning).
    dispositions = dispositions[dispositions['Sector'].notna()].copy()
    dispositions['Sector'] = dispositions['Sector'].str.split(',')
    dispositions = dispositions.explode('Sector')
    dispositions['Sector'] = pd.to_numeric(dispositions['Sector']).astype(
        pd.Int64Dtype())
    return dispositions
def retrieve_toi_dispositions_from_exofop() -> pd.DataFrame:
    """
    Downloads and loads the ExoFOP TOI table information from CSV to a data
    frame using a project consistent format.  The source for the dispositions
    is from the `ExoFOP TOI table
    <https://exofop.ipac.caltech.edu/tess/download_toi.php?sort=toi&output=csv>`_.

    The comma-separated sector list is expanded to one row per sector; sector
    entries that are not numeric become missing values.

    :return: The data frame containing the dispositions.
    """
    toi_csv_url = 'https://exofop.ipac.caltech.edu/tess/download_toi.php?sort=toi&output=csv'
    source_columns = [
        'TIC ID', 'TFOPWG Disposition', 'Planet Num', 'Epoch (BJD)',
        'Period (days)', 'Duration (hours)', 'Sectors'
    ]
    project_names = {
        'Planet Num': 'Planet number',
        'Epoch (BJD)': 'Transit epoch (BJD)',
        'Period (days)': 'Transit period (days)',
        'Duration (hours)': 'Transit duration (hours)',
        'Sectors': 'Sector',
        'TFOPWG Disposition': 'TFOPWG disposition'
    }

    table = pd.read_csv(toi_csv_url, usecols=source_columns)
    # Force text so the string split below applies to every row.
    table['Sectors'] = table['Sectors'].astype(str)
    table = table.rename(columns=project_names)

    table['Sector'] = table['Sector'].str.split(',')
    table = table.explode('Sector')
    sector_numbers = pd.to_numeric(table['Sector'], errors='coerce')
    table['Sector'] = sector_numbers.astype(pd.Int64Dtype())
    return table
def user_stats(df):
    """Displays statistics on bikeshare users.

    Prints and plots counts per user type and, when the columns exist, per
    gender and per birth year.  The 'Birth Year' column of ``df`` is
    converted to a nullable integer column in place.
    """

    def _show_counts(column):
        # Print the value counts of a column and draw them as a bar chart.
        print(df[column].value_counts())
        df[column].value_counts().plot(ylabel='# of users', kind='bar')
        print('\n\n')

    print('\nCalculating User Stats...\n')
    start_time = time.time()

    # Display counts of user types
    _show_counts('User Type')

    # Display counts of gender
    if 'Gender' in df.columns:
        _show_counts('Gender')

    # Display earliest, most recent, and most common year of birth
    if 'Birth Year' in df.columns:
        numeric_years = pd.to_numeric(df['Birth Year'], errors='coerce')
        df['Birth Year'] = numeric_years.astype(pd.Int64Dtype())
        year = df['Birth Year']
        earliest = year.min()
        latest = year.max()
        most_common = year.mode()[0]
        print(f'Earliest birth year is: {earliest}\nmost recent is: {latest}\nand most comon birth year is: {most_common}')
        per_year = df.groupby('Birth Year')['User Type'].count()
        per_year.plot(xlabel='# of users', kind='barh', figsize=(20, 60))

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-' * 40)
def matches_json_to_df(matches_json):
    """Convert a matches JSON format into a Pandas data frame.

    The result is indexed by ``matchId``, sorted by ``startDate``, and has
    the main descriptive columns first.
    """
    flattened = [_flatten_match_json(match) for match in matches_json]
    matches_df = pd.DataFrame(flattened)

    matches_df['seriesId'] = matches_df['seriesId'].astype(pd.Int64Dtype())
    # Keep the raw millisecond value before startDate becomes a datetime.
    matches_df['startTimestamp'] = matches_df['startDate'].values
    matches_df['startDate'] = pd.to_datetime(matches_df.startDate, unit='ms')
    matches_df.set_index("matchId", inplace=True)

    sequence_columns = ('radiant_nicknames', 'dire_nicknames',
                        'radiant_players', 'dire_players')
    for colname in sequence_columns:
        matches_df[colname] = matches_df[colname].apply(
            lambda members: tuple(members))

    # Leading columns first, every remaining column after them in its
    # existing order.
    leading = [
        'startDate', 'league_name', 'radiant_name', 'dire_name',
        'radiantVictory', 'radiant_nicknames', 'dire_nicknames'
    ]
    trailing = [name for name in matches_df.columns if name not in leading]
    matches_df = matches_df.loc[:, leading + trailing]
    matches_df.sort_values('startDate', inplace=True)

    # Strip all string columns.
    for colname in ('league_name', 'radiant_name', 'dire_name'):
        matches_df[colname] = matches_df[colname].str.strip()
    return matches_df
def test_pandas_parser_nan_with_field_type_information_1143():
    """Declared field types determine the dtypes of ``Resource.to_pandas()``."""
    declared_fields = (
        ("int", "integer"),
        ("number", "number"),
        ("string", "string"),
    )
    descriptor = {
        "dialect": {"delimiter": ","},
        "name": "issue-1109",
        "path": "data/issue-1109.csv",
        "schema": {
            "fields": [
                {"name": name, "type": kind} for name, kind in declared_fields
            ]
        },
    }
    df = Resource(descriptor).to_pandas()
    expected_dtypes = pd.array([pd.Int64Dtype(), float, object])
    assert all(df.dtypes.values == expected_dtypes)
def test_null_values(self):
    """A row of None values round-trips as missing values in typed columns."""
    data = np.array([["dadf", 10.9, -2, False], ["c", 100.9, 1, True],
                     [None, None, None, None]])
    df = pd.DataFrame({
        "col1": data[:, 0],
        "col2": data[:, 1],
        "col3": data[:, 2],
        "col4": data[:, 3]
    })
    inOp = dataframeToOperator(
        df,
        schemaStr='col1 string, col2 double, col3 long, col4 boolean',
        op_type='batch')
    res = inOp.collectToDataframe()
    print(res.dtypes)
    print(res)

    # res.dtypes is labelled by column name, so positional access goes
    # through .iloc; integer keys via [] are deprecated for such a Series.
    expected_dtypes = [
        pd.StringDtype(), np.float64, pd.Int64Dtype(), pd.BooleanDtype()
    ]
    for position, expected in enumerate(expected_dtypes):
        self.assertEqual(res.dtypes.iloc[position], expected)

    # The last input row was all None, so every column is missing there.
    for column in ("col1", "col2", "col3", "col4"):
        self.assertTrue(pd.isna(res[column][2]))
def convert_csv_to_json_municipality(fn, export_fn):
    """Convert the long-format municipality CSV ``fn`` to a JSON file.

    The table is pivoted to one row per date and municipality, count columns
    are cast to nullable integers, and the result is written to
    ``<DATA_FOLDER>/data-municipal/<export_fn>`` with an ``apiVersion`` key.
    """
    source = pd.read_csv(fn)
    index_columns = [
        'Datum', "Provincienaam", "Provinciecode", "Gemeentenaam",
        "Gemeentecode"
    ]
    df_wide = source.pivot_table(index=index_columns,
                                 columns='Type',
                                 values=['Aantal', 'AantalCumulatief'])
    # Flatten the (value, type) column pairs to "<type lower-cased><value>".
    df_wide.columns = [
        f"{kind.lower()}{measure}" for measure, kind in df_wide.columns
    ]
    df_wide.reset_index(inplace=True)

    # convert types
    integer_columns = [
        "Provinciecode", "Gemeentecode", "totaalAantal",
        "ziekenhuisopnameAantal", "overledenAantal", "totaalAantalCumulatief",
        "ziekenhuisopnameAantalCumulatief", "overledenAantalCumulatief"
    ]
    df_wide = df_wide.astype(
        {col: pd.Int64Dtype() for col in integer_columns})

    json_dict = json.loads(
        df_wide.to_json(orient="table", indent=2, index=False))
    json_dict["schema"].pop("pandas_version")
    json_dict["apiVersion"] = "0.1"

    target_dir = Path(DATA_FOLDER, "data-municipal")
    target_dir.mkdir(exist_ok=True, parents=True)
    with open(Path(DATA_FOLDER, "data-municipal", export_fn), "w") as f:
        json.dump(json_dict, f, indent=2)
def test_extension_types(df):
    """A nullable Int64 column is described like the equivalent float column."""
    floats = pd.Series(np.arange(100.0))
    nullable_ints = pd.Series(np.arange(100), dtype=pd.Int64Dtype())
    df["c"] = floats
    df["d"] = nullable_ints

    # Blank out every other row in both columns.
    every_other = df.index[::2]
    df.loc[every_other, "c"] = np.nan
    df.loc[every_other, "d"] = pd.NA

    res = Description(df)
    np.testing.assert_allclose(res.frame.c, res.frame.d)
def test_convert_to_nullable_types():
    """Integer columns become nullable; float and bool columns are kept."""
    values = list(range(10))
    source_dtypes = {
        "int32": "int32",
        "int64": "int64",
        "float": "float64",
        "bool": bool,
    }
    df = pd.DataFrame({
        name: pd.Series(values, dtype=kind)
        for name, kind in source_dtypes.items()
    })
    assert list(df.dtypes) == [
        dtype("int32"),
        dtype("int64"),
        dtype("float64"),
        dtype("bool"),
    ]

    df_converted = _convert_to_nullable_types(df)
    assert list(df_converted.dtypes) == [
        pd.Int32Dtype(),
        pd.Int64Dtype(),
        dtype("float64"),
        dtype("bool"),
    ]
def get_gem():
    """Grab the GEM data from the inputs directory and do light cleaning."""

    def _eia_entry(packed_ids):
        # Parse the literal stored in the cell and take its 'EIA' entry.
        return literal_eval(packed_ids)['EIA']

    # grab the data
    raw = pd.read_excel(PATH_GEM)
    raw = pudl.helpers.simplify_columns(raw).dropna(how='all')

    # lets only look at the columns we need.
    needed = ['plant_name', 'unit_name', 'other_ids_location',
              'other_ids_unit']
    gem = raw[needed].add_suffix('_gem')

    # extract the plant IDs and generator IDS from these two unit columns.
    # assign types! force the plant id into an float before a nullable int
    # hopefully pandas will let you jump straight to ints soon...
    # https://github.com/pandas-dev/pandas/issues/25472
    plant_ids = gem.other_ids_location_gem.apply(_eia_entry)
    gem.loc[:, 'plant_id_eia'] = plant_ids.astype('float').astype('Int64')
    gem.loc[:, 'generator_ids_eia'] = gem.other_ids_unit_gem.apply(_eia_entry)

    # set a GEM specific unit id so we can make a table with
    # plants, "units" and generator IDs
    gem = gem.reset_index().rename(columns={'index': 'unit_id_gem'})
    gem = gem.astype({'unit_id_gem': pd.Int64Dtype()})
    return gem.drop(columns=['other_ids_location_gem', 'other_ids_unit_gem'])
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    """Map every ``IntegerType`` to the dtype used to hold its values.

    With ``use_extension_types`` the values are pandas nullable integer dtype
    instances, otherwise they are numpy scalar types.
    """
    # (integer type, pandas extension dtype class, numpy scalar type)
    dtype_table = (
        (IntegerType.INT8, pd.Int8Dtype, np.int8),
        (IntegerType.UINT8, pd.UInt8Dtype, np.uint8),
        (IntegerType.INT16, pd.Int16Dtype, np.int16),
        (IntegerType.UINT16, pd.UInt16Dtype, np.uint16),
        (IntegerType.INT24, pd.Int32Dtype, np.int32),
        # NOTE(review): UINT24 is signed Int32 in the extension mapping but
        # unsigned uint32 in the numpy mapping -- confirm this is intended.
        (IntegerType.UINT24, pd.Int32Dtype, np.uint32),
        (IntegerType.INT32, pd.Int32Dtype, np.int32),
        (IntegerType.UINT32, pd.UInt32Dtype, np.uint32),
        (IntegerType.INT64, pd.Int64Dtype, np.int64),
        (IntegerType.UINT64, pd.UInt64Dtype, np.uint64),
    )
    if use_extension_types:
        return {
            integer_type: extension_dtype()
            for integer_type, extension_dtype, _ in dtype_table
        }
    return {
        integer_type: numpy_type
        for integer_type, _, numpy_type in dtype_table
    }
def get_nems_year(year):
    """
    Get NEMS data for a single year.

    Args:
        year (integer): four-digit year. Must be a key in NEMS_FILE_NAMES.

    Returns:
        pandas.DataFrame: the contents of that year's NEMS file with
        ``report_year`` and ``report_date`` columns added.

    Raises:
        AssertionError: if ``year`` is not a key in NEMS_FILE_NAMES.
    """
    if year not in NEMS_FILE_NAMES:
        # Call keys(): interpolating the bare attribute would print the
        # bound-method repr instead of the available years.
        raise AssertionError(
            f"{year} not in NEMS_FILE_NAMES map. Add new year and file to "
            "NEMS_FILE_NAMES or try different year. Current years are: "
            f"{list(NEMS_FILE_NAMES.keys())}")

    nems_df = (
        pd.read_csv(
            PATH_INPUTS / NEMS_FILE_NAMES[year],
            delimiter=":",
            names=get_nems_headers(),
            dtype={
                'Plant ID': pd.Int64Dtype(),
                'EFD Fuel Codes.1': 'string',  # this is for memory
                'EFD Fuel Codes.2': 'string',  # mixed string/int cols
            })
        .assign(report_year=year, report_date=f'{year}-01-01'))
    return nems_df
def pyarrow2pandas_extension(
        dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion.

    Returns the pandas extension dtype for a pyarrow integer, boolean or
    string type, and ``None`` for any other type.
    """
    # (pyarrow type predicate, pandas extension dtype class)
    conversions = (
        (pa.types.is_int8, pd.Int8Dtype),
        (pa.types.is_int16, pd.Int16Dtype),
        (pa.types.is_int32, pd.Int32Dtype),
        (pa.types.is_int64, pd.Int64Dtype),
        (pa.types.is_uint8, pd.UInt8Dtype),
        (pa.types.is_uint16, pd.UInt16Dtype),
        (pa.types.is_uint32, pd.UInt32Dtype),
        (pa.types.is_uint64, pd.UInt64Dtype),
        (pa.types.is_boolean, pd.BooleanDtype),
        (pa.types.is_string, pd.StringDtype),
    )
    for matches, extension_dtype in conversions:
        if matches(dtype):
            return extension_dtype()
    return None
def get_nems():
    """Grab NEMS and perform basic column cleaning."""
    # first grab each year of NEMS data
    yearly_frames = [get_nems_year(year) for year in NEMS_FILE_NAMES]

    column_renames = {
        'Plant ID': 'plant_id_eia',
        'Unit ID': 'generator_id',
        'Name Plate Capacity (shared if multiple owners) (MW)': 'capacity_mw',
        'Average Capacity Factor': 'capacity_factor',
        'Annual Investment in Capital Additions (87$/kW)':
            'capex_annual_per_kw_87',
        'Variable O&M Cost (87$/MWH)': 'variable_om_mwh_87',
        'Fixed O&M Cost (87$/kW)': 'fixed_om_kw_87',
    }
    column_types = {
        'report_date': 'datetime64[ns]',
        'report_year': pd.Int64Dtype(),
        'fixed_om_kw_87': 'float32',
        'variable_om_mwh_87': 'float32',
    }

    # then squish them together and apply common cleaning
    nems_df = pd.concat(yearly_frames)
    nems_df = nems_df.rename(columns=column_renames).astype(column_types)

    # the generator_id values are cast to text and stripped wherever present;
    # this is the only string column we need rn, but beware if others are used
    has_generator_id = nems_df['generator_id'].notnull()
    cleaned_ids = (
        nems_df.loc[has_generator_id, 'generator_id'].astype(str).str.strip())
    nems_df.loc[has_generator_id, 'generator_id'] = cleaned_ids
    return nems_df
def create_informative_trials_column(
        df: pd.DataFrame,
        as_column: str = "Wait Index",
        group_by: List[str] = ("Subject", "Stimulus Vocalizer",
                               "Stimulus Call Type", "Stimulus Class"),
        in_place: bool = False,
):
    """Create a column showing the informative trial index of each stim
    relative to the start of the experiment

    By default, groups trials by Subject, Stimulus Vocalizer, Stimulus Call
    Type, and Stimulus Class (rewarded/nonrewarded).  For every row the new
    column holds the number of earlier rows in the same group whose
    "Interrupt" value is False.

    To do informative trial analysis for a given block (e.g. the informative
    trials since the start of the last day of testing) these need to be
    shifted by subtracting off the "Wait Index" of the first trial prior to
    the requested range.  The normal procedure to do this would be to create
    a slice of the dataframe you would like to analyze and then call
    create_informative_trials_column() on it.

    Args:
        df: trial table with an "Interrupt" column and the ``group_by`` columns
        as_column: name of the column to create (nullable Int64)
        group_by: columns identifying a stimulus
        in_place: modify ``df`` itself instead of a copy

    Returns:
        The frame carrying the new column (``df`` itself when ``in_place``).
    """
    group_by = list(group_by)

    if not in_place:
        df = df.copy()

    # 1 for every non-interrupted trial, 0 otherwise (missing counts as 0).
    completed = df["Interrupt"].eq(False).fillna(False).astype("int64")

    # Running count per stimulus including the current row; subtracting the
    # row's own contribution leaves the number seen previously.  Being
    # positional, this also works when index labels repeat, and it avoids one
    # .loc write per row.
    seen_so_far = completed.groupby(
        [df[key] for key in group_by]).cumsum()
    df[as_column] = (seen_so_far - completed).astype(pd.Int64Dtype())

    return df
def fix_colnames(df):
    '''
    Standardise column names and typing.

    Column names are case-folded and stripped of question marks, phone
    numbers become text with a leading zero, count columns become nullable
    integers and the yes/no columns become booleans.  A new frame is
    returned.
    '''

    def _normalise_phone(raw):
        # Drop spaces and any ".0" left by float parsing, then prefix a zero.
        compact = raw.replace(' ', '')
        return f'0{compact.replace(".0", "")}'

    def _is_yes(answer):
        return answer.casefold() == 'yes'

    renamed = {}
    for colname in df.columns:
        renamed[colname] = colname.casefold().replace('?', '').replace(' ', ' ')
    df = df.rename(columns=renamed)

    # This is not the most elegant solution but workable for now
    df['phone number'] = df['phone number'].astype(str).apply(_normalise_phone)

    count_columns = (
        'age', 'number of adults', 'how many of these adults are over 70',
        'number of children', 'how many of these children are over 12',
    )
    for col in count_columns:
        df[col] = df[col].astype(pd.Int64Dtype())

    yes_no_columns = (
        'vegetarian, halal or kosher', 'do they have a cooker',
        'do they have a hob', 'do they have a kettle',
        'do they have a microwave',
    )
    for col in yes_no_columns:
        # boolean conversion
        df[col] = df[col].apply(_is_yes)

    return df
def convert_data(df, column, type):
    """Best-effort conversion of ``df[column]`` to ``type``.

    Args:
        df: frame holding the column
        column: name of the column to convert
        type: ``int``, ``float`` or ``bool``; any other value leaves the
            frame untouched

    Returns:
        The frame with the converted column.  ``int`` gives a nullable Int64
        column (missing values are kept) and falls back to ``float`` when a
        value cannot be cast; ``bool`` maps the values through
        ``BOOLEAN_VALUES``.  A conversion that fails leaves the column as is.
    """
    if type == int:
        try:
            casted_vals = [
                int(val) if pd.notna(val) else val for val in df[column]
            ]
            # Build the new column on the frame's own index; a default
            # 0..n-1 index would be re-aligned on assignment and turn the
            # values into NA for any frame with a different index.
            df[column] = pd.Series(casted_vals,
                                   index=df.index,
                                   dtype=pd.Int64Dtype())
        except Exception:  # deliberate best effort: fall back to float
            try:
                df = df.astype({column: float})
            except Exception:  # not numeric at all: keep the column as is
                pass
    elif type == float:
        try:
            df = df.astype({column: float})
        except Exception:  # deliberate best effort: keep the column as is
            pass
    elif type == bool:
        df[column] = df[column].map(BOOLEAN_VALUES)

    return df
def test_issue502(summarizer, typeset):
    """describe_1d reports min and max for a nullable Int64 series."""
    one_to_eleven = pd.Series(range(1, 12), dtype=pd.Int64Dtype())
    summary = describe_1d(one_to_eleven, summarizer, typeset)
    assert summary["min"] == 1
    assert summary["max"] == 11