def check(a, b, expected):
    expected_val = expected
    not_expected_val = not expected
    if numpy_support.version >= (1, 16):
        # since np 1.16 all NaT magnitude comparisons including equality
        # are False (as NaT == NaT is now False)
        if np.isnat(a) or np.isnat(b):
            expected_val = False
            not_expected_val = False
    self.assertPreciseEqual(le(a, b), expected_val)
    self.assertPreciseEqual(gt(a, b), not_expected_val)
def check(a, b, expected):
    expected_val = expected
    not_expected_val = not expected
    if numpy_support.version >= (1, 16):
        # since np 1.16 all NaT == comparisons are False, including
        # NaT == NaT; conversely != is True
        if np.isnat(a) or np.isnat(b):
            expected_val = False
            not_expected_val = True
    self.assertPreciseEqual(eq(a, b), expected_val)
    self.assertPreciseEqual(eq(b, a), expected_val)
    self.assertPreciseEqual(ne(a, b), not_expected_val)
    self.assertPreciseEqual(ne(b, a), not_expected_val)
def convert_value(value, value_type, tz=None):
    if value_type == "time":
        if np.isnat(value):
            return str(Table.NAT_VALUE)
        return DateType(value, tz)
    else:
        return value
def check_lt(a, b, expected):
    expected_val = expected
    not_expected_val = not expected
    if numpy_support.version >= (1, 16):
        # since np 1.16 all NaT magnitude comparisons including equality
        # are False (as NaT == NaT is now False)
        if np.isnat(a) or np.isnat(b):
            expected_val = False
            not_expected_val = False
    with self.silence_numpy_warnings():
        lt = self.jit(lt_usecase)
        self.assertPreciseEqual(lt(a, b), expected_val, (a, b, expected))
        self.assertPreciseEqual(gt(b, a), expected_val, (a, b, expected))
        self.assertPreciseEqual(ge(a, b), not_expected_val, (a, b, expected))
        self.assertPreciseEqual(le(b, a), not_expected_val, (a, b, expected))
        if expected_val:
            # If true, then values are not equal
            check_eq(a, b, False)
        # Did we get it right?
        self.assertPreciseEqual(a < b, expected_val)
def check_eq(a, b, expected):
    expected_val = expected
    not_expected_val = not expected
    if numpy_support.version >= (1, 16):
        # since np 1.16 all NaT comparisons bar != are False, including
        # NaT == NaT
        if np.isnat(a) or np.isnat(b):
            expected_val = False
            not_expected_val = True
            self.assertFalse(le(a, b), (a, b))
            self.assertFalse(ge(a, b), (a, b))
            self.assertFalse(le(b, a), (a, b))
            self.assertFalse(ge(b, a), (a, b))
            self.assertFalse(lt(a, b), (a, b))
            self.assertFalse(gt(a, b), (a, b))
            self.assertFalse(lt(b, a), (a, b))
            self.assertFalse(gt(b, a), (a, b))
    with self.silence_numpy_warnings():
        self.assertPreciseEqual(eq(a, b), expected_val, (a, b, expected))
        self.assertPreciseEqual(eq(b, a), expected_val, (a, b, expected))
        self.assertPreciseEqual(ne(a, b), not_expected_val, (a, b, expected))
        self.assertPreciseEqual(ne(b, a), not_expected_val, (a, b, expected))
        if expected_val:
            # If equal, then equal-ordered comparisons are true
            self.assertTrue(le(a, b), (a, b))
            self.assertTrue(ge(a, b), (a, b))
            self.assertTrue(le(b, a), (a, b))
            self.assertTrue(ge(b, a), (a, b))
            # and strictly ordered comparisons are false
            self.assertFalse(lt(a, b), (a, b))
            self.assertFalse(gt(a, b), (a, b))
            self.assertFalse(lt(b, a), (a, b))
            self.assertFalse(gt(b, a), (a, b))
        # Did we get it right?
        self.assertPreciseEqual(a == b, expected_val)
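# Standalone illustration (not part of the test class above) of the NaT
# comparison semantics these helpers encode: on NumPy >= 1.16 every ordered
# comparison involving NaT is False, NaT == NaT is False, and NaT != NaT is
# True, so np.isnat() is the only reliable NaT check.
import numpy as np

nat = np.datetime64("NaT", "D")
ts = np.datetime64("2020-01-01", "D")

assert not (nat == nat) and (nat != nat)
assert not (nat < ts) and not (nat > ts)
assert not (nat <= ts) and not (nat >= ts)
assert np.isnat(nat) and not np.isnat(ts)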
def test_time_to_time(self, from_dt, to_dt, expected_casting,
                      expected_view_off, nom, denom):
    from_dt = np.dtype(from_dt)
    if to_dt is not None:
        to_dt = np.dtype(to_dt)

    # Test a few values for casting (results generated with NumPy 1.19)
    values = np.array([-2**63, 1, 2**63 - 1, 10000, -10000, 2**32])
    values = values.astype(np.dtype("int64").newbyteorder(from_dt.byteorder))
    assert values.dtype.byteorder == from_dt.byteorder
    assert np.isnat(values.view(from_dt)[0])

    DType = type(from_dt)
    cast = get_castingimpl(DType, DType)
    casting, (from_res, to_res), view_off = cast._resolve_descriptors(
        (from_dt, to_dt))
    assert from_res is from_dt
    assert to_res is to_dt or to_dt is None
    assert casting == expected_casting
    assert view_off == expected_view_off

    if nom is not None:
        expected_out = (values * nom // denom).view(to_res)
        expected_out[0] = "NaT"
    else:
        expected_out = np.empty_like(values)
        expected_out[...] = denom
        expected_out = expected_out.view(to_dt)

    orig_arr = values.view(from_dt)
    orig_out = np.empty_like(expected_out)

    if casting == Casting.unsafe and (to_dt == "m8" or to_dt == "M8"):
        # Casting from non-generic to generic units is an error and should
        # probably be reported as an invalid cast earlier.
        with pytest.raises(ValueError):
            cast._simple_strided_call((orig_arr, orig_out))
        return

    # Exercise aligned/unaligned and contiguous/strided data variations
    for aligned in [True, False]:
        for contig in [True, False]:
            arr, out = self.get_data_variation(orig_arr, orig_out,
                                               aligned, contig)
            out[...] = 0
            cast._simple_strided_call((arr, out))
            assert_array_equal(out.view("int64"), expected_out.view("int64"))
def test_nanmean_skipna_false(self, dtype):
    arr = np.arange(12).astype(np.int64).view(dtype).reshape(4, 3)
    arr[-1, -1] = "NaT"

    result = nanops.nanmean(arr, skipna=False)
    assert np.isnat(result)
    assert result.dtype == dtype

    result = nanops.nanmean(arr, axis=0, skipna=False)
    expected = np.array([4, 5, "NaT"], dtype=arr.dtype)
    tm.assert_numpy_array_equal(result, expected)

    result = nanops.nanmean(arr, axis=1, skipna=False)
    expected = np.array([arr[0, 1], arr[1, 1], arr[2, 1], arr[-1, -1]])
    tm.assert_numpy_array_equal(result, expected)
def compute(self, today, assets, out, values, dates, is_cum):
    filled = np.where(np.isnat(dates), np.datetime64('1970-03-31'), dates)
    # Compute, per stock, the location changes and quarterly multipliers
    locs = map(self._locs_and_quarterly_multiplier, filled.T)
    if is_cum:
        # Weighted average using the quarterly factors
        ttm = np.array([
            nanmean(value[loc] * ms)
            for value, (loc, ms) in zip(values.T, locs)
        ]).T
    else:
        # Non-cumulative raw data only needs a simple sum; no quarterly
        # factor adjustment is required
        ttm = np.array([
            nansum(value[loc])
            for value, (loc, _) in zip(values.T, locs)
        ]).T
    out[:] = ttm
def PandasData(request):
    if request.method == 'POST':
        form = PandasdataForm(request.POST)
        if form.is_valid():
            username = form.cleaned_data['username']
            df = read_frame(AuthUser.objects.all(), fieldnames=[
                'username', 'login_num', 'is_pay', 'mobilephone',
                'date_joined', 'last_login'
            ])
            try:
                AuthUser.objects.get(username=username)
            except AuthUser.DoesNotExist:
                return render(request, 'pandas_data.html',
                              {'messageerror': 'The username does not exist'})
            df_select = df[df.username == username]
            username = df_select.username.values[0]
            login_num = df_select.login_num.values[0]
            register_time = df_select.date_joined.values[0]
            last_login = df_select.last_login.values[0]
            is_login = not np.isnat(last_login)
            register_time = str(np.datetime_as_string(register_time)).split("T")[0] + " " + \
                str(np.datetime_as_string(register_time)).split("T")[1].split(".")[0]
            if is_login and last_login > np.datetime64('today'):
                is_today = True
            else:
                is_today = False
            if is_login:
                last_login = str(np.datetime_as_string(last_login)).split("T")[0] + " " + \
                    str(np.datetime_as_string(last_login)).split("T")[1].split(".")[0]
            message = {
                'username': username,
                'login_num': login_num,
                'register_time': register_time,
                'last_login': last_login,
                'is_login': is_login,
                'is_today': is_today
            }
            return render(request, 'pandas_data.html', {'message': message})
def get_mean_acq_time(scene):
    """Compute mean scanline acquisition time over all bands."""
    dtype = scene['IR_108'].coords['acq_time'].dtype

    # Convert timestamps to float to facilitate averaging. Caveat: NaT is
    # not converted to NaN, but to -9.22E18. So we have to set these
    # elements to NaN manually
    acq_times = []
    for band in BANDNAMES:
        acq_time = scene[band].coords['acq_time'].drop_vars(['acq_time'])
        is_nat = np.isnat(acq_time.values)
        acq_time = acq_time.astype(int).where(np.logical_not(is_nat))
        acq_times.append(acq_time)

    # Compute average over all bands (skip NaNs)
    acq_times = xr.concat(acq_times, 'bands')
    return acq_times.mean(dim='bands', skipna=True).astype(dtype)
def active_thermal_capacity(db_plant, year, dict_country, dict_id):
    active_plant = db_plant.loc[
        (db_plant['UnitOperOnlineDate'] < pd.Timestamp(year, 1, 1)) &
        (db_plant['UnitOperRetireDate'] > pd.Timestamp(year, 12, 31)) |
        np.isnat(db_plant['UnitOperRetireDate'])]
    active_plant = active_plant.loc[(active_plant['MedeaType'] < 60) |
                                    (active_plant['MedeaType'] >= 70)]
    aggregate_thermal_capacity = active_plant.groupby(
        ['MedeaType', 'PlantCountry'])['UnitNameplate'].sum().to_frame() / 1000
    if dict_country:
        aggregate_thermal_capacity.rename(index=dict_country,
                                          columns={'UnitNameplate': 'cap'},
                                          inplace=True)
    aggregate_thermal_capacity = aggregate_thermal_capacity.unstack(-1)
    aggregate_thermal_capacity.drop(0.0, axis=0, inplace=True)
    if dict_id:
        aggregate_thermal_capacity.rename(index=dict_id, inplace=True)
    return aggregate_thermal_capacity
def get_timedelta_str(arr: ndarray):
    max_val = np.abs(arr[~np.isnat(arr)].view('int64')).max()
    if max_val < 10**3:
        unit = 'ns'
    elif max_val < 10**6:
        unit = 'us'
    elif max_val < 10**9:
        unit = 'ms'
    elif max_val < 60 * 10**9:
        unit = 's'
    elif max_val < 3600 * 10**9:
        unit = 'm'
    elif max_val < 3600 * 24 * 10**9:
        unit = 'h'
    else:
        unit = 'D'
    return unit
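# Hypothetical usage sketch for the unit-picking helper above; it assumes
# get_timedelta_str is in scope and the input is a timedelta64[ns] ndarray
# (NaT entries are filtered out before the magnitude thresholds are applied).
import numpy as np

deltas = np.array([np.timedelta64(90, "ns"),
                   np.timedelta64(90, "m"),
                   np.timedelta64("NaT")],
                  dtype="timedelta64[ns]")
print(get_timedelta_str(deltas))  # 'h' -- largest non-NaT value is 90 minutes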
def visitToDatetime(date, arrival, depart):
    """
    Converts arrival and departure times from the TAWO visit log into a start
    and end datetime

    :return: datetime column with before, after tuple

    Warning: The fact that this function works is honestly beyond me. Only
    Jesus can understand it at this point. It is an amalgamation of
    frustration and defeat with Python datetimes. Good luck.
    """
    import calendar
    import datetime as dt

    # replace NaT values with an interpolated date
    mask = np.isnat(date)                        # boolean array of where NaT values are
    idx = np.flatnonzero(mask)                   # indexes of NaT
    nidx = np.flatnonzero(~mask)                 # other indexes
    date[mask] = pd.Timestamp('1980-01-01')      # replace NaT with a placeholder 1980 date
    date = [pd.Timestamp(x) for x in date]       # convert n64dt to timestamp
    date = [calendar.timegm(x.timetuple()) + 86400 for x in date]  # convert timestamp to unix time value
    date = np.array(date)                        # convert back to numpy array
    nonnats = date[~mask]                        # get actual non-NaT values
    date[mask] = np.interp(idx, nidx, nonnats) - 86400  # interpolate unix date values
    date = [dt.datetime.fromtimestamp(x) for x in date]  # convert unix timestamp to datetime

    s, d = [], []
    for i in range(len(arrival)):
        # collect the base date, dropping the hour
        base = dt.datetime(date[i].year,
                           date[i].month,
                           date[i].day)

        # get start hour information
        shour = int(str(arrival[i])[:2])
        sminute = int(str(arrival[i])[2:])
        start = base + pd.Timedelta(f'{shour} hours') + pd.Timedelta(f'{sminute} minutes')

        # departing hour information
        dhour = int(str(depart[i])[:2])
        dminute = int(str(depart[i])[2:])
        end = base + pd.Timedelta(f'{dhour} hours') + pd.Timedelta(f'{dminute} minutes')

        # append the start and end datetimes
        s.append(start)
        d.append(end)
    return s, d
def post(self, request):
    username = request.POST.get('username')
    df = read_frame(Client.objects.all(), fieldnames=[
        'username', 'login_num', 'is_pay', 'telephone',
        'register_time', 'last_login'
    ])
    try:
        Client.objects.get(username=username)
    except Client.DoesNotExist:
        return pandas_request_render(request, error="User does not exist")
    df_select = df[df.username == username]
    username = df_select.username.values[0]
    login_num = df_select.login_num.values[0]
    register_time = df_select.register_time.values[0]
    last_login = df_select.last_login.values[0]
    have_login = not np.isnat(last_login)
    register_time = str(np.datetime_as_string(register_time)).split("T")[0] + " " + \
        str(np.datetime_as_string(register_time)).split("T")[1].split(".")[0]
    if have_login and last_login > np.datetime64('today'):
        is_today = True
    else:
        is_today = False
    if have_login:
        last_login = str(np.datetime_as_string(last_login)).split("T")[0] + " " + \
            str(np.datetime_as_string(last_login)).split("T")[1].split(".")[0]
    res = {
        'username': username,
        'login_num': login_num,
        'register_time': register_time,
        'last_login': last_login,
        'have_login': have_login,
        'is_today': is_today
    }
    return pandas_request_render(request, res=res)
def test_european_option_on_zcb(self) -> None:
    model = _make_model()
    notional = 1234
    currency = "USD"
    strike = 1000
    zcb = ZeroCouponBond(model.dategrid[-2], notional, currency)
    opt = EuropeanOption(model.dategrid[-2], zcb - strike * One(currency))
    cf = model.generate_cashflows(opt)
    self.assertEqual(cf.currencies.shape, (3,))
    self.assertEqual(cf.currencies[0], currency)
    self.assertEqual(cf.currencies[1], currency)
    self.assertEqual(cf.currencies[2], "NNN")
    self.assertEqual(cf.cashflows.shape, (model.nsim, 3))
    self.assertTrue((cf.cashflows["date"][:, 0] == model.dategrid[-2]).all())
    self.assertTrue((cf.cashflows["value"][:, 0] == notional).all())
    self.assertTrue((cf.cashflows["date"][:, 1] == model.dategrid[-2]).all())
    self.assertTrue((cf.cashflows["value"][:, 1] == -strike).all())
    self.assertTrue((np.isnat(cf.cashflows["date"][:, 2])).all())
    self.assertTrue((cf.cashflows["value"][:, 2] == 0).all())
def test_or_cashflow_generation(self) -> None:
    model = _make_model()
    c2 = One("EUR") | When(At(model.dategrid[-1]), One("EUR"))
    self.assertRaises(NotImplementedError, lambda: model.generate_cashflows(c2))
    c3 = One("EUR") | 2 * One("EUR")
    cf = model.generate_cashflows(c3)
    self.assertEqual(cf.currencies.shape, (2,))
    self.assertEqual(cf.currencies[0], "EUR")
    self.assertEqual(cf.currencies[1], "EUR")
    self.assertEqual(cf.cashflows.shape, (model.nsim, 2))
    self.assertTrue((cf.cashflows["value"][:, 0] == 0).all())
    self.assertTrue((np.isnat(cf.cashflows["date"][:, 0])).all())
    self.assertTrue((cf.cashflows["value"][:, 1] == 2).all())
    self.assertTrue((cf.cashflows["date"][:, 1] == model.eval_date).all())
    c4 = One("EUR") | One("USD")
    cf4 = model.generate_cashflows(c4)
    self.assertEqual(cf4.currencies.shape, (2,))
    self.assertEqual(cf4.currencies[0], "EUR")
    self.assertEqual(cf4.currencies[1], "USD")
def scalar_broadcast_to(scalar, size, dtype=None):
    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None or (
        isinstance(scalar, (np.datetime64, np.timedelta64))
        and np.isnat(scalar)
    ):
        if dtype is None:
            dtype = "object"
        return column.column_empty(size, dtype=dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        if dtype is None:
            return _categorical_scalar_broadcast_to(scalar, size)
        else:
            return scalar_broadcast_to(scalar.categories[0], size).astype(dtype)

    if isinstance(scalar, decimal.Decimal):
        if dtype is None:
            dtype = cudf.Decimal64Dtype._from_decimal(scalar)
        out_col = column.column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col[:] = scalar
        return out_col

    scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
    dtype = scalar.dtype

    if np.dtype(dtype).kind in ("O", "U"):
        gather_map = column.full(size, 0, dtype="int32")
        scalar_str_col = column.as_column([scalar], dtype="str")
        return scalar_str_col[gather_map]
    else:
        out_col = column.column_empty(size, dtype=dtype)
        if out_col.size != 0:
            out_col.data_array_view[:] = scalar
        return out_col
def tidy_cases_lambda(interim_data, remove_territories=True):
    # Remove non-existent notification dates
    interim_data = interim_data[~np.isnat(interim_data.NOTIFICATION_RECEIVE_DATE)]

    # Filter out territories
    if remove_territories:
        df_linel = interim_data[(interim_data['STATE'] != 'NT') &
                                (interim_data['STATE'] != 'ACT')]
    else:
        df_linel = interim_data

    # Melt down so that imported and local are no longer columns. Allows
    # multiple draws for infection date, i.e. create linelist data
    df_linel = df_linel.melt(id_vars=['NOTIFICATION_RECEIVE_DATE', 'STATE'],
                             var_name='SOURCE', value_name='n_cases')

    # Reset index or the joining doesn't work
    df_linel = df_linel[df_linel.n_cases != 0]
    df_linel = df_linel.reset_index(drop=True)
    return df_linel
def _add_timedelta_arraylike(
    self, other: TimedeltaArray | npt.NDArray[np.timedelta64]
) -> PeriodArray:
    """
    Parameters
    ----------
    other : TimedeltaArray or ndarray[timedelta64]

    Returns
    -------
    PeriodArray
    """
    freq = self.freq
    if not isinstance(freq, Tick):
        # We cannot add timedelta-like to non-tick PeriodArray
        raise TypeError(
            f"Cannot add or subtract timedelta64[ns] dtype from {self.dtype}"
        )

    dtype = np.dtype(f"m8[{freq._td64_unit}]")

    try:
        delta = astype_overflowsafe(
            np.asarray(other), dtype=dtype, copy=False, round_ok=False
        )
    except ValueError as err:
        # e.g. if we have minutes freq and try to add 30s
        # "Cannot losslessly convert units"
        raise IncompatibleFrequency(
            "Cannot add/subtract timedelta-like from PeriodArray that is "
            "not an integer multiple of the PeriodArray's freq."
        ) from err

    b_mask = np.isnat(delta)

    res_values = algos.checked_add_with_arr(
        self.asi8, delta.view("i8"), arr_mask=self._isnan, b_mask=b_mask
    )
    np.putmask(res_values, self._isnan | b_mask, iNaT)
    return type(self)(res_values, freq=self.freq)
def _calc_global_values(self, array):
    """Calculate all the values of the Transform that are dependent on all
    the examples of the dataset. (e.g. mean, standard deviation, unique
    category values, etc.) This method must be run before any actual
    transformation can be done.

    Parameters
    ----------
    array : np.ndarray
        Some of the data that will be transformed.

    """
    self.input_dtype = array.dtype
    if len(array.shape) < 2 or array.shape[1] != 2:
        raise ValueError(
            "Array must have exactly two columns. The first being the time, "
            "and the second being the amplitude."
        )

    batch_size = float(array.shape[0])
    total_examples = self.num_examples + batch_size
    if not batch_size:
        return

    time_array = array[:, 0].astype(np.datetime64)
    time_array[np.isnat(time_array)] = self.fill_nat_func(time_array)

    # Get the maximum time and convert it to dtype
    end_datetime = (self.end_datetime - self.zero_datetime) / np.timedelta64(
        self.num_units, self.time_unit)
    end_datetime = end_datetime.astype(self.dtype)

    # Convert to dtype and scale to values between 0 and 1
    time_array = (time_array - self.zero_datetime) / np.timedelta64(
        self.num_units, self.time_unit)
    time_array = time_array.astype(self.dtype)
    time_array = time_array / end_datetime

    amp_array = array[:, 1:2].astype(np.float64)
    amp_array[np.isnan(amp_array)] = self.fill_nan_func(amp_array)
    amp_array = np.tile(amp_array, [1, self.num_frequencies])

    exp = np.exp(-2.0j * np.pi * np.tensordot(time_array, self.w_k, [[], []]))
    self.X_k = (batch_size / total_examples * np.mean(amp_array * exp, axis=0)
                + self.num_examples / total_examples * self.X_k)
    self.num_examples += batch_size
def subset_for_oco2(observations_path, run_directory_species_conc,
                    run_directory_level_edge, output_path, attributes,
                    is_subset_hourly):
    logger.debug('Reading observation file %s', observations_path)
    observation_ds = xarray.open_dataset(
        observations_path,
        decode_times=False
    ) \
        .load() \
        .sortby('time')

    # HACK(mgnb): hardcoding the unit rather than reading it from the file
    assert (observation_ds['time'].attrs['units']
            == 'seconds since 1970-01-01 00:00:00')
    observation_ds['time'] = ('sounding_id', (
        np.datetime64('1970-01-01 00:00:00')
        + observation_ds['time'].values.astype('timedelta64[s]')))

    valid_times = np.argwhere(
        np.logical_not(np.isnat(observation_ds['time'].values)))
    observation_ds = observation_ds \
        .isel(sounding_id=valid_times[:, 0])

    matched_ds = GEOSChemSubsetter(
        observation_ds,
        run_directory_species_conc,
        run_directory_level_edge,
        is_subset_hourly
    ).match_observations()

    if matched_ds is None:
        logger.debug('Nothing to subset')
        return

    matched_ds.attrs = attributes

    logger.debug('Saving results to %s', output_path)
    compression = dict(zlib=True, complevel=6)
    encoding = {var: compression for var in matched_ds.data_vars}
    matched_ds.to_netcdf(output_path, encoding=encoding)
    logger.debug('Done')
def set_jds(self, val1, val2):
    # If there are any masked values in the ``val1`` datetime64 array
    # ('NaT') then stub them with a valid date so downstream parse_string
    # will work. The value under the mask is arbitrary but a "modern" date
    # is good.
    mask = np.isnat(val1)
    masked = np.any(mask)
    if masked:
        val1 = val1.copy()
        val1[mask] = '2000'

    # Make sure M(onth) and Y(ear) dates will parse and convert to bytestring
    if val1.dtype.name in ['datetime64[M]', 'datetime64[Y]']:
        val1 = val1.astype('datetime64[D]')
    val1 = val1.astype('S')

    # Standard ISO string parsing now
    super().set_jds(val1, val2)

    # Finally apply mask if necessary
    if masked:
        self.jd2[mask] = np.nan
def get_solar_angles(scene, lons, lats):
    """Compute solar angles.

    Compute angles for each scanline using their acquisition time to account
    for the earth's rotation over the course of one scan.

    Returns:
        Solar azimuth angle, Solar zenith angle in degrees
    """
    suna = np.full(lons.shape, np.nan)
    sunz = np.full(lons.shape, np.nan)
    mean_acq_time = get_mean_acq_time(scene)
    for line, acq_time in enumerate(mean_acq_time.values):
        if np.isnat(acq_time):
            continue
        _, suna_line = get_alt_az(acq_time, lons[line, :], lats[line, :])
        suna_line = np.rad2deg(suna_line)
        suna[line, :] = suna_line
        sunz[line, :] = sun_zenith_angle(acq_time, lons[line, :], lats[line, :])
    return suna, sunz
def values_list(self):
    all_values = []
    for idx, val in self.write_values.iterrows():
        values = collections.OrderedDict()
        for k, v in val.iteritems():
            if isinstance(v, dict) or isinstance(v, list):
                v = json.dumps(v)
            if isinstance(v, np.bool_):
                v = bool(v)
            if isinstance(v, np.datetime64) and np.isnat(v):
                v = None
            if isinstance(v, pd._libs.tslib.NaTType):
                v = None
            if isinstance(v, float) and np.isnan(v):
                v = None
            values['%s_%s' % (k, idx)] = v
        all_values += [values]
    return all_values
def get_last_played_match_vector(
        season_matches_dfs: typing.List[pd.DataFrame]) -> pd.Series:
    """
    :param season_matches_dfs:
    :return:
    """
    df_concat = pd.concat(season_matches_dfs)
    sorted_df = df_concat.sort_values(by=['TeamName', 'MatchDate'], ascending=True)
    date_diff_df = sorted_df.groupby('TeamName').MatchDate.diff()

    # check we're seeing as many null values as expected (should equal the
    # number of unique teams in the dataset)
    unique_teams = df_concat.TeamName.nunique()
    null_vals = date_diff_df[np.isnat(date_diff_df.values)].shape[0]
    assert unique_teams == null_vals

    date_diff_df = date_diff_df.fillna(0)
    return date_diff_df.dt.total_seconds()
def evaluate(self, array_data):
    """
    Calculate range value of the array and store it to history

    Parameters:
    """
    result_object = ResultObject(None, None, None, CommandStatus.Error)
    array = array_data.data

    if numpy.issubdtype(array.dtype, numpy.number):
        idx = numpy.logical_not(numpy.isnan(array))
    elif numpy.issubdtype(array.dtype, numpy.datetime64):
        idx = numpy.logical_not(numpy.isnat(array))
    else:
        Printer.Print("The array is not a supported type so cannot find the range")
        return result_object

    if (StatContainer.conditional_array is not None and
            StatContainer.conditional_array.data.size == array.size):
        idx = numpy.logical_and(idx, StatContainer.conditional_array.data)

    max_val = numpy.max(array[idx])
    min_val = numpy.min(array[idx])
    range_val = max_val - min_val

    result_object = ResultObject(range_val, [], DataType.array,
                                 CommandStatus.Success)
    result_object.createName(array_data.keyword_list,
                             command_name=self.commandTags()[0],
                             set_keyword_list=True)

    df_new = pd.DataFrame()
    df_new['Feature'] = [array_data.name]
    df_new['Range'] = [range_val]
    df_new['Minimum'] = [min_val]
    df_new['Maximum'] = [max_val]
    TablePrinter.printDataFrame(df_new)
    # Printer.Print("Range of", array_data.name, "is", range_val,
    #               "from", min_val, "to", max_val)
    return result_object
def addBaselineCreat(self, df, eGFR_impute=None):
    '''
    Adds the baseline creatinine to a dataframe. The baseline creatinine is
    defined as the median of the outpatient creatinine values from 365 to 7
    days prior to admission.

    Args:
        df (pd.DataFrame): dataframe, typically of a single patient.
        eGFR_impute (bool): whether or not to impute the null baseline
            creatinines using age/sex/race and an eGFR of 75

    Returns:
        df (pd.DataFrame): dataframe with baseline creatinine values added in
    '''
    if eGFR_impute is None:
        eGFR_impute = self.eGFR_impute
    self.eGFR_impute = eGFR_impute

    split_dfs = list()
    unique_adms = df[self.admission].unique()
    for adm in unique_adms[~np.isnat(unique_adms)]:
        adm_df = df.loc[df[self.admission] == adm]
        adm_df.loc[:, self.baseline_creat] = adm_df[
            ~adm_df[self.inpatient]].set_index(self.time).loc[
                adm - pd.Timedelta(days=365):adm - pd.Timedelta(days=7),
                self.creatinine].median()
        split_dfs.append(adm_df)

    df = pd.concat(split_dfs)

    if self.eGFR_impute:
        df.loc[df[self.baseline_creat].isnull(), self.baseline_creat] = df[
            df[self.baseline_creat].isnull()].apply(
                lambda d: self.eGFRbasedCreatImputation(
                    d[self.age], d[self.sex], d[self.race]),
                axis=1)

    return df
def convert_time_to_most_suitable_unit(arr):
    from pandas import Series
    from numpy import array, isnat, diff, NaN, nanmedian

    # test if dates
    arr = array(arr)
    if isnat(arr).any():
        return arr

    # convert to datetime[ns] floats
    time = array(arr).astype("datetime64[ns]").astype(float)
    # get the difference between time steps
    delta_time = diff(time)

    # approximate the best unit (without losing info)
    time_denominators = dict(ns=1, s=1e9, m=60, h=60, D=24, M=30, Y=12)
    dt_as_frac_of_unit = Series(index=time_denominators.keys())
    denominator = 1
    for key in time_denominators:
        denominator *= time_denominators[key]
        frac = nanmedian(delta_time / denominator)
        # only units that will not lose time are kept
        dt_as_frac_of_unit[key] = frac if frac >= 1 else NaN

    # if the difference is not near enough the unit, exclude it
    # e.g. a 35 day interval will eliminate Month as a unit
    if not ((dt_as_frac_of_unit - 1) < 0.05).any():
        dt_as_frac_of_unit = dt_as_frac_of_unit.where(lambda a: (a - 1) >= 1)
    unit = dt_as_frac_of_unit.idxmin()

    # convert time units to appropriate units
    # dtype: datetime64 must be attached to unit
    # must be float when astype(float) is applied
    time_converted = arr.astype(f"datetime64[{unit}]")
    return time_converted
def get_datetime_str(arr: ndarray):
    dt = {0: 'ns', 1: 'us', 2: 'ms', 3: 's', 4: 'D'}
    arr = arr[~np.isnat(arr)].view('int64')
    counts = np.zeros(len(arr), dtype='int64')
    for i, val in enumerate(arr):
        if val == 0:
            counts[i] = 4
            continue
        dec = decimal.Decimal(int(val)).as_tuple()
        ct = 0
        for digit in dec.digits[::-1]:
            if digit == 0:
                ct += 1
            else:
                break
        if ct >= 11:
            counts[i] = 4
        else:
            counts[i] = ct // 3
    return dt[counts.min()]
def normalize_binop_value(self, other):
    if isinstance(other, dt.timedelta):
        other = np.timedelta64(other)
    elif isinstance(other, pd.Timestamp):
        other = other.to_datetime64()
    elif isinstance(other, pd.Timedelta):
        other = other.to_timedelta64()

    if isinstance(other, np.timedelta64):
        other_time_unit = cudf.utils.dtypes.get_time_unit(other)
        if np.isnat(other):
            return as_scalar(val=None, dtype=self.dtype)

        if other_time_unit not in ("s", "ms", "ns", "us"):
            other = other.astype("timedelta64[s]")
        else:
            common_dtype = determine_out_dtype(self.dtype, other.dtype)
            other = other.astype(common_dtype)
        return as_scalar(other)
    elif np.isscalar(other):
        return as_scalar(other)
    else:
        raise TypeError("cannot normalize {}".format(type(other)))
def make_null_mask(array):
    """Given a numpy array, return a list of the indices of `array` where
    the value is either invalid or null.

    Invalid values are:
        - None
        - numpy.nat
        - numpy.nan

    Args:
        array (:obj:`numpy.array`)
    """
    mask = []
    is_object_or_string_dtype = np.issubdtype(array.dtype, np.str_) or \
        np.issubdtype(array.dtype, np.object_)
    if six.PY2:
        is_object_or_string_dtype = is_object_or_string_dtype or \
            np.issubdtype(array.dtype, np.unicode_)
    is_datetime_dtype = np.issubdtype(array.dtype, np.datetime64) or \
        np.issubdtype(array.dtype, np.timedelta64)

    for i, item in enumerate(array):
        invalid = item is None
        if not is_object_or_string_dtype:
            if is_datetime_dtype:
                invalid = invalid or np.isnat(item)
            else:
                invalid = invalid or np.isnan(item)
        if invalid:
            mask.append(i)
    return mask
def isnull(data):
    data = asarray(data)
    scalar_type = data.dtype.type
    if issubclass(scalar_type, (np.datetime64, np.timedelta64)):
        # datetime types use NaT for null
        # note: must check timedelta64 before integers, because currently
        # timedelta64 inherits from np.integer
        return isnat(data)
    elif issubclass(scalar_type, np.inexact):
        # float types use NaN for null
        return isnan(data)
    elif issubclass(scalar_type, (np.bool_, np.integer, np.character, np.void)):
        # these types cannot represent missing values
        return zeros_like(data, dtype=bool)
    else:
        # at this point, array should have dtype=object
        if isinstance(data, (np.ndarray, dask_array_type)):
            return pandas_isnull(data)
        else:
            # Not reachable yet, but intended for use with other duck array
            # types. For full consistency with pandas, we should accept None
            # as a null value as well as NaN, but it isn't clear how to do
            # this with duck typing.
            return data != data
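# A small standalone NumPy sketch (outside xarray) of the dtype-based dispatch
# that isnull() above relies on: datetimes/timedeltas mark missing values with
# NaT, floats with NaN, and bool/integer arrays have no missing-value marker.
import numpy as np

dates = np.array(["2021-01-01", "NaT"], dtype="datetime64[D]")
floats = np.array([1.0, np.nan])
ints = np.array([1, 2])

print(np.isnat(dates))                   # [False  True]
print(np.isnan(floats))                  # [False  True]
print(np.zeros_like(ints, dtype=bool))   # [False False] -- ints cannot hold NaN/NaT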
def fillna(self, fill_value):
    if is_scalar(fill_value):
        if not isinstance(fill_value, Scalar):
            fill_value = np.datetime64(fill_value, self.time_unit)
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)

    result = libcudf.replace.replace_nulls(self, fill_value)

    if isinstance(fill_value, np.datetime64) and np.isnat(fill_value):
        # If the value we are filling with is np.datetime64("NaT"),
        # keep the same mask as the current column. However, where the
        # column contains "<NA>", the corresponding locations in
        # base_data will contain min(int64) values.
        return column.build_column(
            data=result.base_data,
            dtype=result.dtype,
            mask=self.base_mask,
            size=result.size,
            offset=result.offset,
            children=result.base_children,
        )

    return result
def releasedate_dist_similarity(google_releasedate, apple_releasedate):
    if np.isnat(google_releasedate) or np.isnat(apple_releasedate):
        return 241.040  # the mean of differences that we calculated in the EDA
    else:
        return np.absolute(google_releasedate - apple_releasedate).astype(int)
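# Hypothetical usage of the fallback above, assuming both release dates are
# numpy datetime64[D] scalars; a NaT on either side returns the precomputed
# mean difference (241.040 days) instead of an actual distance.
import numpy as np

google_rd = np.datetime64("2020-06-01", "D")
apple_rd = np.datetime64("2020-03-15", "D")
print(releasedate_dist_similarity(google_rd, apple_rd))              # 78 (days apart)
print(releasedate_dist_similarity(google_rd, np.datetime64("NaT")))  # 241.040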
ventas_final = pd.DataFrame(sales_dict, columns=columns[:-3],
                            index=df_ventas.index.values)
descuentos = df_ventas.loc[:, ['Descuentos', 'Tarjeta D.']]
ventas_final = pd.merge(ventas_final, descuentos,
                        left_index=True, right_index=True)
ventas_final = ventas_final.iloc[1:]
ventas_final['Total Nuevo'] = np.sum(ventas_final, axis=1)
ventas_final = pd.concat([df_ventas, df_ventas_nuevo], axis=0)

# Group by date
ventas_agrupadas = ventas_final.groupby(ventas_final.index).sum().reset_index()
ventas_agrupadas.rename(columns={'Hora transacción': 'index'}, inplace=True)
ventas_agrupadas = ventas_agrupadas.groupby(ventas_agrupadas['index'].dt.date).sum()

# Compute total costs
costos_final = df_costos.reset_index()
dates = []
for v, d in enumerate(costos_final['Hora transacción'].values):
    if np.isnat(d):
        position = v
    else:
        d_new = pd.to_datetime(d)
        year = d_new.year
        month = d_new.month
        day = d_new.day
        dates.append((int(year), int(month), int(day)))

n_dates = []
for d in dates:
    year, month, day = d
    d_n = dt.datetime(year, month, day)
    n_dates.append(d_n)
def _assertPreciseEqual(self, first, second, prec='exact', ulps=1,
                        msg=None, ignore_sign_on_zero=False,
                        abs_tol=None):
    """Recursive workhorse for assertPreciseEqual()."""

    def _assertNumberEqual(first, second, delta=None):
        if (delta is None or first == second == 0.0
                or math.isinf(first) or math.isinf(second)):
            self.assertEqual(first, second, msg=msg)
            # For signed zeros
            if not ignore_sign_on_zero:
                try:
                    if math.copysign(1, first) != math.copysign(1, second):
                        self.fail(
                            self._formatMessage(msg,
                                                "%s != %s" % (first, second)))
                except TypeError:
                    pass
        else:
            self.assertAlmostEqual(first, second, delta=delta, msg=msg)

    first_family = self._detect_family(first)
    second_family = self._detect_family(second)

    assertion_message = "Type Family mismatch. (%s != %s)" % (first_family,
                                                              second_family)
    if msg:
        assertion_message += ': %s' % (msg,)
    self.assertEqual(first_family, second_family, msg=assertion_message)

    # We now know they are in the same comparison family
    compare_family = first_family

    # For recognized sequences, recurse
    if compare_family == "ndarray":
        dtype = self._fix_dtype(first.dtype)
        self.assertEqual(dtype, self._fix_dtype(second.dtype))
        self.assertEqual(first.ndim, second.ndim,
                         "different number of dimensions")
        self.assertEqual(first.shape, second.shape,
                         "different shapes")
        self.assertEqual(first.flags.writeable, second.flags.writeable,
                         "different mutability")
        # itemsize is already checked by the dtype test above
        self.assertEqual(self._fix_strides(first), self._fix_strides(second),
                         "different strides")
        if first.dtype != dtype:
            first = first.astype(dtype)
        if second.dtype != dtype:
            second = second.astype(dtype)
        for a, b in zip(first.flat, second.flat):
            self._assertPreciseEqual(a, b, prec, ulps, msg,
                                     ignore_sign_on_zero, abs_tol)
        return
    elif compare_family == "sequence":
        self.assertEqual(len(first), len(second), msg=msg)
        for a, b in zip(first, second):
            self._assertPreciseEqual(a, b, prec, ulps, msg,
                                     ignore_sign_on_zero, abs_tol)
        return
    elif compare_family == "exact":
        exact_comparison = True
    elif compare_family in ["complex", "approximate"]:
        exact_comparison = False
    elif compare_family == "enum":
        self.assertIs(first.__class__, second.__class__)
        self._assertPreciseEqual(first.value, second.value, prec, ulps, msg,
                                 ignore_sign_on_zero, abs_tol)
        return
    elif compare_family == "unknown":
        # Assume these are non-numeric types: we will fall back
        # on regular unittest comparison.
        self.assertIs(first.__class__, second.__class__)
        exact_comparison = True
    else:
        assert 0, "unexpected family"

    # If a Numpy scalar, check the dtype is exactly the same too
    # (required for datetime64 and timedelta64).
    if hasattr(first, 'dtype') and hasattr(second, 'dtype'):
        self.assertEqual(first.dtype, second.dtype)

    # Mixing bools and non-bools should always fail
    if (isinstance(first, self._bool_types) !=
            isinstance(second, self._bool_types)):
        assertion_message = ("Mismatching return types (%s vs. %s)"
                             % (first.__class__, second.__class__))
        if msg:
            assertion_message += ': %s' % (msg,)
        self.fail(assertion_message)

    try:
        if cmath.isnan(first) and cmath.isnan(second):
            # The NaNs will compare unequal, skip regular comparison
            return
    except TypeError:
        # Not floats.
        pass

    # if absolute comparison is set, use it
    if abs_tol is not None:
        if abs_tol == "eps":
            rtol = np.finfo(type(first)).eps
        elif isinstance(abs_tol, float):
            rtol = abs_tol
        else:
            raise ValueError("abs_tol is not \"eps\" or a float, found %s"
                             % abs_tol)
        if abs(first - second) < rtol:
            return

    exact_comparison = exact_comparison or prec == 'exact'

    if not exact_comparison and prec != 'exact':
        if prec == 'single':
            bits = 24
        elif prec == 'double':
            bits = 53
        else:
            raise ValueError("unsupported precision %r" % (prec,))
        k = 2 ** (ulps - bits - 1)
        delta = k * (abs(first) + abs(second))
    else:
        delta = None

    if isinstance(first, self._complex_types):
        _assertNumberEqual(first.real, second.real, delta)
        _assertNumberEqual(first.imag, second.imag, delta)
    elif isinstance(first, (np.timedelta64, np.datetime64)):
        # Since Np 1.16 NaT == NaT is False, so special comparison needed
        if numpy_support.version >= (1, 16) and np.isnat(first):
            self.assertEqual(np.isnat(first), np.isnat(second))
        else:
            _assertNumberEqual(first, second, delta)
    else:
        _assertNumberEqual(first, second, delta)
def pandas_to_table(df):
    # type: (pd.DataFrame) -> Orange.data.Table
    """
    Convert a pandas.DataFrame to an Orange.data.Table instance.
    """
    index = df.index
    if not isinstance(index, pd.RangeIndex):
        df = df.reset_index()

    columns = []  # type: List[Tuple[Orange.data.Variable, np.ndarray]]

    for header, series in df.items():  # type: (Any, pd.Series)
        if pdtypes.is_categorical(series):
            coldata = series.values  # type: pd.Categorical
            categories = [str(c) for c in coldata.categories]
            var = Orange.data.DiscreteVariable.make(
                str(header), values=categories, ordered=coldata.ordered
            )
            # Remap the coldata into the var.values order/set
            coldata = pd.Categorical(
                coldata, categories=var.values, ordered=coldata.ordered
            )
            codes = coldata.codes
            assert np.issubdtype(codes.dtype, np.integer)
            orangecol = np.array(codes, dtype=np.float)
            orangecol[codes < 0] = np.nan
        elif pdtypes.is_datetime64_any_dtype(series):
            # Check that this converts tz local to UTC
            series = series.astype(np.dtype("M8[ns]"))
            coldata = series.values  # type: np.ndarray
            assert coldata.dtype == "M8[ns]"
            mask = np.isnat(coldata)
            orangecol = coldata.astype(np.int64) / 10 ** 9
            orangecol[mask] = np.nan
            var = Orange.data.TimeVariable.make(str(header))
            var.have_date = var.have_time = 1
        elif pdtypes.is_object_dtype(series):
            coldata = series.values
            assert isinstance(coldata, np.ndarray)
            orangecol = coldata
            var = Orange.data.StringVariable.make(str(header))
        elif pdtypes.is_integer_dtype(series):
            coldata = series.values
            var = Orange.data.ContinuousVariable.make(str(header))
            var.number_of_decimals = 0
            orangecol = coldata.astype(np.float64)
        elif pdtypes.is_numeric_dtype(series):
            orangecol = series.values.astype(np.float64)
            var = Orange.data.ContinuousVariable.make(str(header))
            var._out_format = "%.15g"
        else:
            warnings.warn(
                "Column '{}' with dtype: {} skipped."
                .format(header, series.dtype),
                UserWarning
            )
            continue
        columns.append((var, orangecol))

    cols_x = [(var, col) for var, col in columns if var.is_primitive()]
    cols_m = [(var, col) for var, col in columns if not var.is_primitive()]

    variables = [v for v, _ in cols_x]
    if cols_x:
        X = np.column_stack([a for _, a in cols_x])
    else:
        X = np.empty((df.shape[0], 0), dtype=np.float)
    metas = [v for v, _ in cols_m]
    if cols_m:
        M = np.column_stack([a for _, a in cols_m])
    else:
        M = None

    domain = Orange.data.Domain(variables, metas=metas)
    return Orange.data.Table.from_numpy(domain, X, None, M)