def _sanitize_dates(start: Union[int, date, datetime], end: Union[int, date, datetime]) -> Sanitize_Type: """ Return (datetime_start, datetime_end) tuple """ if start and end: if start > end: raise ValueError("end must be after start") else: raise ValueError("start and or end must contain valid int. date or datetime object") start = datetime(start, 1, 1) if _types.is_number(start) else pd.to_datetime(start) end = datetime(end, 1, 1) if _types.is_number(end) else pd.to_datetime(end) return start, end
def sanitize_dates(start, end):
    """Coerce start/end into a (datetime_start, datetime_end) tuple.

    An integer bound is shorthand for January 1st of that year; anything
    else is handed to ``pd.to_datetime``. Raises ``Exception`` when both
    bounds are present and start falls after end.
    """
    def _coerce(value):
        # An int means "the start of that year".
        if is_number(value):
            value = datetime.datetime(value, 1, 1)
        return pd.to_datetime(value)

    start, end = _coerce(start), _coerce(end)
    if start is not None and end is not None and start > end:
        raise Exception("end must be after start")
    return start, end
def build_address(row: pd.Series) -> str:
    """Combine address columns into a single mailing-address string.

    Columns required are "StreetAddress", "City", "State" and "Zip".
    Numeric ZIP values are zero-padded to five characters (leading zeros
    are significant in ZIP codes but are lost in numeric columns).

    Parameters
    ----------
    row : pd.Series
        A row containing the four required address columns.

    Returns
    -------
    str
        The escaped, comma-separated mailing address.
    """
    # Renamed from `zip` to avoid shadowing the builtin.
    zip_code = row["Zip"]
    if pd_types.is_number(zip_code):
        zip_code = f'{zip_code:0>5}'
    return escape_value(
        f'{row["StreetAddress"]}, {row["City"]}, {row["State"]} {zip_code}')
def _sanitize_dates(start, end): """ Return (datetime_start, datetime_end) tuple if start is None - default is 2010/01/01 if end is None - default is today """ if is_number(start): # regard int as year start = dt.datetime(start, 1, 1) start = to_datetime(start) if is_number(end): end = dt.datetime(end, 1, 1) end = to_datetime(end) if start is None: start = dt.datetime(2010, 1, 1) if end is None: end = dt.datetime.today() return start, end
def _sanitize_dates(start: typing.Union[None, int], end: typing.Union[None, int]) -> tuple: """ Return (datetime_start, datetime_end) tuple """ if is_number(start): # regard int as year start: datetime.datetime = datetime.datetime(start, 1, 1) start = pd.to_datetime(start) if is_number(end): # regard int as year end: datetime.datetime = datetime.datetime(end, 1, 1) end = pd.to_datetime(end) if start and end: if start > end: raise Exception("end must be after start") return start, end
def _sanitize_dates(start, end): """ Return (timestamp_start, timestamp_end) tuple if start is None - default is 15 years before the current date if end is None - default is today Parameters ---------- start : str, int, date, datetime, Timestamp Desired start date end : str, int, date, datetime, Timestamp Desired end date """ today = dt.date.today() today = to_datetime(today) if is_number(start): # regard int as year start = dt.datetime(start, 1, 1) start = to_datetime(start) if is_number(end): end = dt.datetime(end, 1, 1) end = to_datetime(end) if start is None: # default to 5 years before today start = today - dt.timedelta(days=365 * 15) if end is None: # default to today end = today try: start = to_datetime(start) end = to_datetime(end) except (TypeError, ValueError): raise ValueError("Invalid date format.") if start > end: raise ValueError("start must be an earlier date than end") if start > today or end > today: raise ValueError("Start and end dates must be before current date") return start, end
def partition_data(self, data, inquiry):
    """Split ``data`` into rows answering the inquiry "yes" vs "no".

    Numeric inquiry values match with ``>=`` against the labelled column;
    anything else matches with ``==``.

    Returns (yes_data, no_data).
    """
    column, threshold = inquiry.label, inquiry.value
    mask = (
        data[column] >= threshold
        if is_number(threshold)
        else data[column] == threshold
    )
    matching = data[mask]
    return matching, data.drop(matching.index)
def _sanitize_date(obj, default): """转换为日期对象,如果为None则使用默认值。输出datetime.date对象""" if isinstance(obj, pd.Timestamp): return obj.date() # 务必排在dt.date之前 if isinstance(obj, dt.datetime): return obj.date() if isinstance(obj, dt.date): return obj if is_number(obj): return dt.date(obj, 1, 1) if isinstance(obj, str): return pd.to_datetime(obj).date() if obj is None: return default raise ValueError('不能识别的输入日期')
def from_values(cls, initial_value, values=None, closed="left"):
    """
    Construct :class:`Stairs` from :class:`pandas.Series`.

    Parameters
    ----------
    initial_value : float, default 0
        The value of the step function at negative infinity.
        (Note: despite the documented default, the parameter is required
        in this signature.)
    values : :class:`pandas.Series`
        The step function values' when approaching the change points
        from the right
    closed : {"left", "right"}
        Indicates whether the half-open intervals comprising the step
        function should be interpreted as left-closed or right-closed.

    Returns
    -------
    :class:`Stairs`

    Raises
    ------
    ValueError
        If ``values`` is not a non-empty Series, if its index contains
        infinities or is not monotonic increasing, or if the values /
        ``initial_value`` are not numeric.
    """
    if not isinstance(values, pd.Series) or values.empty:
        raise ValueError("values must be a not empty Series")
    # A non-numeric, non-temporal index is tolerated but warned about.
    if not (is_numeric_dtype(values.index) or is_datetime64_dtype(
            values.index) or is_timedelta64_dtype(values.index)):
        warnings.warn("The index of data is not numeric, or time based")
    if np.isinf(values.index).any():
        raise ValueError("Invalid value for Series index")
    if not is_numeric_dtype(values) or not is_number(initial_value):
        raise ValueError("Invalid dtype for from_values()")
    if not values.index.is_monotonic_increasing:
        raise ValueError("Series index must be monotonic")
    # Infinite step values cannot be represented; coerce them to NaN.
    series_values_inf_mask = np.isinf(values)
    if series_values_inf_mask.any():
        # NOTE(review): the mask above flags both +inf and -inf, but only
        # +inf is replaced here, so -inf values would survive — confirm intent.
        values = values.replace([np.inf], np.nan)
        warnings.warn(
            "Infinity values detected and have been converted to NaN")
    new_instance = cls(closed=closed)
    new_instance.initial_value = initial_value
    # Values are stored as a one-column frame; deltas are derived lazily.
    new_instance._data = values.to_frame("value")
    new_instance._valid_deltas = False
    new_instance._valid_values = True
    return new_instance
def _fit(self, X: pd.Series, y):
    """Fit one feature against the target and derive tree-based cutoffs.

    Parameters
    ----------
    X : pd.Series
        A single feature column; must be numeric unless its name is
        listed in ``self.categorical_cols``.
    y : array-like
        Target labels aligned with ``X``.

    Returns
    -------
    tuple
        ``(parse_tree(DT.tree_), DT)`` — the parsed split points and the
        fitted ``DecisionTreeClassifier``.

    Raises
    ------
    ValueError
        If ``X`` is non-numeric and not declared categorical.
    """
    if not is_numeric_dtype(X) and X.name not in self.categorical_cols:
        raise ValueError(
            'Column {} is not numeric and not in categorical_cols.'.format(
                X.name))
    if X.name in self.categorical_cols:
        # Replace categories with label-derived encodings so the tree can
        # split on an ordered axis.
        X = self.encode_with_label(X, y)
    if not self.encode:
        # Record the observed range of the (possibly encoded) column.
        # NOTE(review): nesting reconstructed from a collapsed source —
        # confirm whether this belongs inside the categorical branch.
        self.min_[X.name], self.max_[X.name] = X.min(), X.max()
    X, y = self._drop_na(X, y)
    # min_frac is either one global fraction or a per-column mapping.
    min_frac = self.min_frac if is_number(
        self.min_frac) else self.min_frac[X.name]
    DT = DecisionTreeClassifier(max_leaf_nodes=self.bins,
                                min_samples_leaf=min_frac,
                                random_state=self.random_state)
    DT.fit(X.to_frame(), y)
    return parse_tree(DT.tree_), DT
def get_ans(self, query_data):
    """Answer this inquiry against one row of data.

    ``query_data`` is the row of values for which the answer to this
    inquiry is required. Numeric inquiry values are compared with ``>=``
    against ``query_data[self.label]``; non-numeric values with ``==``.

    Example:
        query_data = [255, 0, 0, 'red']
        {label='R', value=255}       -> query_data['R'] >= 255
        {label='color', value='red'} -> query_data['color'] == 'red'
    """
    observed = query_data[self.label]
    return observed >= self.value if is_number(self.value) else observed == self.value
def censor(x, range=(0, 1), only_finite=True):
    """
    Convert any values outside of range to a **NULL** type object.

    Parameters
    ----------
    x : array_like
        Values to manipulate
    range : tuple
        (min, max) giving desired output range
    only_finite : bool
        If True (the default), will only modify finite values.

    Returns
    -------
    x : array_like
        Censored array

    Examples
    --------
    >>> a = [1, 2, np.inf, 3, 4, -np.inf, 5]
    >>> censor(a, (0, 10))
    [1, 2, inf, 3, 4, -inf, 5]
    >>> censor(a, (0, 10), False)
    [1, 2, nan, 3, 4, nan, 5]
    >>> censor(a, (2, 4))
    [nan, 2, inf, 3, 4, -inf, nan]

    Notes
    -----
    All values in ``x`` should be of the same type. ``only_finite``
    parameter is not considered for Datetime and Timedelta types.

    The **NULL** type object depends on the type of values in **x**.

    - :class:`float` - :py:`float('nan')`
    - :class:`int` - :py:`float('nan')`
    - :class:`datetime.datetime` : :py:`np.datetime64(NaT)`
    - :class:`datetime.timedelta` : :py:`np.timedelta64(NaT)`
    """
    if not len(x):
        return x

    py_time_types = (datetime.datetime, datetime.timedelta)
    np_pd_time_types = (pd.Timestamp, pd.Timedelta,
                        np.datetime64, np.timedelta64)
    x0 = first_element(x)

    # Yes, we want type not isinstance
    if type(x0) in py_time_types:
        return _censor_with(x, range, 'NaT')

    if not hasattr(x, 'dtype') and isinstance(x0, np_pd_time_types):
        return _censor_with(x, range, type(x0)('NaT'))

    x_array = np.asarray(x)
    # Pick the NULL sentinel that matches the element / array type.
    if pdtypes.is_number(x0) and not isinstance(x0, np.timedelta64):
        null = float('nan')
    elif com.is_datetime_arraylike(x_array):
        # NOTE(review): com.is_datetime_arraylike is a private pandas API that
        # has been removed in recent pandas releases — confirm pinned version.
        null = pd.Timestamp('NaT')
    elif pdtypes.is_datetime64_dtype(x_array):
        null = np.datetime64('NaT')
    elif isinstance(x0, pd.Timedelta):
        null = pd.Timedelta('NaT')
    elif pdtypes.is_timedelta64_dtype(x_array):
        null = np.timedelta64('NaT')
    else:
        raise ValueError(
            "Do not know how to censor values of type "
            "{}".format(type(x0)))

    # Only finite values are candidates for censoring unless asked otherwise;
    # np.isfinite raises TypeError on non-numeric data, in which case every
    # element is treated as finite.
    if only_finite:
        try:
            finite = np.isfinite(x)
        except TypeError:
            finite = np.repeat(True, len(x))
    else:
        finite = np.repeat(True, len(x))

    if hasattr(x, 'dtype'):
        # ndarray/Series path: censor via boolean-mask assignment on a copy.
        outside = (x < range[0]) | (x > range[1])
        bool_idx = finite & outside
        x = x.copy()
        x[bool_idx] = null
    else:
        # Plain sequence path: rebuild element by element.
        x = [null if not range[0] <= val <= range[1] and f else val
             for val, f in zip(x, finite)]
    return x
def __str__(self): condition = '>=' if is_number(self.value) else '==' return 'Is %s %s %s?' % (str(self.label), condition, str(self.value))
def _fit(self, X, y, **fit_parmas):
    """Fit a single feature and return the cutoff points.

    Depending on the column, the return value is one of:
    - ``None`` for a categorical column when ``bin_cat_cols`` is off,
    - a dict mapping raw values to encoded groups (small categorical),
    - ``[-np.inf]`` when no meaningful binning is possible,
    - the merged mapping's keys (the cutoff points) otherwise.

    NOTE(review): ``fit_parmas`` looks like a typo of ``fit_params`` but
    renaming it would change the keyword interface, so it is kept.
    """
    self.categorical_cols = self.categorical_cols or []
    if not is_numeric_dtype(X) and X.name not in self.categorical_cols:
        raise ValueError('Column {} is not numeric and not in categorical_cols.'.format(X.name))
    y = force_zero_one(y)
    X, y = make_series(X), make_series(y)

    # if X is discrete, encode with positive ratio in y
    if X.name in self.categorical_cols:
        # the categorical columns will remain unchanged if
        # we turn off bin_cat_cols
        if not self.bin_cat_cols:
            return None
        X = self.encode_with_label(X, y)

    # the number of bins is the number of cutoff points minus 1
    n_bins = X.nunique() - 1

    # if the number of bins is less than `max_bin` for categorical columns then
    # set the column as a mapping
    if n_bins < self.max_bin and X.name in self.categorical_cols:
        # mapping bad rate to encoding
        group_mapping = {v: i+1 for i, v in enumerate(set(X[X.notnull()]))}
        return self.discrete_encoding[X.name].map(group_mapping).to_dict()

    # speed up the process with prebinning
    if self.prebin and n_bins > self.prebin:
        if self.prebin_method.lower() == 'tree':
            # min_frac is either one global fraction or a per-column mapping
            min_frac = self.min_frac if is_number(self.min_frac) else self.min_frac[X.name]
            X, _ = tree_binning(X, y, n=self.prebin, min_frac=min_frac, encode=False, random_state=1024)
        elif self.prebin_method.lower() == 'equal_freq':
            X, _ = equal_frequency_binning(X, n=self.prebin, encode=False)
        else:
            raise ValueError('Only `tree` and `equal_freq` is supported for prebin_method.')

    # convert to mapping: bin value -> list of target labels in that bin
    mapping = y.groupby(X).apply(list).to_dict()

    # set the overall expected ratio
    if len(mapping) == 0:
        return [-np.inf]
    self.expected_ratio = sum(sum(v) for v in mapping.values()) / sum(len(v) for v in mapping.values())

    # if the expected_ratio is 0 or 1 there should be only 1 group and
    # any not-null value will be encoded into 1
    if self.expected_ratio == 0 or self.expected_ratio == 1:
        return [-np.inf]

    n_bins = len(mapping)
    # merge bins based on chi square until we are within max_bin
    while n_bins > self.max_bin:
        mapping = self.merge_chisquare(mapping)
        n_bins = len(mapping)

    # merge bins to create mixed label in every bin
    if self.force_mix_label and n_bins > 1:
        is_pure = False
        while not is_pure:
            mapping, is_pure = self.merge_purity(mapping)

    # merge bins to keep bins to be monotonic
    if self.force_monotonic:
        while len(mapping) > 2 and not self.is_monotonic_post_bin(mapping):
            # mapping = self.merge_chisquare(mapping)
            mapping = self.merge_monotonic(mapping)

    # merge bins to meet the minimum sample size for each interval;
    # a fractional min_interval_size is interpreted relative to the
    # number of non-null observations
    if self.min_interval_size > 0:
        if self.min_interval_size <= 1:
            min_interval_size = self.min_interval_size * X.notnull().sum()
        else:
            min_interval_size = self.min_interval_size
        meet_interval_size = False
        while not meet_interval_size and len(mapping) > 2:
            mapping, meet_interval_size = self.merge_interval_size(mapping, min_interval_size)

    # clean up the cache
    self._chisquare_cache = dict()
    # NOTE(review): this returns a dict_keys view while other branches return
    # a list/dict — callers should not rely on list semantics here.
    return mapping.keys()
def stringify(a): if is_number(a): b = str(int(round(a,0))) elif type(a) == type([]):
def fraglist_clearner(fraglist): for frag in fraglist: n=0 if is_number(frag) == True:
def _to_kv_quantity(value: object) -> str: if pd_types.is_number(value): return f'[{value} KiloVolt]' return ''
def zip_to_dcid(zip: object) -> str:
    """Map a numeric ZIP code to its zero-padded DCID, or '' if non-numeric."""
    return f'dcid:zip/{zip:0>5}' if pd_types.is_number(zip) else ''
def generate_transitions(df):
    '''
    Remember to initialize a dictionary for the transitions

    Builds tab-separated MRM transition blocks (one string per method name)
    from a dataframe with columns 'inhouse', 'name', 'Adduct', 'Q1',
    'Fragments (low)' and 'Fragments (High)', and returns them as a dict.

    NOTE(review): indentation reconstructed from a collapsed source —
    verify block nesting against the original file.
    '''
    # NOTE(review): CEDP_methods is assigned later in this function, so this
    # bare reference raises UnboundLocalError (a NameError subclass) and the
    # except branch always runs — the dict is recreated fresh on every call.
    try:
        CEDP_methods
    except NameError:
        CEDP_methods = {}
    EP = 10   # entrance potential, constant for every transition row
    CXP = 4   # collision cell exit potential, constant
    Time = 5  # dwell time column, constant
    for idx in range(len(df)):
        ln = df.loc[idx]
        inhouse = ln['inhouse'].strip()
        chemname = ln['name'].strip().capitalize()
        adduct = ln['Adduct'].strip()
        Q1 = ln['Q1'].round(5)
        Q1s = str(int(ln['Q1'].round()))
        # Low-energy fragments: a single number, or a comma-separated string.
        if is_number(ln['Fragments (low)']) == True:
            fl = [ln['Fragments (low)']]
        else:
            fl = [i.strip("?!. ,") for i in ln['Fragments (low)'].split(',')]
        # NOTE(review): this branch assigns `fl`, clobbering the low list; it
        # almost certainly should assign `fh`. When 'Fragments (High)' is
        # numeric, `fh` is never bound and the high-fragment loop below
        # raises NameError — confirm and fix upstream.
        if is_number(ln['Fragments (High)']) == True:
            fl = [ln['Fragments (High)']]
        else:
            fh = [i.strip("?.! ,") for i in ln['Fragments (High)'].split(',')]
        best_list = []
        ## CE optimizations
        DP = 50
        method_string = ''
        method_name = f'200706_CEDP_%s_%s_MZ%s_Pos' %(inhouse, adduct, Q1s)
        ################
        # Low-energy CE ramp (10..45 V in 5 V steps) for up to six fragments.
        for frag in fl[:6]:
            #print(frag)
            # NOTE(review): items are removed from `fl` while iterating the
            # slice `fl[:6]`; the slice copy keeps iteration safe, but the
            # later `len(fl)` checks become order-dependent.
            if is_number(frag) == True:
                if pd.isna(frag) == True:
                    fl.remove(frag)
                    #print('1', fl)
            # NOTE(review): is_string_dtype expects an array/dtype, not a
            # scalar — its behavior on a plain str here is dubious; confirm.
            if is_string_dtype(frag) == True:
                try:
                    float(frag)
                except:
                    #print('2', fl)
                    fl.remove(frag)
                else:
                    try:
                        Q3 = float(frag.strip("?. ,!"))
                    except:
                        #print('3', fl)
                        continue
            else:
                fl.remove(frag)
                #print('4', fl)
            if len(fl) >=1:
                #print(Q1, frag, pd.isna(frag))
                for Q3, CE in make_CE_MRMs(frag, range(10,50, 5)):
                    try:
                        Q3s = str(int(Q3))
                    except:
                        continue
                    ID = f'%s_%s_%s_%s_DP%s_CE%s' %(inhouse, chemname, Q1s, Q3s, DP, CE)
                    lowfstr = '\t'.join([str(x) for x in (round(Q1,2),round(Q3,3),Time,ID,DP,EP,CE,CXP)])
                    method_string = method_string + lowfstr + '\n'
                    #print((method_string))
            else:
                pass
        CEDP_methods[method_name] = method_string
        ###################
        # High-energy CE ramp (40..85 V in 5 V steps), same cleanup pattern.
        for frag in fh[:6]:
            #print(frag)
            if is_number(frag) == True:
                if pd.isna(frag) == True:
                    fh.remove(frag)
                    #print('1', fh)
            if is_string_dtype(frag) == True:
                try:
                    float(frag)
                except:
                    #print('2', fh)
                    fh.remove(frag)
                else:
                    try:
                        Q3 = float(frag.strip("?. ,!"))
                    except:
                        #print('3', fh)
                        continue
            else:
                fh.remove(frag)
                #print('4', fh)
            if len(fh) >=1:
                #print(Q1, frag, pd.isna(frag))
                for Q3, CE in make_CE_MRMs(frag, range(40,90, 5)):
                    try:
                        Q3s = str(int(Q3))
                    except:
                        continue
                    ID = f'%s_%s_%s_%s_DP%s_CE%s' %(inhouse, chemname, Q1s, Q3s, DP, CE)
                    lowfstr = '\t'.join([str(x) for x in (round(Q1,2),round(Q3,3),Time,ID,DP,EP,CE,CXP)])
                    method_string = method_string + lowfstr + '\n'
                    #print((method_string))
            else:
                pass
        CEDP_methods[method_name] = method_string
        ########
        # DP ramp (10..120 V in 10 V steps) on the best (first) fragment of
        # each list.
        if len(fl) >= 1:
            best_list.append(fl[0])
        if len(fh) >= 1:
            best_list.append(fh[0])
        if len(best_list) >= 1:
            hlswitch = 0
            # NOTE(review): hlswitch is never incremented, so CE is always 25
            # here — the 50 V branch is dead code; confirm intent.
            for frag in best_list :
                if hlswitch >=1 :
                    CE = 50
                else:
                    CE = 25
                if is_number(frag) == True:
                    if pd.isna(frag) == True:
                        best_list.remove(frag)
                        #print('1', best_list)
                if is_string_dtype(frag) == True:
                    try:
                        float(frag)
                    except:
                        #print('2', best_list)
                        best_list.remove(frag)
                    else:
                        try:
                            Q3 = float(frag.strip("?. ,!"))
                        except:
                            #print('3', best_list)
                            continue
                # #else:
                #     best_list.remove(frag)
                #     #print('4', best_list)
                if len(best_list) >=1:
                    #print(Q1, frag, pd.isna(frag))
                    for Q3, DP in make_CE_MRMs(frag, range(10, 130, 10)):
                        try:
                            Q3s = str(int(Q3))
                        except:
                            continue
                        ID = f'%s_%s_%s_%s_DP%s_CE%s' %(inhouse, chemname, Q1s, Q3s, DP, CE)
                        lowfstr = '\t'.join([str(x) for x in (round(Q1,2),round(Q3,3),Time,ID,DP,EP,CE,CXP)])
                        method_string = method_string + lowfstr + '\n'
                        #print((method_string))
                else:
                    pass
        CEDP_methods[method_name] = method_string
    return(CEDP_methods)