def combine_backward(df, order, period=30, strict=None):
    """Combine signal columns into ordered groups, searching backward in time.

    Every occurrence (value ``1``) of the last signal in ``order`` seeds one
    candidate group. Walking ``order`` from right to left, each earlier
    signal is looked up with ``_backward`` inside the candidate's current
    window; the window's right edge then shrinks to the position that was
    matched, so the next (earlier) signal has to occur before it (or at the
    same position when the corresponding ``strict`` flag is False).
    Candidates for which some signal cannot be matched are discarded.

    Args:
        df (DataFrame): signal table containing at least the columns named
            in ``order``; a cell equal to 1 marks a signal.
        order (list of str): signal column names in chronological order.
        period (int or None): maximum distance, counted back from the
            position of the last signal, within which the whole group must
            occur. With ``None`` the window of a candidate instead starts at
            the previous occurrence of the last signal (0 for the first one).
        strict (list of bool or None): ``strict[i]`` tells whether
            ``order[i]`` must occur strictly before ``order[i + 1]`` (True)
            or may share its position (False). Missing trailing entries
            default to True; entries beyond ``len(order) - 1`` are ignored.
            The list passed in is never modified.

    Returns:
        DataFrame or None: one row per complete group with the columns of
        ``order``; each value is the 0-based row position of that signal in
        ``df``. ``None`` is returned when the last signal never occurs.
    """
    rdf = pd.DataFrame()
    # Seed the result with the position of every occurrence of the last signal.
    rdf[order[-1]] = pd.Series(np.where(df[order[-1]] == 1)[0])

    if len(order) > 1:
        n_links = len(order) - 1
        # Work on a private copy so neither the default nor the caller's
        # list is mutated, and make it exactly one flag per adjacent pair.
        flags = [] if strict is None else list(strict)[:n_links]
        flags.extend([True] * (n_links - len(flags)))

        # The left edge of every window depends only on the last signal;
        # earlier matches only ever move the right edge.
        ends = rdf[order[-1]].tolist()
        if period is None:
            starts = [0] + ends[:-1]
        else:
            # Positions are never negative, so clamping at 0 changes nothing
            # about which signals fall inside the window.
            starts = [max(0, end - period) for end in ends]
        range_sr = pd.Series(list(zip(starts, ends)), dtype=object)

        # Process the remaining signals from the latest to the earliest.
        for name, is_strict in zip(reversed(order[:-1]), reversed(flags)):
            signal_loc = np.where(df[name] == 1)[0]
            range_sr = range_sr.map(
                lambda window, loc=signal_loc, flag=is_strict:
                    _backward(window, loc, flag),
                na_action='ignore',
            )
            # The second tuple element is the matched position (NaN if none).
            rdf[name] = range_sr.map(lambda window: window[1])

    if len(rdf) > 0:
        # Discard candidates in which at least one signal was not found.
        rdf = rdf.dropna()
        rdf = rdf[list(order)].reset_index(drop=True)
    else:
        rdf = None
    return rdf
def _backward(x, signal_loc, strict):
    """Locate the last signal position that falls inside the window ``x``.

    Args:
        x (tuple): ``(left, right)`` bounds of the search window.
        signal_loc (numpy.ndarray): positions at which the signal fires.
        strict (bool): when True the match must lie in ``[left, right)``;
            when False ``right`` itself is also accepted.

    Returns:
        tuple: ``(left, match)`` where ``match`` is the last element of
        ``signal_loc`` inside the window, or ``(nan, nan)`` when the window
        contains no signal.
    """
    lower, upper = x[0], x[1]
    # In the non-strict case a window starting at 0 is widened to -1
    # (kept from the original formula; it is also the value handed back).
    if not strict and lower == 0:
        lower = -1

    if strict:
        in_window = (signal_loc >= lower) & (signal_loc < upper)
    else:
        in_window = (signal_loc >= lower) & (signal_loc <= upper)

    hits = signal_loc[in_window]
    if hits.size == 0:
        # np.nan rather than the np.NAN alias, which NumPy 2.0 removed.
        return (np.nan, np.nan)
    return (lower, hits[-1])
def ix_nan(self, colnames=None, indices=None):
    """Return the entries of ``indices`` whose rows are null in every column.

    ``indices`` and ``colnames`` are first passed through
    ``self.getindices`` and ``self.getcolnames`` (defined outside this
    block; presumably they expand ``None`` to "all" -- TODO confirm). The
    selection is then narrowed column by column, keeping only the entries
    for which ``self.t.loc[indices, colname]`` is null.

    Args:
        colnames: column selection forwarded to ``self.getcolnames``.
        indices: row selection forwarded to ``self.getindices``.

    Returns:
        The remaining entries of ``indices`` after filtering on all columns.
    """
    # Resolve the row selection.
    indices = self.getindices(indices)
    # Resolve the columns to iterate over.
    colnames = self.getcolnames(colnames)
    for colname in colnames:
        # pandas has no ``isnan``; ``isnull`` is its missing-value test.
        # np.where yields positions within the current selection, which are
        # then used to subset ``indices``.
        (null,) = np.where(pd.isnull(self.t.loc[indices, colname]))
        indices = indices[null]
    return indices
'long_term_incentive', 'restricted_stock', 'director_fees', 'to_messages', \ 'from_messages', 'norm_from_poi_to_this_person', 'norm_from_this_person_to_poi', \ 'norm_shared_receipt_with_poi', 'from_poi_to_this_person', 'from_this_person_to_poi', 'shared_receipt_with_poi'] figure = plt.figure(figsize=(27, 20)) j = 1 for i, feature1 in enumerate(features_list2[1:]): y_name = features_list2[0] key = feature1 bin_step = 20 all_data = df[[y_name, key]] # Remove NaN values from Age data all_data = all_data[~np.isnan(all_data[key])] # Divide the range of data into bins and count survival rates min_value = all_data[key].min() max_value = all_data[key].max() value_range = max_value - min_value bins = np.arange(min_value, max_value + value_range/bin_step, value_range/bin_step ) y0 = all_data[all_data[y_name] == 0][key].reset_index(drop = True) y1 = all_data[all_data[y_name] == 1][key].reset_index(drop = True) ax = plt.subplot(len(features_list2)/4+1, 4, j) ax.hist(y0, bins = bins, alpha = 0.6, color = 'red', label = 'y0') ax.hist(y1, bins = bins, alpha = 0.6, color = 'green', label = 'y1') ax.set_xlim(bins.min(), bins.max()) ax.set_title(key) # ax.legend(framealpha = 0.8) j+=1