Beispiel #1
0
def combine_backward(df, order, period=30, strict=None):
    """
    Combine ordered signal columns into feasible signal groups, scanning backward.

    Every occurrence (value == 1) of the last signal in ``order`` seeds one
    candidate group.  Walking ``order`` from right to left, each earlier
    signal is matched, via ``_backward``, to its latest occurrence inside the
    candidate's current search window; the matched position then becomes the
    upper bound of the window used for the next (earlier) signal.  Candidates
    for which some signal cannot be matched are discarded.

    Input:
        df: (DataFrame): signal series; must contain every column named in
            ``order``.  A signal "fires" where the column equals 1.

        order: (list of string): signal column names in chronological order.

        period: (int or None): maximum length of a group, counted back from
            the position of the last signal.  When None, the window of each
            candidate instead starts at the previous occurrence of the last
            signal (0 for the first occurrence).

        strict: (list of boolean): ``strict[i]`` tells whether ``order[i]``
            must fire strictly before ``order[i+1]`` (True) or may fire at
            the same position (False).  Defaults to all True; a list shorter
            than ``len(order) - 1`` is padded with True.  The list passed in
            is never modified.

    Output:
        (DataFrame[order]) one row per feasible group; each value is the
        integer row position (as produced by ``np.where``) at which that
        signal fired.  Returns None when the last signal never fires.
    """
    # Work on a private copy: padding must not leak into the caller's list
    # (or into a shared default object) between calls.
    strict = [] if strict is None else list(strict)

    last_name = order[-1]
    rdf = pd.DataFrame()
    # Seed one candidate row per occurrence of the final signal.
    rdf[last_name] = pd.Series(np.where(df[last_name] == 1)[0])

    if len(order) > 1:
        missing = len(order) - 1 - len(strict)
        if missing > 0:
            strict.extend([True] * missing)

        # The initial window of a candidate depends only on the position of
        # its last signal; earlier signals only ever shrink the upper bound.
        ends = rdf[last_name].tolist()
        if period is None:
            starts = [0] + ends[:-1]
        else:
            starts = [end - period for end in ends]
            if starts:
                starts[0] = max(0, starts[0])
        range_sr = pd.Series(list(zip(starts, ends)))

        # Match the remaining signals from the second-to-last back to the first.
        for pos in range(len(order) - 2, -1, -1):
            name = order[pos]
            signal_loc = np.where(df[name] == 1)[0]
            # strict[pos] governs the pair (order[pos], order[pos + 1]).
            is_strict = strict[pos]
            tmp_sr = range_sr.map(
                lambda window: _backward(window, signal_loc, is_strict),
                na_action='ignore')
            rdf[name] = tmp_sr.map(lambda window: window[1])
            # The matched position bounds the search for the next signal.
            range_sr = tmp_sr

    if len(rdf) == 0:
        return None

    # Drop every candidate (row) in which at least one signal was not matched.
    unmatched_rows = np.where(np.isnan(rdf))[0]
    rdf = rdf.drop(unmatched_rows)
    return rdf[order].reset_index(drop=True)
Beispiel #2
0
def _backward(x, signal_loc, strict):
    #若strict=False即宽松条件,当进入的tuple为(0,0)时,需要让left为-1才能套公式
    left = -1 if (x[0] == 0 and not strict) else x[0]
    right = x[1]
    if strict:
        loc = np.where((signal_loc >= left) & (signal_loc < right))[0]
    else:
        loc = np.where((signal_loc >= left) & (signal_loc <= right))[0]
    right = signal_loc[loc.max()] if len(loc) > 0 else np.NAN
    result = (left, right) if (not np.isnan(right)) else (np.NAN, np.NAN)
    return result
Beispiel #3
0
 def ix_nan(self, colnames=None, indices=None):
     """Return the indices whose rows are NaN in every selected column.

     ``indices`` and ``colnames`` are first resolved through
     ``self.getindices`` and ``self.getcolnames`` (defined elsewhere;
     presumably they supply defaults when None is passed -- confirm).
     The candidate set is then narrowed column by column, so an index
     survives only if ``self.t`` holds NaN for it in all of the columns.

     NOTE(review): ``indices`` must support positional indexing with an
     integer array (e.g. a NumPy array or pandas Index) -- confirm against
     ``getindices``.
     """
     indices = self.getindices(indices)
     colnames = self.getcolnames(colnames)

     for colname in colnames:
         # pandas has no ``isnan``; ``pd.isna`` is the missing-value test.
         (nan_pos,) = np.where(pd.isna(self.t.loc[indices, colname]))
         # nan_pos are positions within the current ``indices`` selection.
         indices = indices[nan_pos]
     return indices
Beispiel #4
0
'long_term_incentive', 'restricted_stock', 'director_fees', 'to_messages', \
 'from_messages', 'norm_from_poi_to_this_person', 'norm_from_this_person_to_poi', \
 'norm_shared_receipt_with_poi', 'from_poi_to_this_person', 'from_this_person_to_poi', 'shared_receipt_with_poi']


# Draw one histogram per feature, split by the value (0 / 1) of the target
# column features_list2[0], laid out on a grid that is four plots wide.
figure = plt.figure(figsize=(27, 20))
bin_step = 20
# Floor division: the subplot row count must be an integer, and "/" yields a
# float under Python 3.
n_rows = len(features_list2) // 4 + 1
j = 1
for i, feature1 in enumerate(features_list2[1:]):
    y_name = features_list2[0]
    key = feature1
    all_data = df[[y_name, key]]
    # Drop the rows in which this feature is NaN
    all_data = all_data[~np.isnan(all_data[key])]
    # Split the feature's value range into bin_step equal-width bins
    min_value = all_data[key].min()
    max_value = all_data[key].max()
    value_range = max_value - min_value
    bin_width = value_range / bin_step
    bins = np.arange(min_value, max_value + bin_width, bin_width)
    y0 = all_data[all_data[y_name] == 0][key].reset_index(drop=True)
    y1 = all_data[all_data[y_name] == 1][key].reset_index(drop=True)
    ax = plt.subplot(n_rows, 4, j)
    # Overlay both classes on the same axes with shared bin edges
    ax.hist(y0, bins=bins, alpha=0.6, color='red', label='y0')
    ax.hist(y1, bins=bins, alpha=0.6, color='green', label='y1')
    ax.set_xlim(bins.min(), bins.max())
    ax.set_title(key)
    # ax.legend(framealpha=0.8)  # uncomment to show the y0 / y1 legend
    j += 1