def _deviation_ratio(value, mean, std):
    """Return how many standard deviations *value* lies away from *mean*.

    When *std* is zero the reference population is constant on this feature,
    and a plain division would fail (ZeroDivisionError for plain floats).
    In that case any difference is reported as infinite and an exact match
    as 0.0, so the result can still be sorted.
    """
    distance = abs(value - mean)
    if std == 0:
        return float('inf') if distance > 0 else 0.0
    return distance / std


def _to_text(value):
    """Return *value* as text, decoding it from UTF-8 only if it is bytes."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def analyze_samples(trn_samples, tst_samples, n, filename):
    """Write an XML risk-point report for customers predicted as overdue.

    For every test sample whose predicted label equals ``cf.OVERDUE``, the
    selected features are compared with the mean and standard deviation of
    the training samples whose true label equals ``cf.NON_OVERDUE``. The
    features of each customer are listed by descending deviation ratio
    ``abs(x - avg) / std``.

    Args:
        trn_samples (Samples): Training samples.
        tst_samples (Samples): Test samples.
        n (int): Number of most important features to report.
        filename (str): Path of the result file (UTF-8 encoded XML).
    """
    x_importances = get_x_importances(trn_samples)
    # Sort descending so the slice really holds the n MOST important
    # features; the previous ascending sort kept the n least important ones.
    # NOTE(review): assumes a larger score from get_x_importances means a
    # more important feature - confirm against that helper.
    top_n_xs = sorted(
        x_importances.keys(),
        key=lambda feature: x_importances[feature],
        reverse=True,
    )[:n]

    x_indexes = trn_samples.get_x_indexes()
    # Customers predicted as overdue.
    tst_pos_samples = Samples(
        x_indexes,
        [sample for sample in tst_samples
         if sample.get_y_pred() == cf.OVERDUE],
    )
    # Customers known to be non-overdue; they provide the baseline.
    trn_neg_samples = Samples(
        x_indexes,
        [sample for sample in trn_samples
         if sample.get_y() == cf.NON_OVERDUE],
    )
    trn_neg_avg_X = trn_neg_samples.get_avg_X()
    trn_neg_std_X = trn_neg_samples.get_std_X()

    report_samples = []
    for sample in tst_pos_samples:
        entry = {}
        entry['cust_num'] = sample.get_cust_num()
        entry['protol_nums'] = {'protol_num': sample.get_protol_nums()}
        entry['y_pred'] = sample.get_y_pred()

        X = sample.get_X()
        ratios = []
        for name in top_n_xs:
            index = x_indexes[name]
            x = X[index]
            avg_x = trn_neg_avg_X[index]
            std_x = trn_neg_std_X[index]
            ratio = _deviation_ratio(x, avg_x, std_x)
            ratios.append((name, x, avg_x, std_x, ratio))
        # Largest deviation first: those are the customer's main risk points.
        ratios.sort(key=lambda item: item[4], reverse=True)

        entry['X'] = {
            'x': [
                {
                    'ratio': str(ratio),
                    'val': str(x),
                    'avg_val': str(avg_x),
                    'std_val': str(std_x),
                    '@name': _to_text(name),
                }
                for name, x, avg_x, std_x, ratio in ratios
            ]
        }
        report_samples.append(entry)

    document = {'samples': {'sample': report_samples}}
    xml = xmltodict.unparse(document, pretty=True).encode('utf-8')
    # Serialize before opening so a failure above cannot leave a truncated
    # result file, and write in binary mode because the payload is bytes.
    with open(filename, 'wb') as outfile:
        outfile.write(xml)