def distribution_for_half(self):
    """Return a per-condition price distribution over active half offers.

    Prices of active offers are grouped by internal condition code. For a
    condition with more than one offer the value is the 6-tuple
    (count, min, p25, median, p75, max) with quantiles as 2-decimal
    Decimals; for a single offer it is (1, "", "", price, "", "").
    The result is memoized on self._distribution_half.

    Returns:
        dict mapping internal condition code -> distribution tuple.
    """
    # Fixed: `!= None` -> `is not None` (identity check for None).
    if self._distribution_half is not None:
        return self._distribution_half

    # Group prices of active offers by internal condition code.
    # Fixed: dict.has_key() was removed in Python 3; use setdefault.
    bycond = {}
    for offer in self.halfoffer_set.all():
        if offer.active:
            cond = map_half_cond_to_internal(offer.condition)
            bycond.setdefault(cond, []).append(float(offer.price))

    from stats import quantile
    dist = {}
    for cond, prices in bycond.items():
        if len(prices) > 1:
            # Fixed: renamed locals that shadowed builtins min/max.
            q_min = quantile(prices, 0)
            p25 = quantile(prices, 0.25)
            med = quantile(prices, 0.5)
            p75 = quantile(prices, 0.75)
            q_max = quantile(prices, 1)
            dist[cond] = (len(prices),
                          Decimal("%3.2f" % q_min),
                          Decimal("%3.2f" % p25),
                          Decimal("%3.2f" % med),
                          Decimal("%3.2f" % p75),
                          Decimal("%3.2f" % q_max))
        else:
            # Single offer: only the median slot carries a value.
            dist[cond] = (1, "", "", Decimal("%3.2f" % prices[0]), "", "")

    self._distribution_half = dist
    return dist
def get_seq_feature(seq, seq_name, user_id):
    """Build a one-row DataFrame of summary statistics for a numeric sequence.

    Columns produced: '<seq_name>_mean', '_median', '_max', '_min', '_var',
    '_std', '_upquantile' (p=0.75), '_downquantile' (p=0.25), '_discrete'
    (coefficient of variation), '<seq_name>skew' and '<seq_name>kurt'
    (no underscore — kept as-is so downstream column names don't change),
    plus 'user_id'.

    Args:
        seq: non-empty sequence of numbers.
        seq_name: prefix for the generated column names.
        user_id: identifier stored in the 'user_id' column.

    Returns:
        A single-row pd.DataFrame, or None if seq is empty.
    """
    # total 11 features
    if not seq:
        print('seq is empty!')
        return
    df = pd.DataFrame()
    df[seq_name + '_mean'] = [np.mean(seq)]
    df[seq_name + '_median'] = [np.median(seq)]
    df[seq_name + '_max'] = [np.max(seq)]
    df[seq_name + '_min'] = [np.min(seq)]
    df[seq_name + '_var'] = [np.var(seq)]
    df[seq_name + '_std'] = [np.std(seq)]
    if len(seq) == 1:
        # sts.quantile needs more than one point; fall back to fixed values.
        df[seq_name + '_upquantile'] = seq[0]
        df[seq_name + '_downquantile'] = 0
    else:
        df[seq_name + '_upquantile'] = [sts.quantile(seq, p=0.75)]
        df[seq_name + '_downquantile'] = [sts.quantile(seq, p=0.25)]
    # Coefficient of variation is undefined for zero mean.
    if np.mean(seq) != 0:
        df[seq_name + '_discrete'] = [np.std(seq) / np.mean(seq)]
    else:
        df[seq_name + '_discrete'] = [np.NaN]
    # Fixed: bare `except:` replaced with `except Exception` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        df[seq_name + 'skew'] = [sts.skewness(seq)]
    except Exception:
        df[seq_name + 'skew'] = [np.NaN]
    try:
        df[seq_name + 'kurt'] = [sts.kurtosis(seq)]
    except Exception:
        df[seq_name + 'kurt'] = [np.NaN]
    df['user_id'] = [user_id]
    return df
def data_description(index, start, end):
    """Print descriptive statistics for the return series of *index*
    between *start* and *end*, then plot its empirical distribution.
    """
    returns = download_data.get_returns(index, start, end)

    # Central tendency.
    ret_mean = np.mean(returns)
    print('个数:', len(returns))
    print('平均值:', ret_mean)
    print('中位数:', np.median(returns))
    quart_low = sts.quantile(returns, p=0.25)
    quart_high = sts.quantile(returns, p=0.75)
    print('上四分位数', quart_low)
    print('下四分位数', quart_high)

    # Dispersion.
    ret_max = np.max(returns)
    ret_min = np.min(returns)
    print('最大值:', ret_max)
    print('最小值:', ret_min)
    print('极差:', ret_max - ret_min)
    print('四分位差', quart_high - quart_low)
    ret_std = np.std(returns)
    print('标准差:', ret_std)
    print('方差:', np.var(returns))
    print('离散系数:', ret_std / ret_mean)

    # Shape: skewness / kurtosis, plus a normality test.
    print('偏度:', sts.skewness(returns))
    print('峰度:', sts.kurtosis(returns))
    print(st.kstest(returns, 'norm'))

    length = len(returns)  # kept from the original; currently unused

    sns.distplot(returns, bins=100, label='Empirical')
    # NOTE(review): sns.plt was removed in modern seaborn — confirm the
    # installed version still exposes it.
    sns.plt.legend()
    sns.plt.title('Empirical')
    sns.plt.show()
def extend_feature(scores):
    """Append summary statistics of the sliding-window features.

    Args:
        scores: the raw sliding-window feature list.

    Returns:
        A NEW list: the original scores followed by sum, mean, median,
        25%/75% quantiles, max, min, range, interquartile range, variance,
        coefficient of variation, skewness and kurtosis — all computed on
        the ORIGINAL window values.

    BUG FIX: the original did `features = scores`, which (a) mutated the
    caller's list in place and (b) computed each later statistic over a
    list already polluted by the statistics appended before it (e.g. the
    mean included the just-appended sum). Statistics are now taken over a
    snapshot of the input. NOTE(review): callers that relied on the
    in-place mutation should be checked.
    """
    base = list(scores)            # immutable snapshot of the raw window
    features = list(base)
    features.append(np.sum(base))                   # sum
    features.append(np.mean(base))                  # mean
    features.append(np.median(base))                # median
    # features.append(sts.mode(base))               # mode (disabled in original)
    q1 = sts.quantile(base, p=0.25)
    q3 = sts.quantile(base, p=0.75)
    features.append(q1)                             # lower quartile
    features.append(q3)                             # upper quartile
    features.append(np.max(base))                   # max
    features.append(np.min(base))                   # min
    features.append(np.max(base) - np.min(base))    # range
    features.append(q3 - q1)                        # interquartile range
    features.append(np.var(base))                   # variance
    features.append(np.std(base) / np.mean(base))   # coefficient of variation
    features.append(sts.skewness(base))             # skewness
    features.append(sts.kurtosis(base))             # kurtosis
    return features
def test_quantile(self):
    """The 0.2 quantile of [1..5] is the value 2."""
    sample = [1, 2, 3, 4, 5]
    observed = stats.quantile(sample, 0.2)
    self.assertEqual(observed, 2)
def test_quantile_wrong_type(self):
    """A non-float p must raise TypeError with the documented message."""
    with self.assertRaises(TypeError) as raised_exception:
        stats.quantile([2, 3, 4], 3)
    message = raised_exception.exception.args[0]
    self.assertEqual(message, "p must be float")
def test_quantile_wrong_range_p(self):
    """A float p outside (0,1) must raise ValueError with the documented message."""
    with self.assertRaises(ValueError) as raised_exception:
        stats.quantile([2, 3, 4], 3.0)
    message = raised_exception.exception.args[0]
    self.assertEqual(message, "p must be in range (0,1)")
#!/usr/bin/python26
# encoding=utf-8
"""
Outlier detection via Tukey's test: fences are drawn k*IQR beyond the
quartiles (k=1.5 for mild outliers, k=3 for extreme outliers).
"""
import numpy as np
import stats as sts

# Fixed: renamed from `list`, which shadowed the builtin.
samples = [1, 4, 8, 90, 98, 44, 35, 56, 2, 41, 11, 24, 23, 45, 500, 150]
print(samples)

# Quartiles.
print('下四分位数:', sts.quantile(samples, p=0.25))
print('上四分位数:', sts.quantile(samples, p=0.75))
q1 = sts.quantile(samples, p=0.25)
q3 = sts.quantile(samples, p=0.75)

# k=1.5: mild-outlier fences.
k1 = 1.5
g_min_m = q1 - k1 * (q3 - q1)
g_max_m = q3 + k1 * (q3 - q1)

# k=3: extreme-outlier fences.
k2 = 3
g_min_b = q1 - k2 * (q3 - q1)
g_max_b = q3 + k2 * (q3 - q1)
# g_min_b, g_min_m, g_max_m, g_max_b
import numpy as np
import stats as sts

data = [1, 2, 2, 3]

# Measures of central tendency.
print('求和:', np.sum(data))
print('个数:', len(data))
data_mean = np.mean(data)
print('平均值:', data_mean)
print('中位数:', np.median(data))
print('众数:', sts.mode(data))
quart_first = sts.quantile(data, p=0.25)
quart_third = sts.quantile(data, p=0.75)
print('上四分位数', quart_first)
print('下四分位数', quart_third)

# Measures of dispersion.
print('最大值:', np.max(data))
print('最小值:', np.min(data))
print('极差:', np.max(data) - np.min(data))
print('四分位差', quart_third - quart_first)
data_std = np.std(data)
print('标准差:', data_std)
print('方差:', np.var(data))
print('变异系数:', data_std / data_mean)

# Skewness and kurtosis.
print('偏度:', sts.skewness(data))
print('峰度:', sts.kurtosis(data))

# Draw two random samples and take their means.
x = np.random.randint(0, 9, 1000)
y = np.random.randint(0, 9, 1000)
mx = x.mean()
my = y.mean()
def main():
    """Render a per-target ROC scatter plot for every roc*.txt in <roc_dir>,
    then one combined figure of the curves at representative AUC quantiles
    (5/33/50/67/95%).
    """
    usage = 'usage: %prog [options] <roc_dir>'
    parser = OptionParser(usage)
    parser.add_option('-t', dest='targets_file')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide ROC points file')
    else:
        roc_dir = args[0]

    # read target labels
    # NOTE(review): open() handles in these comprehensions are never closed;
    # acceptable for a short-lived script, but consider `with open(...)`.
    if options.targets_file:
        target_labels = [line.split()[0] for line in open(options.targets_file)]
    else:
        target_labels = ['Target %d' % (ti + 1) for ti in range(len(glob.glob('%s/roc*.txt' % roc_dir)))]

    #######################################################
    # make all ROC plots
    #######################################################
    target_fpr = []
    target_tpr = []
    for roc_file in glob.glob('%s/roc*.txt' % roc_dir):
        # target index is encoded in the filename: roc<N>.txt -> N-1
        ti = int(roc_file[roc_file.find('roc') + 3:-4]) - 1

        target_fpr.append([])
        target_tpr.append([])
        for line in open(roc_file):
            a = line.split()
            target_fpr[-1].append(float(a[0]))
            target_tpr[-1].append(float(a[1]))

        plt.figure(figsize=(6, 6))
        plt.scatter(target_fpr[-1], target_tpr[-1], s=8, linewidths=0, c=sns_colors[0])
        plt.title(target_labels[ti])
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.xlim((0, 1))
        plt.ylim((0, 1))
        plt.grid(True)
        plt.tight_layout()
        out_pdf = '%s.pdf' % os.path.splitext(roc_file)[0]
        plt.savefig(out_pdf)
        plt.close()

    #######################################################
    # multiple ROC curve plot
    #######################################################
    # read AUCs
    target_aucs = [float(line.split()[1]) for line in open('%s/aucs.txt' % roc_dir)]

    # choose targets at representative AUC quantiles
    auc_targets = [(target_aucs[ti], ti) for ti in range(len(target_aucs))]
    auc_targets.sort()
    fig_quants = [0.05, .33, 0.5, .67, .95]
    auc_target_quants = quantile(auc_targets, fig_quants)

    # plot
    sns.set(style='white', font_scale=1.2)
    plt.figure(figsize=(6, 6))

    si = 0
    for auc, ti in auc_target_quants:
        target_label = '%-9s AUC: %.3f' % (target_labels[ti], target_aucs[ti])
        plt.plot(target_fpr[ti], target_tpr[ti], c=sns_colors[si], label=target_label, linewidth=2.5, alpha=0.8)
        si += 1

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.xlim((0, 1))
    plt.ylim((0, 1))

    ax = plt.gca()
    ax.xaxis.label.set_fontsize(17)
    ax.yaxis.label.set_fontsize(17)
    # BUG FIX: map() returns a lazy iterator in Python 3, so the fontsize
    # lambdas never executed; use explicit loops for side effects.
    for xl in ax.get_xticklabels():
        xl.set_fontsize(13)
    for yl in ax.get_yticklabels():
        yl.set_fontsize(13)
    ax.grid(True, linestyle=':')

    plt.tight_layout()
    matplotlib.rcParams.update({'font.family': 'monospace'})
    plt.legend(loc=4, fontsize=12)
    plt.savefig('%s/range.pdf' % roc_dir)
    plt.close()
# Fill in the remaining walking trials; `data` is indexed as
# data[sample, axis, trial] — presumably (480, 6, 69); TODO confirm.
data[:, :, 65] = walking_12[0:480, 1:7]
data[:, :, 66] = walking_13[0:480, 1:7]
data[:, :, 67] = walking_14[0:480, 1:7]
data[:, :, 68] = walking_15[0:480, 1:7]

y = [[0] for row in range(480)]

# Seven summary features per (trial, axis):
# min, max, mean, median, std, 25% quantile, 75% quantile.
dataset = np.zeros((69, 7, 6))
for a in range(6):
    for i in range(69):
        dataset[i, 0, a] = data[:, a, i].min()
        dataset[i, 1, a] = data[:, a, i].max()
        dataset[i, 2, a] = data[:, a, i].mean()
        dataset[i, 3, a] = np.median(data[:, a, i])
        dataset[i, 4, a] = data[:, a, i].std()
        y = data[:, a, i].tolist()  # sts.quantile expects a plain list
        dataset[i, 5, a] = sts.quantile(y, p=0.25)
        dataset[i, 6, a] = sts.quantile(y, p=0.75)

# Keep min/max/mean of three selected axes for the scatter plots.
scatter_matrix = np.zeros((69, 3, 3))
scatter_matrix[:, :, 0] = dataset[:, 0:3, 0]
scatter_matrix[:, :, 1] = dataset[:, 0:3, 1]
scatter_matrix[:, :, 2] = dataset[:, 0:3, 5]

fig = plt.figure()
plt.title('Scatter Plots')
# BUG FIX: ax1 was used below but its creation was commented out,
# raising NameError at runtime.
ax1 = fig.add_subplot(991)
# NOTE(review): both scatter axes use the same column (x vs x) — confirm
# whether the second argument should be a different feature.
ax1.scatter(scatter_matrix[0:9, 0, 0].tolist(), scatter_matrix[0:9, 0, 0].tolist(), c='b', marker='.')
def main():
    """Plot differential-expression stats as a function of per-gene TE counts.

    For each sample pair in the .diff file and each TE family, genes are
    bucketed by how many TEs of that family they contain; the 25/50/75
    percentiles of the diff stat are plotted per bucket via an R script.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    parser.add_option('-c', dest='scale', default=1, type='float', help='Plot scale [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff'%os.environ['MASK'])
    parser.add_option('-r', dest='orientation', default=False, action='store_true', help='Split TEs by orientation [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')
    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Defafult: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Defafult: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # make output directory
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter for similar length
        if options.spread_factor:
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # BUG FIX: the upper bound previously passed options.spread_lower
        # twice, so the -u / upper half of -s filtering never took effect.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper, verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs -> occurence num
    gene_te_num = te.hash_genes_repeats_num(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=options.orientation)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    table_lines = []
    pvals = []

    for spair in gene_diffs:
        sample1, sample2 = spair

        # NOTE(review): gene_list is computed but never used below — the
        # bucketing loop iterates gene_diffs[spair] directly; confirm intent.
        gene_list = list(set(gene_te_num.keys()) & set(gene_diffs[spair].keys()))

        for fam in count_tes:
            if options.orientation:
                orients = ['+','-']
            else:
                orients = ['+']

            for orient in orients:
                # hash diff values by TE count
                count_diff = []
                for gene_id in gene_diffs[spair]:
                    if options.orientation:
                        count = gene_te_num.get(gene_id,{}).get(('*',fam,orient), 0)
                    else:
                        count = gene_te_num.get(gene_id,{}).get(('*',fam), 0)

                    while count >= len(count_diff):
                        count_diff.append([])
                    count_diff[count].append(gene_diffs[spair][gene_id])

                # quantiles per TE-count bucket; stop at the first sparse bucket
                df = {'TEs':[], 'stat_low':[], 'stat_mid':[], 'stat_hi':[]}
                for c in range(len(count_diff)):
                    if len(count_diff[c]) > 12:
                        stat_low, stat_mid, stat_hi = stats.quantile(count_diff[c], [.25, .5, .75])
                        df['TEs'].append(c)
                        df['stat_low'].append(stat_low)
                        df['stat_mid'].append(stat_mid)
                        df['stat_hi'].append(stat_hi)
                    else:
                        break

                if len(df['TEs']) > 1:
                    fam_plot = fam[fam.find('/')+1:]
                    if options.orientation:
                        out_pdf = '%s/%s-%s_%s_%s.pdf' % (options.out_dir, sample1, sample2, fam_plot, orient)
                        out_df = '%s/%s-%s_%s_%s.df' % (options.out_dir, sample1, sample2, fam_plot, orient)
                    else:
                        out_pdf = '%s/%s-%s_%s.pdf' % (options.out_dir, sample1, sample2, fam_plot)
                        out_df = '%s/%s-%s_%s.df' % (options.out_dir, sample1, sample2, fam_plot)

                    ggplot.plot('%s/te_diff_count.r' % os.environ['RDIR'], df, [out_pdf, options.scale], df_file=out_df)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        os.remove(spread_gtf)
import numpy as np
import stats as sts

a = [31, 24, 23, 25, 14, 25, 13, 12, 14, 23, 32, 34, 43, 41, 21, 23, 26, 26,
     34, 42, 43, 25, 24, 23, 24, 44, 23, 14, 52, 32, 42, 44, 35, 28, 17, 21,
     32, 42, 12, 34]
scores = np.array(a)

# Measures of central tendency.
print('總合為:', np.sum(scores))
print('筆數為:', len(scores))
mean_val = np.mean(scores)
print('平均值為:', mean_val)
print('中位數為:', np.median(scores))
print('眾數為:', sts.mode(scores))
print('上四分位數為', sts.quantile(scores, p=0.25))
print('下四分位數為', sts.quantile(scores, p=0.75))

# Measures of dispersion.
print('最大值:', np.max(scores))
print('最小值:', np.min(scores))
print('全距:', np.ptp(scores))
std_val = np.std(scores)
print('標準差:', std_val)
print('變異數:', np.var(scores))
print('離散係數:', std_val / mean_val)

# Shape: skewness and kurtosis.
print('偏態係數:', sts.skewness(scores))
print('峰態係數:', sts.kurtosis(scores))
def main():
    """Probe pairwise filter interactions of a Basset model (Python 2).

    Plants every ordered pair of filter consensus motifs at fixed offsets
    around the center of a background sequence, scores the sequences with a
    Torch script, fits a Bayesian ridge model of additive filter effects,
    and plots the residual (observed - additive) as an interaction heat map.
    """
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='center_dist', default=10, type='int', help='Distance between the motifs and sequence center [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='cuda', default=False, action='store_true', help='Run on the GPGPU [Default: %default]')
    parser.add_option('-l', dest='seq_length', default=600, type='int', help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file')
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(',')]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # seed for reproducibility
    random.seed(1)

    # torch options
    cuda_str = ''
    if options.cuda:
        cuda_str = '-cuda'

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    num_filters = len(filter_consensus)
    # num_filters = 40
    filter_len = filter_consensus[0].shape[1]

    # position the motifs symmetrically around the sequence center
    # NOTE(review): relies on Python 2 integer division; under Python 3
    # seq_length/2 yields a float and the slicing below would fail.
    left_i = options.seq_length/2 - options.center_dist - filter_len
    right_i = options.seq_length/2 + options.center_dist

    # uniform background: every nucleotide has probability 0.25
    ns_1hot = np.zeros((4,options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    # one sequence per ordered (i,j) filter pair
    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:,left_i:left_i+filter_len] = filter_consensus[i]
            motifs_seq[:,right_i:right_i+filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,options.seq_length))

    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = '%s/motif_seqs.h5' % options.out_dir
    h5f = h5py.File(seqs_file, 'w')
    h5f.create_dataset('test_in', data=seqs_1hot)
    h5f.close()

    # predict scores via the external Torch script
    scores_file = '%s/motif_seqs_scores.h5' % options.out_dir
    torch_cmd = 'th basset_place2_predict.lua %s %s %s %s' % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, 'r')
    motif_seq_scores = np.array(hdf5_in['scores'])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        # design matrix: separate indicators for left and right placement
        X = np.zeros((motif_seq_scores.shape[0],2*num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi,i] += 1
                X[xi,num_filters+j] += 1
                xi += 1

        # fit additive model of per-filter effects
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:,ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print model.score(X, motif_seq_scores[:,ti])

        # print filter coefficients
        coef_out = open('%s/coefs_t%d.txt' % (options.out_dir,ti), 'w')
        for i in range(num_filters):
            print >> coef_out, '%3d %6.2f' % (i,model.coef_[i])
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        # interaction = observed score minus additive expectation
        filter_interaction = np.zeros((num_filters,num_filters))
        table_out = open('%s/table_t%d.txt' % (options.out_dir,ti), 'w')

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i,j] = motif_seq_scores[si,ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si,ti], motif_seq_preds[si], filter_interaction[i,j])
                print >> table_out, '%3d %3d %6.3f %6.3f %6.3f' % cols
                si += 1

        table_out.close()

        # clip extremes at the 99.9th percentile of |interaction| for plotting
        scores_abs = abs(filter_interaction.flatten())
        max_score = stats.quantile(scores_abs, .999)
        print 'Limiting scores to +-%f' % max_score
        filter_interaction_max = np.zeros((num_filters, num_filters))
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction_max[i,j] = np.min([filter_interaction[i,j], max_score])
                filter_interaction_max[i,j] = np.max([filter_interaction_max[i,j], -max_score])

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction_max, xticklabels=False, yticklabels=False)
        plt.savefig('%s/heat_t%d.pdf' % (options.out_dir,ti))
def main():
    """Plot differential-expression stats as a function of per-gene TE counts.

    Duplicate (auto-formatted) variant of the te_diff script: for each
    sample pair and TE family, genes are bucketed by TE count and the
    25/50/75 percentiles of the diff stat are plotted per bucket.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option(
        '-o',
        dest='out_dir',
        default='te_diff_regress',
        help='Output directory to print regression summaries [Default: %default]')
    parser.add_option('-c',
                      dest='scale',
                      default=1,
                      type='float',
                      help='Plot scale [Default: %default]')
    parser.add_option('-t',
                      dest='te_gff',
                      default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-r',
                      dest='orientation',
                      default=False,
                      action='store_true',
                      help='Split TEs by orientation [Default: %default]')
    parser.add_option('-m',
                      dest='max_stat',
                      default=None,
                      type='float',
                      help='Maximum stat for plotting [Default: %default]')
    parser.add_option(
        '-s',
        dest='spread_factor',
        default=None,
        type='float',
        help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option(
        '-l',
        dest='spread_lower',
        default=None,
        type='float',
        help='Allow multiplicative factor between median and shortest transcripts [Defafult: %default]')
    parser.add_option(
        '-u',
        dest='spread_upper',
        default=None,
        type='float',
        help='Allow multiplicative factor between median and longest transcripts [Defafult: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # make output directory
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter for similar length
        if options.spread_factor:
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # BUG FIX: the upper bound previously passed options.spread_lower
        # twice, so -u / the upper half of -s never took effect.
        gff.length_filter(ref_gtf,
                          spread_gtf,
                          options.spread_lower,
                          options.spread_upper,
                          verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs -> occurence num
    gene_te_num = te.hash_genes_repeats_num(ref_gtf,
                                            options.te_gff,
                                            gene_key='transcript_id',
                                            add_star=True,
                                            stranded=options.orientation)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file,
                                    stat='fold',
                                    max_stat=options.max_stat,
                                    sample_first='input')

    table_lines = []
    pvals = []

    for spair in gene_diffs:
        sample1, sample2 = spair

        # NOTE(review): gene_list is never used below — the bucketing loop
        # iterates gene_diffs[spair] directly; confirm intent.
        gene_list = list(
            set(gene_te_num.keys()) & set(gene_diffs[spair].keys()))

        for fam in count_tes:
            if options.orientation:
                orients = ['+', '-']
            else:
                orients = ['+']

            for orient in orients:
                # hash diff values by TE count
                count_diff = []
                for gene_id in gene_diffs[spair]:
                    if options.orientation:
                        count = gene_te_num.get(gene_id, {}).get(
                            ('*', fam, orient), 0)
                    else:
                        count = gene_te_num.get(gene_id, {}).get(('*', fam), 0)

                    while count >= len(count_diff):
                        count_diff.append([])
                    count_diff[count].append(gene_diffs[spair][gene_id])

                # quantiles per TE-count bucket; stop at the first sparse one
                df = {'TEs': [], 'stat_low': [], 'stat_mid': [], 'stat_hi': []}
                for c in range(len(count_diff)):
                    if len(count_diff[c]) > 12:
                        stat_low, stat_mid, stat_hi = stats.quantile(
                            count_diff[c], [.25, .5, .75])
                        df['TEs'].append(c)
                        df['stat_low'].append(stat_low)
                        df['stat_mid'].append(stat_mid)
                        df['stat_hi'].append(stat_hi)
                    else:
                        break

                if len(df['TEs']) > 1:
                    fam_plot = fam[fam.find('/') + 1:]
                    if options.orientation:
                        out_pdf = '%s/%s-%s_%s_%s.pdf' % (options.out_dir,
                                                          sample1, sample2,
                                                          fam_plot, orient)
                        out_df = '%s/%s-%s_%s_%s.df' % (options.out_dir,
                                                        sample1, sample2,
                                                        fam_plot, orient)
                    else:
                        out_pdf = '%s/%s-%s_%s.pdf' % (
                            options.out_dir, sample1, sample2, fam_plot)
                        out_df = '%s/%s-%s_%s.df' % (options.out_dir, sample1,
                                                     sample2, fam_plot)

                    ggplot.plot('%s/te_diff_count.r' % os.environ['RDIR'], df,
                                [out_pdf, options.scale],
                                df_file=out_df)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        os.remove(spread_gtf)
def main():
    """Render per-target ROC scatter plots for every roc*.txt in <roc_dir>,
    then a combined figure of curves at representative AUC quantiles
    (5/33/50/67/95%). Auto-formatted variant of the ROC plotting script.
    """
    usage = "usage: %prog [options] <roc_dir>"
    parser = OptionParser(usage)
    parser.add_option("-t", dest="targets_file")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide ROC points file")
    else:
        roc_dir = args[0]

    # read target labels
    # NOTE(review): open() handles in these comprehensions are never closed;
    # fine for a short-lived script, but consider `with open(...)`.
    if options.targets_file:
        target_labels = [line.split()[0] for line in open(options.targets_file)]
    else:
        target_labels = ["Target %d" % (ti + 1) for ti in range(len(glob.glob("%s/roc*.txt" % roc_dir)))]

    #######################################################
    # make all ROC plots
    #######################################################
    target_fpr = []
    target_tpr = []
    for roc_file in glob.glob("%s/roc*.txt" % roc_dir):
        # target index is encoded in the filename: roc<N>.txt -> N-1
        ti = int(roc_file[roc_file.find("roc") + 3 : -4]) - 1

        target_fpr.append([])
        target_tpr.append([])
        for line in open(roc_file):
            a = line.split()
            target_fpr[-1].append(float(a[0]))
            target_tpr[-1].append(float(a[1]))

        plt.figure(figsize=(6, 6))
        plt.scatter(target_fpr[-1], target_tpr[-1], s=8, linewidths=0, c=sns_colors[0])
        plt.title(target_labels[ti])
        plt.xlabel("False positive rate")
        plt.ylabel("True positive rate")
        plt.xlim((0, 1))
        plt.ylim((0, 1))
        plt.grid(True)
        plt.tight_layout()
        out_pdf = "%s.pdf" % os.path.splitext(roc_file)[0]
        plt.savefig(out_pdf)
        plt.close()

    #######################################################
    # multiple ROC curve plot
    #######################################################
    # read AUCs
    target_aucs = [float(line.split()[1]) for line in open("%s/aucs.txt" % roc_dir)]

    # choose targets at representative AUC quantiles
    auc_targets = [(target_aucs[ti], ti) for ti in range(len(target_aucs))]
    auc_targets.sort()
    fig_quants = [0.05, 0.33, 0.5, 0.67, 0.95]
    auc_target_quants = quantile(auc_targets, fig_quants)

    # plot
    sns.set(style="white", font_scale=1.2)
    plt.figure(figsize=(6, 6))

    si = 0
    for auc, ti in auc_target_quants:
        target_label = "%-9s AUC: %.3f" % (target_labels[ti], target_aucs[ti])
        plt.plot(target_fpr[ti], target_tpr[ti], c=sns_colors[si], label=target_label, linewidth=2.5, alpha=0.8)
        si += 1

    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    plt.xlim((0, 1))
    plt.ylim((0, 1))

    ax = plt.gca()
    ax.xaxis.label.set_fontsize(17)
    ax.yaxis.label.set_fontsize(17)
    # BUG FIX: map() returns a lazy iterator in Python 3, so the fontsize
    # lambdas never executed; use explicit loops for side effects.
    for xl in ax.get_xticklabels():
        xl.set_fontsize(13)
    for yl in ax.get_yticklabels():
        yl.set_fontsize(13)
    ax.grid(True, linestyle=":")

    plt.tight_layout()
    matplotlib.rcParams.update({"font.family": "monospace"})
    plt.legend(loc=4, fontsize=12)
    plt.savefig("%s/range.pdf" % roc_dir)
    plt.close()
import numpy as np
import stats as sts

scares = [
    31, 24, 23, 25, 14, 25, 13, 12, 14, 23, 32, 34, 43, 41, 21, 23, 26, 26,
    34, 42, 43, 25, 24, 23, 24, 44, 23, 14, 52, 32, 42, 44, 35, 28, 17, 21,
    32, 42, 12, 34
]

# Measures of central tendency.
print('求和:', np.sum(scares))
print('個數:', len(scares))
print('平均值:', np.mean(scares))
print('中位數:', np.median(scares))
print('眾數:', sts.mode(scares))
print('上四分位數:', sts.quantile(scares, p=0.25))
print('下四分位數:', sts.quantile(scares, p=0.75))

# Measures of dispersion.
print('最大值:', np.max(scares))
print('最小值:', np.min(scares))
# BUG FIX: the range (極差) line printed np.std(scares), duplicating the
# standard-deviation line below; the range is max - min.
print('極差:', np.max(scares) - np.min(scares))
print('四分位數:', sts.quantile(scares, p=0.75), sts.quantile(scares, p=0.25))
print('標準差:', np.std(scares))
print('方差', np.var(scares))
print('離散係數', np.std(scares) / np.mean(scares))

# Shape: skewness and kurtosis.
# NOTE(review): the label 遍度 looks like a typo for 偏度 (skewness);
# left unchanged to preserve output.
print('遍度:', sts.skewness(scares))
print('峰度:', sts.kurtosis(scares))
def quantile2(self, data):
    """Print the p=0.75 quantile of *data*.

    NOTE(review): the label reads 下四分位数 (lower quartile) while p=0.75
    is the upper quartile — kept as-is to preserve output; confirm intent.
    """
    value = sts.quantile(data, p=0.75)
    print('下四分位数:', value)
# Smoke test of the local <stats> module: prints each statistic of a
# fixed sample vector A alongside its label for manual inspection.
print("\n\n")
print("*** Test Module <stats> ***")

# fixed sample vector (unsorted on purpose; sorted view printed below)
A = [1, 3, 5, 7, 9, 2, 3, 4, 4, 4, 6, 8, 10, 13, 15, 17]
print("vector A = ", A)
print("sorted A = ", sorted(A))

mean = st.mean(A)
print("A's mean = ", mean)

median = st.median(A)
print("A's median = ", median)

# two quantile probes: 20% and 90% (the name `quantile` is reused)
quantile = st.quantile(A, 0.2)
print("A's 20% quantile = ", quantile)
quantile = st.quantile(A, 0.9)
print("A's 90% quantile = ", quantile)

mode = st.mode(A)
print("A's mode = ", mode)

data_range = st.data_range(A)
print("A's range = ", data_range)

variance = st.variance(A)
print("A's variance = ", variance)

# NOTE(review): chunk ends here — the matching print presumably follows
# outside this view.
standard_deviation = st.standard_deviation(A)
def interquartile_range(self, data):
    """Print the interquartile range (Q3 - Q1) of *data*."""
    q3 = sts.quantile(data, p=0.75)
    q1 = sts.quantile(data, p=0.25)
    print('四分位差:', q3 - q1)
# Summary statistics of the 身高 (height) column of df.
height = df['身高']
print('方差', height.var())
print('标准差', height.std())
print('极差', height.max() - height.min())
print('偏度', height.skew())
print('峰度', height.kurt())

import numpy as np
import stats as sts

scores = [1, 2, 2, 2, 5]

# Measures of central tendency.
print('求和:', np.sum(scores))
print('个数:', len(scores))
print('平均值:', np.mean(scores))
print('中位数:', np.median(scores))
print('众数:', sts.mode(scores))
lower_q = sts.quantile(scores, p=0.25)
upper_q = sts.quantile(scores, p=0.75)
print('上四分位数', lower_q)
print('下四分位数', upper_q)

# Measures of dispersion.
print('最大值:', np.max(scores))
print('最小值:', np.min(scores))
print('极差:', np.max(scores) - np.min(scores))
print('四分位差', upper_q - lower_q)
print('标准差:', np.std(scores))
print('方差:', np.var(scores))
print('离散系数:', np.std(scores) / np.mean(scores))

# Skewness and kurtosis.
print('偏度:', sts.skewness(scores))
print('峰度:', sts.kurtosis(scores))
def quantile1(self, data):
    """Print the p=0.25 quantile of *data*.

    NOTE(review): labelled 上四分位数 though p=0.25 is the lower quartile —
    preserved to keep output identical; confirm intent.
    """
    value = sts.quantile(data, p=0.25)
    print('上四分位数:', value)