def adjudicate_BAF(metrics, labeler, name):
    """Run BAF random-forest adjudication for deletions and duplications.

    Args:
        metrics: DataFrame of per-variant evidence metrics. Must provide
            svtype, svsize, chrom, poor_region_cov, is_outlier_specific and
            the BAF feature columns referenced below.
        labeler: training labeler exposing a ``label(DataFrame)`` method.
        name: prefix used for every output text file.

    Returns:
        DataFrame of combined DEL/DUP cutoffs annotated with
        test/size/algtype metadata columns.

    Side effects:
        Writes ``{name}_{svtype}_trainable.txt``, ``{name}_{svtype}_testable.txt``
        (svtype in DEL/DUP) and ``{name}_cutoffs.txt`` to the working directory.
    """

    def _classify_cnv(svtype, features, cutoff_features):
        # Shared DEL/DUP pipeline: testable = calls of this type >= 5 kb;
        # trainable = the autosomal, non-outlier subset in well-covered
        # regions. .copy() prevents pandas chained-assignment warnings when
        # the 'label' column is added to the slice.
        testable = metrics.loc[(metrics.svtype == svtype) &
                               (metrics.svsize >= 5000)]
        trainable = testable.loc[(testable.poor_region_cov < 0.3) &
                                 ~testable.chrom.isin(ALLOSOMES) &
                                 ~testable.is_outlier_specific].copy()
        trainable['label'] = labeler.label(trainable)
        trainable.to_csv('{0}_{1}_trainable.txt'.format(name, svtype),
                         index=False, sep='\t')
        testable.to_csv('{0}_{1}_testable.txt'.format(name, svtype),
                        index=False, sep='\t')
        return rf_classify(metrics, trainable, testable, features, labeler,
                           cutoff_features, name)

    # Deletions score on SNP ratio + deletion log-likelihood
    del_cutoffs = _classify_cnv(
        'DEL',
        ['BAF_snp_ratio', 'BAF_del_loglik'],
        {'indep': ['BAF_snp_ratio'], 'dep': ['BAF_del_loglik']})

    # Duplications score on the KS statistic + its log p-value
    dup_cutoffs = _classify_cnv(
        'DUP',
        ['BAF_KS_stat', 'BAF_KS_log_pval'],
        {'indep': ['BAF_KS_stat'], 'dep': ['BAF_KS_log_pval']})

    # Combine cutoffs. The file on disk is intentionally written before the
    # metadata columns below are added (matches the other adjudicators).
    del_cutoffs['svtype'] = 'DEL'
    dup_cutoffs['svtype'] = 'DUP'
    cutoffs = pd.concat([del_cutoffs, dup_cutoffs]).reset_index()
    cutoffs.to_csv('{0}_cutoffs.txt'.format(name), index=False, sep='\t')
    cutoffs['test'] = 'BAF'
    cutoffs['max_svsize'] = np.nan
    cutoffs['min_svsize'] = 5000
    cutoffs['algtype'] = 'any'
    return cutoffs
def adjudicate_SR1(metrics):
    """Run the SR1 (split-read) random-forest adjudication.

    Args:
        metrics: DataFrame of per-variant evidence metrics. Must provide
            name, chrom, poor_region_cov, is_outlier_specific and the
            SR_sum_* feature columns.

    Returns:
        DataFrame of SR1 cutoffs annotated with test/svtype/algtype columns.

    Side effects:
        Writes ``SR1_trainable.txt`` and ``SR1_cutoffs.txt`` to the working
        directory.
    """
    # Depth-only records carry no split-read evidence; exclude them.
    testable = metrics.loc[~metrics.name.str.contains('_depth_')]
    # Train on autosomal, non-outlier calls in well-covered regions.
    # .copy() prevents pandas chained-assignment warnings when the 'label'
    # column is added to the slice.
    trainable = testable.loc[(testable.poor_region_cov < 0.3) &
                             ~testable.chrom.isin(ALLOSOMES) &
                             ~testable.is_outlier_specific].copy()
    features = ['SR_sum_log_pval', 'SR_sum_bg_frac']
    # Named cutoff_features (not 'cutoffs') so the feature spec is not
    # shadowed by the result DataFrame below.
    cutoff_features = {'indep': ['SR_sum_log_pval'],
                       'dep': ['SR_sum_bg_frac']}
    labeler = labelers.SR1TrainingLabeler()
    trainable['label'] = labeler.label(trainable)
    trainable.to_csv('SR1_trainable.txt', index=False, sep='\t')
    cutoffs = rf_classify(metrics, trainable, testable, features, labeler,
                          cutoff_features, 'SR1_prob')
    # The file is intentionally written before the metadata columns below
    # are added (matches the other adjudicators).
    cutoffs.to_csv('SR1_cutoffs.txt', index=False, sep='\t')
    cutoffs['test'] = 'SR1'
    cutoffs['svtype'] = 'CNV'
    cutoffs['algtype'] = 'PESR'
    return cutoffs
def adjudicate_RD(metrics):
    """Run read-depth random-forest adjudication across four strata.

    Strata: PE/SR calls >= 1 kb, PE/SR calls < 1 kb, depth-only DELs and
    depth-only DUPs (each depth stratum trained on calls >= 5 kb).

    Args:
        metrics: DataFrame of per-variant evidence metrics. Must provide
            name, svtype, svsize, chrom, poor_region_cov,
            is_outlier_specific, the RD_* feature columns and (after
            classification) RD_prob. NOTE: mutated in place — see below.

    Returns:
        DataFrame of concatenated per-stratum cutoffs annotated with
        algtype/size/svtype/test columns.

    Side effects:
        Writes per-stratum trainable/testable text files plus
        ``RD_cutoffs.txt``; caps ``RD_prob`` at 0.499 in-place for
        depth-only calls below 5 kb.
    """
    features = ['RD_Median_Separation', 'RD_log_pval', 'RD_log_2ndMaxP']
    pesr_cutoff_features = {
        'indep': ['RD_log_pval', 'RD_Median_Separation'],
        'dep': ['RD_log_2ndMaxP']
    }
    labeler = labelers.RDTrainingLabeler()

    def _restrict_trainable(testable, min_size):
        # Training subset: calls at/above min_size, autosomal, non-outlier,
        # in well-covered regions. .copy() prevents pandas
        # chained-assignment warnings when 'label' is added later.
        return testable.loc[(testable.svsize >= min_size) &
                            (testable.poor_region_cov < 0.3) &
                            ~testable.chrom.isin(ALLOSOMES) &
                            ~testable.is_outlier_specific].copy()

    def _annotate(cutoffs, algtype, max_svsize, min_svsize, svtype):
        # Tag a cutoff table with the stratum it applies to. Column order
        # matches the original assignment order.
        cutoffs['algtype'] = algtype
        cutoffs['max_svsize'] = max_svsize
        cutoffs['min_svsize'] = min_svsize
        cutoffs['svtype'] = svtype
        return cutoffs

    cutoff_dfs = []
    # The name column is never modified below, so compute this mask once.
    is_depth = metrics.name.str.contains('_depth_')

    # PE/SR callers, >= 1 kb (trained only on >= 5 kb calls).
    # NOTE(review): the filenames say "gt5kb"/"lt5kb" while the testable
    # split is actually at 1 kb; kept byte-identical since downstream
    # tooling expects these exact names — confirm before renaming.
    testable = metrics.loc[~is_depth & (metrics.svsize >= 1000)]
    trainable = _restrict_trainable(testable, 5000)
    testable.to_csv('RD_pesr_gt5kb_testable.txt', index=False, sep='\t')
    trainable['label'] = labeler.label(trainable)
    trainable.to_csv('RD_pesr_gt5kb_trainable.txt', index=False, sep='\t')
    cutoffs = rf_classify(metrics, trainable, testable, features, labeler,
                          pesr_cutoff_features, 'RD_prob')
    cutoff_dfs.append(_annotate(cutoffs, 'PESR', np.nan, 1000, 'CNV'))

    # PE/SR callers, < 1 kb (trained on >= 100 bp calls)
    testable = metrics.loc[~is_depth & (metrics.svsize < 1000)]
    trainable = _restrict_trainable(testable, 100)
    testable.to_csv('RD_pesr_lt5kb_testable.txt', index=False, sep='\t')
    trainable['label'] = labeler.label(trainable)
    trainable.to_csv('RD_pesr_lt5kb_trainable.txt', index=False, sep='\t')
    cutoffs = rf_classify(metrics, trainable, testable, features, labeler,
                          pesr_cutoff_features, 'RD_prob')
    cutoff_dfs.append(_annotate(cutoffs, 'PESR', 1000, 0, 'CNV'))

    # Depth-only calls: no dependent cutoff feature; cutoffs get cleaned.
    depth_cutoff_features = {
        'indep': ['RD_log_pval', 'RD_Median_Separation'],
        'dep': []
    }
    for svtype in ('DEL', 'DUP'):
        testable = metrics.loc[is_depth & (metrics.svtype == svtype)]
        trainable = _restrict_trainable(testable, 5000)
        trainable['label'] = labeler.label(trainable)
        trainable.to_csv('RD_depth_{0}_trainable.txt'.format(svtype),
                         index=False, sep='\t')
        cutoffs = rf_classify(metrics, trainable, testable, features,
                              labeler, depth_cutoff_features, 'RD_prob',
                              clean_cutoffs=True)
        cutoff_dfs.append(_annotate(cutoffs, 'Depth', np.nan, 5000, svtype))

    # Fail depth-only calls below 5 kb by capping RD_prob just under the
    # 0.5 pass threshold. This mutates the caller's `metrics` in place.
    metrics.loc[is_depth & (metrics.svsize < 5000) &
                (metrics.RD_prob >= 0.5), 'RD_prob'] = 0.499

    # As in the other adjudicators, the file is intentionally written
    # before the 'test' column is added.
    cutoffs = pd.concat(cutoff_dfs)
    cutoffs.to_csv('RD_cutoffs.txt', index=False, sep='\t')
    cutoffs['test'] = 'RD'
    return cutoffs