def linear_model(snps, phenotypes, cofactors=None):
    """
    Run a standard linear model F-test scan over the given SNPs.

    snps: the markers handed to LinearModel.fast_f_test.
    phenotypes: the phenotype values the LinearModel is built from.
    cofactors: optional iterable of covariates; each one is added to the
        model with add_factor before the scan.

    Returns whatever LinearModel.fast_f_test returns.
    """
    lm = LinearModel(phenotypes)
    # Compare against None instead of relying on truthiness: a numpy array
    # holding several cofactors has no unambiguous truth value and would raise.
    if cofactors is not None:
        for cofactor in cofactors:
            lm.add_factor(cofactor)
    log.info("Running a standard linear model")
    t = Timer()
    res = lm.fast_f_test(snps)
    log.info('Took: %s' % t.stop(True))
    return res
def anova(snps, phenotypes):
    """
    Run an ANOVA F-test over the given SNPs using a plain LinearModel.

    Returns whatever LinearModel.anova_f_test returns.
    """
    model = LinearModel(phenotypes)
    log.info("Running ANOVA")
    timer = Timer()
    results = model.anova_f_test(snps)
    elapsed = timer.stop(True)
    log.info('Took: %s' % elapsed)
    return results
def mm_lrt_test(y, K):
    """
    Likelihood ratio test for whether a mixed model with the random effect K
    fits the data (y) significantly better than a plain linear model.

    The statistic is D = 2 * (ll1 - ll0), where ll0 is the log-likelihood of
    the LinearModel and ll1 is the maximum log-likelihood ('max_ll') reported
    by LinearMixedModel.get_ML().  The p-value is the chi-square survival
    function of D with one degree of freedom.

    Returns a dict with keys 'pval' and 'lrt_stat'.
    """
    # The chi-square distribution lives in scipy.stats, not in the top-level
    # scipy namespace, so import it explicitly here.
    from scipy import stats

    lm = LinearModel(y)
    lmm = LinearMixedModel(y)
    lmm.add_random_effect(K)
    lmm_res = lmm.get_ML()
    ll0 = lm.get_ll()
    ll1 = lmm_res['max_ll']
    D = 2 * (ll1 - ll0)
    pval = stats.chi2.sf(D, 1)
    return {'pval': pval, 'lrt_stat': D}
def lm_step_wise(phenotypes, sd=None, num_steps=10, file_prefix=None, forward_backwards=True, local=False,
                 cand_gene_list=None, plot_xaxis=True, with_qq_plots=True, sign_threshold=None, log_qq_max_val=5,
                 highlight_loci=None, save_pvals=False, markersize=3, chrom_col_map=None, **kwargs):
    """
    Run simple step-wise linear model forward-backward.

    Forward phase: for num_steps iterations, scan all remaining SNPs, add the
    SNP with the smallest p-value to the model as a cofactor and remove it
    from the set of scanned SNPs.  A final scan is done with all cofactors in
    the model.  Backward phase (if forward_backwards): repeatedly drop the
    cofactor with the smallest F-statistic until a single cofactor remains.
    Model selection criteria are collected at every step and handed to
    _analyze_opt_criterias_.

    phenotypes: phenotype values the LinearModel is built from.
    sd: optional SNP data object.  If given, snps, positions, chromosomes,
        macs and mafs are read from it; otherwise they must be supplied as
        keyword arguments with those names.
    num_steps: number of forward steps.
    file_prefix: if set, plots are generated using this prefix.
    local: if true, QQ plots are switched off.
    sign_threshold: significance threshold; defaults to 1 / (20 * num_snps).
    save_pvals: if true, the p-values of each scan are stored in the step info.
    The remaining arguments are passed through to the plotting helpers.

    Returns a dict with keys 'step_info_list', 'first_lm_res' and 'opt_dict'.
    """
    if local:
        with_qq_plots = False

    if sd:
        kwargs['snps'] = sd.getSnps()
        kwargs['positions'] = sd.getPositions()
        kwargs['chromosomes'] = sd.get_chr_list()
        d = sd.get_mafs()
        # NOTE(review): the 'mafs' entry is stored as macs and 'marfs' as mafs;
        # presumably counts vs. relative frequencies -- confirm against get_mafs.
        kwargs['macs'] = d['mafs']
        kwargs['mafs'] = d['marfs']

    # Work on copies, since entries are deleted as SNPs become cofactors.
    snps = kwargs['snps'][:]
    positions = kwargs['positions'][:]
    chromosomes = kwargs['chromosomes'][:]
    mafs = kwargs['mafs'][:]
    macs = kwargs['macs'][:]

    # Materialise the pairs so that entries can be deleted by index.
    chr_pos_list = list(zip(chromosomes, positions))
    lm = LinearModel(phenotypes)
    num_snps = len(snps)

    if not sign_threshold:  # Then use Bonferroni threshold
        sign_threshold = 1.0 / (num_snps * 20.0)

    print("Running step-wise LM")
    s1 = time.time()
    step_info_list = []
    cofactors = []  # A list of the loci found, together with their statistics.
    cofactor_snps = []
    step_i = 0
    num_par = 2  # mean and variance scalar

    step_fmt = '\nStep %d: action=%s, num_par=%d, ll=%0.2f, rss=%0.2f, bic=%0.2f, extended_bic=%0.2f, modified_bic=%0.2f'

    # Keyword arguments shared by every Manhattan/QQ plot call.  The list
    # objects are only ever modified in place below, so the references stay valid.
    plot_opts = dict(positions=positions, chromosomes=chromosomes, mafs=mafs, macs=macs, plot_bonferroni=True,
                     highlight_markers=cofactors, cand_genes=cand_gene_list, plot_xaxis=plot_xaxis,
                     log_qq_max_val=log_qq_max_val, with_qq_plots=with_qq_plots, highlight_loci=highlight_loci,
                     write_pvals=save_pvals, markersize=markersize, chrom_col_map=chrom_col_map)

    rss = lm.get_rss()
    ll = lm.get_ll(rss)
    criterias = {'ebics': [], 'mbics': [], 'bonf': [], 'mbonf': []}
    (bic, extended_bic, modified_bic) = _calc_bic_(ll, num_snps, num_par, lm.n)  # Calculate the BICs
    criterias['ebics'].append(extended_bic)
    criterias['mbics'].append(modified_bic)
    max_cofactor_pval = 0
    criterias['mbonf'].append(max_cofactor_pval)
    criterias['bonf'].append(0)
    action = 'None'
    print(step_fmt % (step_i, action, num_par, ll, rss, bic, extended_bic, modified_bic))
    print('Cofactors: %s' % (_cofactors_to_string_(cofactors),))

    quantiles_dict = {'log': [], 'norm': [], 'labels': []}

    # Stays None only if no forward step is run; filled in after the loop then.
    first_lm_res = None

    for step_i in range(1, num_steps + 1):
        lm_res = lm.fast_f_test(snps)
        if step_i == 1:
            first_lm_res = lm_res
        min_pval_i = sp.argmin(lm_res['ps'])
        min_pval = lm_res['ps'][min_pval_i]
        min_pval_chr_pos = chr_pos_list[min_pval_i]
        print('Min p-value: %s' % (min_pval,))
        criterias['bonf'].append(min_pval)
        # The step info describes the model *before* the new cofactor is added.
        step_info = {'rss': rss, 'll': ll, 'bic': bic, 'e_bic': extended_bic, 'm_bic': modified_bic,
                     'mbonf': max_cofactor_pval, 'cofactors': [tuple(c) for c in cofactors],
                     'cofactor_snps': cofactor_snps[:], 'min_pval': min_pval,
                     'min_pval_chr_pos': min_pval_chr_pos}

        lm_pvals = lm_res['ps'].tolist()

        # Plot gwas results per step
        if file_prefix:
            _plot_manhattan_and_qq_(file_prefix, step_i - 1, lm_pvals, quantiles_dict, **plot_opts)
        if save_pvals:
            step_info['ps'] = lm_pvals

        step_info['kolmogorov_smirnov'] = agr.calc_ks_stats(lm_pvals)
        step_info['pval_median'] = agr.calc_median(lm_pvals)
        print('%s %s' % (step_info['kolmogorov_smirnov'], step_info['pval_median']))
        step_info_list.append(step_info)

        # Adding the new SNP as a cofactor
        lm.add_factor(snps[min_pval_i])
        cofactor_snps.append(snps[min_pval_i])
        rss = lm.get_rss()
        ll = lm.get_ll(rss)
        num_par += 1
        action = '+'

        # The third entry starts as the raw scan p-value and is replaced just
        # below by the -log10 of the re-estimated p-value.
        cofactors.append([min_pval_chr_pos[0], min_pval_chr_pos[1], min_pval])

        # Re-estimate the p-value of each cofactor, conditional on the others.
        cofactor_pvals = []
        for i, snp in enumerate(cofactor_snps):
            t_cofactors = cofactor_snps[:]
            del t_cofactors[i]
            lm.set_factors(t_cofactors)
            pval = lm.fast_f_test([snp])['ps'][0]
            cofactor_pvals.append(pval)
            cofactors[i][2] = -math.log10(pval)
        lm.set_factors(cofactor_snps)  # Restore the full set of cofactors.
        max_cofactor_pval = max(cofactor_pvals)
        criterias['mbonf'].append(max_cofactor_pval)

        # Remove the found SNP from considered SNPs
        del snps[min_pval_i]
        del positions[min_pval_i]
        del chromosomes[min_pval_i]
        del chr_pos_list[min_pval_i]
        del mafs[min_pval_i]
        del macs[min_pval_i]
        num_snps -= 1

        (bic, extended_bic, modified_bic) = _calc_bic_(ll, num_snps, num_par, lm.n)  # Calculate the BICs
        criterias['ebics'].append(extended_bic)
        criterias['mbics'].append(modified_bic)

        print(step_fmt % (step_i, action, num_par, ll, rss, bic, extended_bic, modified_bic))
        print('Cofactors: %s' % (_cofactors_to_string_(cofactors),))

    # Final scan with all the cofactors found in the model.
    lm_res = lm.fast_f_test(snps)
    if first_lm_res is None:
        # No forward step was run, so this scan is the first (cofactor-free) one.
        first_lm_res = lm_res
    min_pval_i = sp.argmin(lm_res['ps'])
    min_pval = lm_res['ps'][min_pval_i]
    min_pval_chr_pos = chr_pos_list[min_pval_i]
    print('Min p-value: %s' % (min_pval,))
    step_info = {'rss': rss, 'll': ll, 'bic': bic, 'e_bic': extended_bic, 'm_bic': modified_bic,
                 'mbonf': max_cofactor_pval, 'cofactors': [tuple(c) for c in cofactors],
                 'cofactor_snps': cofactor_snps[:], 'min_pval': min_pval,
                 'min_pval_chr_pos': min_pval_chr_pos}
    lm_pvals = lm_res['ps'].tolist()
    if save_pvals:
        step_info['ps'] = lm_pvals

    # Now plotting!
    print("Generating plots")
    if file_prefix:
        _plot_manhattan_and_qq_(file_prefix, step_i, lm_pvals, quantiles_dict, **plot_opts)

    max_num_cofactors = len(cofactors)
    step_info['kolmogorov_smirnov'] = agr.calc_ks_stats(lm_pvals)
    step_info['pval_median'] = agr.calc_median(lm_pvals)
    print('%s %s' % (step_info['kolmogorov_smirnov'], step_info['pval_median']))
    step_info_list.append(step_info)

    # Now backward stepwise.
    if forward_backwards:
        print('Starting backwards..')
        while len(cofactor_snps) > 1:
            step_i += 1
            # Test each cofactor conditional on the others; the one with the
            # smallest F-statistic is dropped.
            f_stats = sp.zeros(len(cofactor_snps))
            for i, snp in enumerate(cofactor_snps):
                t_cofactors = cofactor_snps[:]
                del t_cofactors[i]
                lm.set_factors(t_cofactors)
                res = lm.fast_f_test([snp])
                cofactors[i][2] = -math.log10(res['ps'][0])
                f_stats[i] = res['f_stats'][0]
            i_to_remove = f_stats.argmin()
            del cofactor_snps[i_to_remove]
            del cofactors[i_to_remove]
            lm.set_factors(cofactor_snps)
            num_snps += 1

            # Re-compute the RSS and the log-likelihood for the reduced model.
            rss = lm.get_rss()
            ll = lm.get_ll(rss)
            num_par -= 1
            action = '-'

            # Update the p-values
            cofactor_pvals = []
            for i, snp in enumerate(cofactor_snps):
                t_cofactors = cofactor_snps[:]
                del t_cofactors[i]
                lm.set_factors(t_cofactors)
                res = lm.fast_f_test([snp])
                pval = res['ps'][0]
                cofactor_pvals.append(pval)
                cofactors[i][2] = -math.log10(pval)
            max_cofactor_pval = max(cofactor_pvals)
            criterias['mbonf'].append(max_cofactor_pval)

            # Calculate the BICs
            (bic, extended_bic, modified_bic) = _calc_bic_(ll, num_snps, num_par, lm.n)
            criterias['ebics'].append(extended_bic)
            criterias['mbics'].append(modified_bic)
            print(step_fmt % (step_i, action, num_par, ll, rss, bic, extended_bic, modified_bic))
            print('Cofactors: %s' % (_cofactors_to_string_(cofactors),))

            # No genome scan is done in the backward steps, so the scan
            # statistics are left empty here.
            step_info = {'rss': rss, 'll': ll, 'bic': bic, 'e_bic': extended_bic, 'm_bic': modified_bic,
                         'mbonf': max_cofactor_pval, 'cofactors': [tuple(c) for c in cofactors],
                         'cofactor_snps': cofactor_snps[:], 'min_pval': None, 'min_pval_chr_pos': None,
                         'kolmogorov_smirnov': None, 'pval_median': None}
            step_info_list.append(step_info)
            print(cofactors)

    opt_dict, opt_indices = _analyze_opt_criterias_(criterias, sign_threshold, max_num_cofactors, file_prefix,
                                                    with_qq_plots, lm, step_info_list, quantiles_dict,
                                                    plot_bonferroni=True, cand_genes=cand_gene_list,
                                                    plot_xaxis=plot_xaxis, log_qq_max_val=log_qq_max_val,
                                                    type='lm', highlight_loci=highlight_loci,
                                                    write_pvals=save_pvals, markersize=markersize,
                                                    chrom_col_map=chrom_col_map, **kwargs)

    # Copy the scan statistics computed for the optimal steps into the step infos.
    for step_i in opt_indices:
        for h in ['min_pval', 'min_pval_chr_pos', 'kolmogorov_smirnov', 'pval_median']:
            step_info_list[step_i][h] = opt_indices[step_i][h]

    if file_prefix:
        # NOTE(review): `type` is the builtin here, so `type == 'lm'` is always
        # False.  Kept as is because the expected argument of
        # _plot_stepwise_stats_ is not visible from here -- confirm whether
        # 'lm' was intended.
        _plot_stepwise_stats_(file_prefix, step_info_list, sign_threshold, type == 'lm')

    res_dict = {'step_info_list': step_info_list, 'first_lm_res': first_lm_res, 'opt_dict': opt_dict}

    secs = time.time() - s1
    if secs > 60:
        mins = int(secs) // 60
        secs = secs - mins * 60
        print('Took %d mins and %f seconds.' % (mins, secs))
    else:
        print('Took %f seconds.' % (secs))
    return res_dict
def lin_reg_step(phen_vals, sd, cof_chr_pos_list, progress_file_writer=None, plot_prefix=None):
    """
    Standard linear regression of single SNPs, with the given cofactors in the model.

    phen_vals: phenotype values the LinearModel is built from.
    sd: SNP data object; used to look up the cofactor SNPs and passed to
        LinearModel.fast_f_test for the scan.
    cof_chr_pos_list: (chromosome, position) pairs of the cofactors.
    progress_file_writer: optional progress reporter, updated before the scan
        and passed on to fast_f_test.
    plot_prefix: not used by this function.

    Returns a dict with
        'stats': various stats useful for stepwise regression,
        'res': the scan result, with the p-values of the cofactor SNPs
            replaced by their re-estimated p-values,
        'cof_chrom_pos_pval_list': (chromosome, position, -log10 p-value)
            for each cofactor.
    """
    t = Timer()
    lm = LinearModel(phen_vals)
    # RSS of the model without any cofactors, used for the variance explained.
    h0_rss = lm.get_rss()
    step_dict = {}
    step_dict['h0_rss'] = h0_rss

    log.info('Looking up cofactors')
    cof_indices, cof_snps = sd.get_snps_from_pos(cof_chr_pos_list)
    lm.set_factors(cof_snps)

    if progress_file_writer is not None:
        progress_file_writer.update_progress_bar(progress=0.20, task_status='Performing linear regression')
        progress_file_writer.set_step(0.05)
    log.info('Performing linear regression.')
    r = lm.fast_f_test(sd, progress_file_writer=progress_file_writer)
    min_pval_i = sp.argmin(r['ps'])
    step_dict['min_pval_i'] = min_pval_i
    step_dict['min_pval'] = r['ps'][min_pval_i]
    step_dict['mahalanobis_rss'] = r['rss'][min_pval_i]
    step_dict['min_pval_chr_pos'] = sd.get_chr_pos_from_index(min_pval_i)

    num_snps = sd.num_snps
    num_par = lm.X.shape[1] + 1
    step_dict['num_snps'] = num_snps
    step_dict['num_par'] = num_par

    rss = lm.get_rss()
    ll = lm.get_ll(rss)
    (bic, extended_bic, modified_bic) = _calc_bic_(ll, num_snps, num_par, lm.n)  # Calculate the BICs
    step_dict['ebic'] = extended_bic
    step_dict['mbic'] = modified_bic
    step_dict['bic'] = bic
    step_dict['rss'] = rss
    perc_var_expl = 1.0 - (rss / h0_rss)
    step_dict['perc_var_expl'] = perc_var_expl

    # Calculate maximum cofactor p-value
    log.info('Updating the cofactor p-values')
    cof_pvals = []
    cof_chrom_pos_pval_list = []
    for i, snp in enumerate(cof_snps):
        # Test each cofactor with all the other cofactors in the model.
        t_cofactors = cof_snps[:]
        del t_cofactors[i]
        lm.set_factors(t_cofactors)
        res = lm.fast_f_test([snp])
        cof_pval = res['ps'][0]
        cof_pvals.append(cof_pval)
        cof_chrom_pos_pval_list.append((cof_chr_pos_list[i][0], cof_chr_pos_list[i][1], -math.log10(cof_pval)))

    # Overwrite the scan p-values of the cofactor SNPs with the re-estimated ones.
    for i, pval in zip(cof_indices, cof_pvals):
        r['ps'][i] = pval

    if cof_pvals:
        step_dict['max_cof_pval'] = max(cof_pvals)
    else:
        step_dict['max_cof_pval'] = 0.0
    log.info('Took:%s' % t.stop(True))
    return {'stats': step_dict, 'res': r, 'cof_chrom_pos_pval_list': cof_chrom_pos_pval_list}