Beispiel #1
0
def multiple_loci_mixed_model_gwas(phenotype_id=5, pvalue_file_prefix='mlmm_results',
                                   result_files_prefix='mlmm_manhattan', max_num_steps=10, snp_priors=None):
    """
    Perform multiple loci mixed model GWAS for flowering time (phenotype_id=5 in the phenotype file)
    in plants grown under 10C conditions.

    Parameters:
        phenotype_id: ID of the phenotype to analyse; used both to coordinate
            the genotype data and to fetch the phenotype values.
        pvalue_file_prefix: forwarded to lm.mlmm as pval_file_prefix.
        result_files_prefix: forwarded to lm.mlmm as file_prefix.
        max_num_steps: forwarded to lm.mlmm as num_steps.
        snp_priors: forwarded unchanged to lm.mlmm.

    Returns:
        Whatever lm.mlmm returns.
    """
    import linear_models as lm
    import kinship
    # Load genotypes
    sd = load_a_thaliana_genotypes()

    # Load phenotypes
    phend = load_a_thaliana_phenotypes()

    # Coordinate phenotype of interest and genotypes.  This filters the genotypes and
    # phenotypes, leaving only accessions (individuals) which overlap between both,
    # and SNPs that are polymorphic in the resulting subset.
    sd.coordinate_w_phenotype_data(phend, phenotype_id)

    # Calculate kinship (IBS)
    K = kinship.calc_ibs_kinship(sd.get_snps())

    # Perform multiple loci mixed model GWAS.  The p-value files get their own
    # prefix (pvalue_file_prefix); previously that argument was silently ignored
    # and result_files_prefix was used for both.
    mlmm_results = lm.mlmm(phend.get_values(phenotype_id), K, sd=sd,
                           num_steps=max_num_steps, file_prefix=result_files_prefix,
                           save_pvals=True, pval_file_prefix=pvalue_file_prefix, snp_priors=snp_priors)
    return mlmm_results
Beispiel #2
0
def multiple_loci_mixed_model_gwas(phenotype_id=5,
                                   pvalue_file_prefix='mlmm_results',
                                   result_files_prefix='mlmm_manhattan',
                                   max_num_steps=10,
                                   snp_priors=None):
    """
    Perform multiple loci mixed model GWAS for flowering time (phenotype_id=5 in the phenotype file)
    in plants grown under 10C conditions.

    Parameters:
        phenotype_id: ID of the phenotype to analyse; used both to coordinate
            the genotype data and to fetch the phenotype values.
        pvalue_file_prefix: passed to lm.mlmm as pval_file_prefix.
        result_files_prefix: passed to lm.mlmm as file_prefix.
        max_num_steps: passed to lm.mlmm as num_steps.
        snp_priors: passed through to lm.mlmm unchanged.

    Returns:
        The value returned by lm.mlmm.
    """
    import linear_models as lm
    import kinship
    # Load genotypes
    sd = load_a_thaliana_genotypes()

    # Load phenotypes
    phend = load_a_thaliana_phenotypes()

    # Coordinate phenotype of interest and genotypes.  This filters the genotypes and
    # phenotypes, leaving only accessions (individuals) which overlap between both,
    # and SNPs that are polymorphic in the resulting subset.
    sd.coordinate_w_phenotype_data(phend, phenotype_id)

    # Calculate kinship (IBS)
    K = kinship.calc_ibs_kinship(sd.get_snps())

    # Perform multiple loci mixed model GWAS.  pvalue_file_prefix used to be
    # accepted but never used (result_files_prefix was passed twice); it now
    # controls the p-value file prefix as its name promises.
    mlmm_results = lm.mlmm(phend.get_values(phenotype_id),
                           K,
                           sd=sd,
                           num_steps=max_num_steps,
                           file_prefix=result_files_prefix,
                           save_pvals=True,
                           pval_file_prefix=pvalue_file_prefix,
                           snp_priors=snp_priors)
    return mlmm_results
Beispiel #3
0
    def get_ibs_kinship_matrix(self,
                               debug_filter=1,
                               snp_dtype='int8',
                               dtype='single',
                               chunk_size=None):
        """
        Compute the (un-scaled) IBS kinship matrix for this object.

        The calculation is delegated to kinship.calc_ibs_kinship, which is
        given this object together with chunk_size.  The debug_filter,
        snp_dtype and dtype arguments are accepted but are not used by this
        method.

        Note from the original author: currently it works only for binary
        kinship matrices (presumably meaning binary SNP data -- to be
        confirmed).
        """
        log.debug('Starting kinship calculation')
        ibs_kinship = kinship.calc_ibs_kinship(self, chunk_size=chunk_size)
        return ibs_kinship
Beispiel #4
0
    def get_ibs_kinship_matrix(self,
                               debug_filter=1,
                               snp_dtype='int8',
                               dtype='single',
                               chunk_size=None):
        """
        Return the un-scaled IBS kinship matrix of this object.

        This is a thin wrapper: it logs a debug message and hands this object
        and chunk_size to kinship.calc_ibs_kinship.  debug_filter, snp_dtype
        and dtype are part of the signature but are ignored here.

        Original author's caveat: currently it works only for binary kinship
        matrices (presumably this refers to binary SNP data -- to be
        confirmed).
        """
        log.debug('Starting kinship calculation')
        kinship_matrix = kinship.calc_ibs_kinship(self, chunk_size=chunk_size)
        return kinship_matrix
Beispiel #5
0
def mixed_model_gwas(phenotype_id=5,
                     pvalue_file='mm_results.pvals',
                     manhattan_plot_file='mm_manhattan.png',
                     qq_plot_file_prefix='mm_qq'):
    """
    Run a mixed model (EMMAX) GWAS for flowering time (phenotype_id=5 in the
    phenotype file) in plants grown under 10C conditions.

    The p-values are written to pvalue_file, a Manhattan plot to
    manhattan_plot_file, and a QQ-plot using qq_plot_file_prefix.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr

    # Genotypes first, then phenotypes.
    genotype_data = load_a_thaliana_genotypes()
    phenotype_data = load_a_thaliana_phenotypes()

    # Restrict both data sets to the accessions (individuals) they share, and
    # the SNPs to those that remain polymorphic in that subset.
    genotype_data.coordinate_w_phenotype_data(phenotype_data, phenotype_id)

    # IBS kinship matrix.
    kinship_matrix = kinship.calc_ibs_kinship(genotype_data.get_snps())

    # Mixed model scan.
    snps = genotype_data.get_snps()
    trait_values = phenotype_data.get_values(phenotype_id)
    emmax_output = lm.emmax(snps, trait_values, kinship_matrix)

    # Wrap the p-values in a result object and store them on disk.
    gwas_result = gr.Result(scores=emmax_output['ps'], snps_data=genotype_data)
    gwas_result.write_to_file(pvalue_file)

    # Manhattan plot.
    manhattan_options = dict(png_file=manhattan_plot_file,
                             percentile=90,
                             plot_bonferroni=True,
                             neg_log_transform=True)
    gwas_result.plot_manhattan(**manhattan_options)

    # QQ-plot.
    gwas_result.plot_qq(qq_plot_file_prefix)
Beispiel #6
0
def lotus_mixed_model_gwas(phenotype_id=4,
                           phen_file='/home/bjarni/LotusGenome/cks/Lotus31012019/20181113_136LjAccessionData.csv',
                           gt_file='/home/bjarni/LotusGenome/cks/Lotus31012019/all_chromosomes_binary.csv',
                           pvalue_file='mm_results.pvals',
                           manhattan_plot_file='mm_manhattan.png',
                           qq_plot_file_prefix='mm_qq'):
    """
    Run a mixed model (EMMAX) GWAS on the Lotus data.

    Genotypes are parsed from gt_file and phenotypes from phen_file.  The
    p-values are written to pvalue_file, a Manhattan plot to
    manhattan_plot_file, and a QQ-plot using qq_plot_file_prefix.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    import dataParsers as dp

    # Genotypes.
    genotype_data = dp.parse_snp_data(gt_file)

    # Phenotypes (the parser module is imported only once it is needed).
    import phenotypeData as pd
    phenotype_data = pd.parse_phenotype_file(phen_file, with_db_ids=False)

    # Restrict both data sets to the accessions (individuals) they share, and
    # the SNPs to those that remain polymorphic in that subset.
    genotype_data.coordinate_w_phenotype_data(phenotype_data, phenotype_id)

    # IBS kinship matrix.
    kinship_matrix = kinship.calc_ibs_kinship(genotype_data.get_snps())

    # Mixed model scan.
    snps = genotype_data.get_snps()
    trait_values = phenotype_data.get_values(phenotype_id)
    emmax_output = lm.emmax(snps, trait_values, kinship_matrix)

    # Wrap the p-values in a result object and store them on disk.
    gwas_result = gr.Result(scores=emmax_output['ps'], snps_data=genotype_data)
    gwas_result.write_to_file(pvalue_file)

    # Manhattan plot.
    manhattan_options = dict(png_file=manhattan_plot_file,
                             percentile=90,
                             plot_bonferroni=True,
                             neg_log_transform=True)
    gwas_result.plot_manhattan(**manhattan_options)

    # QQ-plot.
    gwas_result.plot_qq(qq_plot_file_prefix)
Beispiel #7
0
def mixed_model_gwas(phenotype_id=5, pvalue_file='mm_results.pvals',
                     manhattan_plot_file='mm_manhattan.png',
                     qq_plot_file_prefix='mm_qq'):
    """
    Mixed model (EMMAX) GWAS for flowering time (phenotype_id=5 in the
    phenotype file) in plants grown under 10C conditions.

    Outputs: p-values in pvalue_file, a Manhattan plot in
    manhattan_plot_file, and a QQ-plot based on qq_plot_file_prefix.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr

    # Load genotype and phenotype data (in that order).
    snps_data = load_a_thaliana_genotypes()
    phen_data = load_a_thaliana_phenotypes()

    # Keep only accessions (individuals) present in both data sets and the
    # SNPs that are still polymorphic among them.
    snps_data.coordinate_w_phenotype_data(phen_data, phenotype_id)

    # Kinship (IBS) from the coordinated SNPs.
    ibs_kinship = kinship.calc_ibs_kinship(snps_data.get_snps())

    # EMMAX scan over the SNPs.
    scan = lm.emmax(snps_data.get_snps(),
                    phen_data.get_values(phenotype_id),
                    ibs_kinship)

    # Build the results object from the p-values and save them.
    result = gr.Result(scores=scan['ps'], snps_data=snps_data)
    result.write_to_file(pvalue_file)

    # Manhattan plot followed by the QQ-plot.
    result.plot_manhattan(
        png_file=manhattan_plot_file,
        percentile=90,
        plot_bonferroni=True,
        neg_log_transform=True,
    )
    result.plot_qq(qq_plot_file_prefix)
Beispiel #8
0
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians',
                      out_dir='/Users/bjarnivilhjalmsson/data/tmp'):
    """
    Perform a simple MLM GWAS for the 8 traits (4 phenotypes x 2 environments).

    For every phenotype/environment combination this coordinates genotypes
    and phenotypes, calculates a kinship matrix, runs the EMMAX F-test, and
    writes a QQ-plot and a Manhattan plot.

    Parameters:
        kinship_type: 'ibs' or 'ibd'; selects kinship.calc_ibs_kinship or
            kinship.calc_ibd_kinship.
        phen_type: 'means' or 'medians'; selects the 'Y_means' or 'Y_medians'
            entry of the coordinated data as the response.
        out_dir: directory prefix for the plot files.  Defaults to the
            location that used to be hard-coded.

    Raises:
        NotImplementedError: if kinship_type or phen_type is not one of the
            supported values.  Raised before any data is loaded.
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr

    # Validate the options up front so that a typo fails immediately rather
    # than after the phenotype/genotype data has been loaded.
    if kinship_type == 'ibs':
        calc_kinship = kinship.calc_ibs_kinship
    elif kinship_type == 'ibd':
        calc_kinship = kinship.calc_ibd_kinship
    else:
        # NotImplemented is a constant, not an exception; raising it yields
        # a TypeError instead of the intended error.
        raise NotImplementedError(
            'Unsupported kinship_type: %r' % (kinship_type,))

    if phen_type == 'means':
        phen_key = 'Y_means'
    elif phen_type == 'medians':
        phen_key = 'Y_medians'
    else:
        raise NotImplementedError('Unsupported phen_type: %r' % (phen_type,))

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    for phenotype in phenotypes:
        for env in envs:
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and also parses on Python 3.
            print('%s %s' % (phenotype, env))
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)
            print('Calculating kinship')
            K = calc_kinship(d['snps'])

            lmm = lm.LinearMixedModel(d[phen_key])
            lmm.add_random_effect(K)

            print("Running EMMAX")
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print('Mean p-value: %s' % (sp.mean(res['ps']),))

            secs = time.time() - s1
            if secs > 60:
                # Floor division keeps mins an integer on Python 2 and 3.
                mins = int(secs) // 60
                secs = secs - mins * 60
                print('Took %d mins and %f seconds.' % (mins, secs))
            else:
                print('Took %f seconds.' % (secs))

            # Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (kinship_type, phenotype, env,
                                         phen_type)
            agr.plot_simple_qqplots_pvals(
                '%s/cegs_qq_%s' % (out_dir, label_str),
                [res['ps']],
                result_labels=[label_str],
                line_colors=['green'],
                num_dots=1000,
                title=None,
                max_neg_log_val=6)

            # Build the Manhattan plot coordinates: one series per
            # chromosome, laid out side by side along the x axis.
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            unique_chroms = sp.unique(chromosomes)
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for chrom in unique_chroms:
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:
                    # Every other chromosome label is drawn in magenta;
                    # nothing is filtered out here.
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                # The next chromosome starts where this one ends.
                x_shift += positions_slice.max()
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                y_log_pvals.append(log_ps_slice)

            # Threshold line at -log10(1 / (20 * m)), i.e. -log10(0.05 / m).
            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print('%s %s' % (m, log_bonf))

            # Plot the Manhattan plot
            fig = plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for xs, ys, color in zip(x_positions, y_log_pvals, colors):
                plt.plot(xs, ys, c=color, ls='', marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf,
                       xmin,
                       xmax,
                       colors='k',
                       linestyles='--',
                       alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig('%s/cegs_gwas_%s.png' % (out_dir, label_str))
            # Release the figure; otherwise one figure per trait stays alive
            # for the lifetime of the process.
            plt.close(fig)
Beispiel #9
0
def perform_cegs_gwas(kinship_type='ibd', phen_type='medians',
                      out_dir='/Users/bjarnivilhjalmsson/data/tmp'):
    """
    Perform a simple MLM GWAS for the 8 traits (4 phenotypes x 2 environments).

    Each phenotype/environment pair is processed in turn: the genotype and
    phenotype data are coordinated, a kinship matrix is calculated, the EMMAX
    F-test is run, and a QQ-plot plus a Manhattan plot are written.

    Parameters:
        kinship_type: 'ibs' or 'ibd'; selects kinship.calc_ibs_kinship or
            kinship.calc_ibd_kinship.
        phen_type: 'means' or 'medians'; selects the 'Y_means' or 'Y_medians'
            entry of the coordinated data as the response.
        out_dir: directory prefix for the plot files.  Defaults to the
            location that used to be hard-coded.

    Raises:
        NotImplementedError: if kinship_type or phen_type is not one of the
            supported values.  Raised before any data is loaded.
    """
    import hdf5_data
    import kinship
    import linear_models as lm
    import time
    import scipy as sp
    from matplotlib import pyplot as plt
    import analyze_gwas_results as agr

    # Check the options first: an unsupported value should fail right away,
    # not after the data has been parsed and coordinated.
    if kinship_type == 'ibs':
        calc_kinship = kinship.calc_ibs_kinship
    elif kinship_type == 'ibd':
        calc_kinship = kinship.calc_ibd_kinship
    else:
        raise NotImplementedError(
            'Unsupported kinship_type: %r' % (kinship_type,))

    if phen_type == 'means':
        phen_key = 'Y_means'
    elif phen_type == 'medians':
        phen_key = 'Y_medians'
    else:
        raise NotImplementedError('Unsupported phen_type: %r' % (phen_type,))

    phen_dict = hdf5_data.parse_cegs_drosophila_phenotypes()

    phenotypes = ['Protein', 'Sugar', 'Triglyceride', 'weight']
    envs = ['mated', 'virgin']
    for phenotype in phenotypes:
        for env in envs:
            # Single-argument print(...) gives the same output as the print
            # statement on Python 2 and also parses on Python 3.
            print('%s %s' % (phenotype, env))
            s1 = time.time()
            d = hdf5_data.coordinate_cegs_genotype_phenotype(
                phen_dict, phenotype, env)
            print('Calculating kinship')
            K = calc_kinship(d['snps'])

            lmm = lm.LinearMixedModel(d[phen_key])
            lmm.add_random_effect(K)

            print("Running EMMAX")
            res = lmm.emmax_f_test(d['snps'], emma_num=1000)
            print('Mean p-value: %s' % (sp.mean(res['ps']),))

            secs = time.time() - s1
            if secs > 60:
                # Floor division keeps mins an integer on Python 2 and 3.
                mins = int(secs) // 60
                secs = secs - mins * 60
                print('Took %d mins and %f seconds.' % (mins, secs))
            else:
                print('Took %f seconds.' % (secs))

            # Now generating QQ-plots
            label_str = '%s_%s_%s_%s' % (
                kinship_type, phenotype, env, phen_type)
            agr.plot_simple_qqplots_pvals('%s/cegs_qq_%s' % (out_dir, label_str),
                                          [res['ps']], result_labels=[
                                              label_str], line_colors=['green'],
                                          num_dots=1000, title=None, max_neg_log_val=6)

            # Build the Manhattan plot coordinates: one series per
            # chromosome, placed next to each other along the x axis.
            chromosomes = d['positions'][:, 0]
            positions = sp.array(d['positions'][:, 1], 'int32')
            unique_chroms = sp.unique(chromosomes)
            x_positions = []
            y_log_pvals = []
            colors = []
            x_shift = 0
            for chrom in unique_chroms:
                if chrom in ['2L', '2LHet', '3L', '3LHet', '4', 'X', 'XHet']:
                    colors.append('c')
                else:
                    # All remaining chromosome labels are drawn in magenta;
                    # no chromosome is dropped here.
                    colors.append('m')
                chrom_filter = sp.in1d(chromosomes, chrom)
                positions_slice = positions[chrom_filter]
                x_positions.append(positions_slice + x_shift)
                # Shift the next chromosome past the end of this one.
                x_shift += positions_slice.max()
                log_ps_slice = -sp.log10(res['ps'][chrom_filter])
                y_log_pvals.append(log_ps_slice)

            # Threshold line at -log10(1 / (20 * m)), i.e. -log10(0.05 / m).
            m = len(positions)
            log_bonf = -sp.log10(1 / (20.0 * m))
            print('%s %s' % (m, log_bonf))

            # Plot the Manhattan plot
            fig = plt.figure(figsize=(12, 4))
            plt.axes([0.03, 0.1, 0.95, 0.8])
            for xs, ys, color in zip(x_positions, y_log_pvals, colors):
                plt.plot(xs, ys, c=color, ls='', marker='.')
            xmin, xmax = plt.xlim()
            plt.hlines(log_bonf, xmin, xmax, colors='k',
                       linestyles='--', alpha=0.5)
            plt.title('%s, %s' % (phenotype, env))
            plt.savefig('%s/cegs_gwas_%s.png' % (out_dir, label_str))
            # Close the figure so that figures do not pile up across the
            # eight phenotype/environment iterations.
            plt.close(fig)