Exemple #1
0
def parse_phenotype_data(file_name='/tmp/test_phen.csv'):
    """
    Parse the hybrid-incompatibility phenotype CSV into six 'criteria'
    phenotypes (one per data column) and write them to file_name.
    Rows with a duplicate ecotype key or an 'NA' value are skipped.
    """
    criteria_ids = range(1, 7)
    phen_dict = dict((cid, {'name': 'criteria_%d' % cid,
                            'ecotypes': [],
                            'values': [],
                            'transformation': None})
                     for cid in criteria_ids)
    seen_ecotypes = set()
    # NOTE(review): uses env.env[...] while sibling loaders use env[...] --
    # kept as-is; confirm which accessor is correct for this module.
    with open(env.env['data_dir'] + 'phenot_HybridIncompatibilities.csv') as f:
        f.next()  # skip header line
        for line in f:
            fields = [s.strip() for s in line.split(',')]
            ecotype = '%s_%s' % (fields[0], fields[1])
            if ecotype in seen_ecotypes:
                continue  # only the first occurrence of an ecotype is used
            seen_ecotypes.add(ecotype)
            for cid in criteria_ids:
                value = fields[cid + 1]
                if value != 'NA':
                    phen_dict[cid]['values'].append(int(value))
                    phen_dict[cid]['ecotypes'].append(ecotype)
    for cid in criteria_ids:
        # Aliases the same list object, mirroring the original behaviour.
        phen_dict[cid]['raw_values'] = phen_dict[cid]['values']
    phend = pd.phenotype_data(phen_dict=phen_dict)
    phend.write_to_file(file_name)
def load_gene_expression_traits_3(temperature='10C'):
	import scipy as sp
	filename = env['home_dir'] + \
			'/Projects/Data/rna_seq/expression_variance_stabilized_11_08_11/' + \
			'expr_%s_merged.csv' % temperature
	phen_dict = {}
	phen_i = 1
	with open(filename) as f:
		header = (f.next().strip()).split(',')
		ets = map(lambda x: x[1:], header[1:])
		for i in range(len(ets)):
			if int(ets[i]) == 2:
				ets[i] = '6932'
			elif int(ets[i]) == 3:
				ets[i] = '6980'
			if int(ets[i]) == 4:
				ets[i] = 'ALyr'
			elif int(ets[i]) == 5:
				ets[i] = 'ACap'
		print ets
		for l in f:
			line = (l.strip()).split(',')
			gene_name = line[0]
			vals = map(float, line[1:])
			phen_dict[phen_i] = {'name':gene_name, 'ecotypes':ets, 'values':vals}
			phen_i += 1

	phed = pd.phenotype_data(phen_dict)
	print 'Phenotype object constructed with %d phenotypes, now writing to phenotype file' % len(phen_dict)
	phed.write_to_file(env['phen_dir'] + 'rna_seq_vs_081411_%s.csv' % temperature)
def load_phentoype_file_duszynska():
	fn1 = env['phen_dir'] + 'seed_size_2n.csv'
	fn2 = env['phen_dir'] + 'seed_size_3n_2x4.csv'
	fn3 = env['phen_dir'] + 'seed_size_3n_4x2.csv'
	fn4 = env['phen_dir'] + 'seed_size_spss.csv'
	fns = [fn1, fn2, fn3, fn4]
	phen_names = ['seed_size_2n', 'seed_size_3n_2x4', 'seed_size_3n_4x2', 'seed_size_spss']
	phen_dict = {}
	for i, pn in enumerate(phen_names):
		phen_dict[i + 1] = {'name':pn }
	accs_list = []
	for i, fn in enumerate(fns):
		f = open(fn, "r")
		acc_dict = pd.get_250K_accession_to_ecotype_dict()
		ecotypes = []
		values = []
		print f.next()
		for line in f:
			l = map(str.strip, line.split(','))
			acc = l[0].lower()
			if not acc in acc_dict:
				print "(%s) is missing in dictionary" % (acc)
			else:
				ecotypes.append(acc_dict[acc][4])
				values.append(float(l[1]))
		f.close()
		phen_dict[i + 1]['ecotypes'] = ecotypes
		phen_dict[i + 1]['values'] = values


	phed = pd.phenotype_data(phen_dict)
	phed.write_to_file(env['phen_dir'] + 'seed_size.csv')
def load_phentoype_file_dilkes():
	filename = env['phen_dir'] + 'dilkes_metabolites.csv'
	f = open(filename, "r")
	print f.next()
	header = f.next()
	phen_names = map(str.strip, header.split(',')[2:])
	print phen_names
	pids = range(len(phen_names))
	accessions = []
	phen_dict = {}
	for pid, name in zip(pids, phen_names):
		phen_dict[pid] = {'name':name, 'ecotypes':[], 'values':[]}
	for line in f:
		l = map(str.strip, line.split(','))
		accessions.append(l[1].lower())
		for i in pids:
			phen_dict[i]['values'].append(float(l[2 + i]))

	f.close()
	acc_dict = pd.get_250K_accession_to_ecotype_dict()
#	acc_dict['buckhorn'] = [0, 0, 0, 0, 7033]
	acc_dict['sakhdara'] = [0, 0, 0, 0, 6962]
	ecotypes = []
	for acc in accessions:
		if not acc in acc_dict:
			print "%s is missing in dictionary" % acc
		else:
			ecotype = acc_dict[acc][4]
			ecotypes.append(ecotype)

	for pid, name in zip(pids, phen_names):
		phen_dict[pid]['ecotypes'] = ecotypes

	phed = pd.phenotype_data(phen_dict)
	phed.write_to_file(env['phen_dir'] + 'b_dilkes_metabolites.csv')
def load_duszynska_file4():
	"""
	Loads the heterosis data.
	"""
	fn = env['home_dir'] + 'Projects/duszynska_data/seed_size_heterosis.csv'
	acc_dict = pd.get_250K_accession_to_ecotype_dict()
	phen_dict = {}

	name_dict = {'knox-18':'kno-18', 'knox-10':'kno-10', 'kas-1':'kas-2', 'pu-2-7':'pu2-7', 'cs22491':'n13',
			'shahdara':'sha'}

	with open(fn) as f:
		ets = []
		header = f.next()
		phen_names = map(str.strip, header.split(','))
		et_indices = [0, 3, 6, 9]
		phen_names = [phen_names[1], phen_names[4], phen_names[7], phen_names[10]]
		for i, pn in zip([1, 2, 3, 4], phen_names):
			phen_dict[i] = {'name': pn, 'values':[], 'ecotypes':[]}
		for line in f:
			l = map(str.strip, line.split(','))
			for e_i, pid in zip(et_indices, [1, 2, 3, 4]):
				acc = l[e_i].lower()
				if acc in name_dict:
					acc = name_dict[acc]
				if not acc in acc_dict:
					print "(%s) is missing in dictionary" % (acc)
				else:
					phen_dict[pid]['ecotypes'].append(acc_dict[acc][4])
					phen_dict[pid]['values'].append(float(l[e_i + 1]))

	phed = pd.phenotype_data(phen_dict)
	phed.write_to_file(env['phen_dir'] + 'duszynska_heterosis_data.csv')
Exemple #6
0
def load_duszynska_file2():
    fn1 = env[
        'home_dir'] + 'Projects/duszynska_data/male_data_proportion_AN.csv'
    fn2 = env[
        'home_dir'] + 'Projects/duszynska_data/female_data_proportion_AN.csv'
    acc_dict = pd.get_250K_accession_to_ecotype_dict()
    phen_names = []
    phen_dict = {}

    name_dict = {
        'knox-18': 'kno-18',
        'knox-10': 'kno-10',
        'kas-1': 'kas-2',
        'pu-2-7': 'pu2-7',
        'cs22491': 'n13',
        'shahdara': 'sha'
    }

    with open(fn1) as f:
        ets = []
        header = f.next()
        phen_names = map(str.strip, header.split(',')[1:])
        for i, pn in zip([1, 2, 3], phen_names):
            phen_dict[i] = {'name': 'male_' + pn, 'values': []}
        for line in f:
            l = map(str.strip, line.split(','))
            acc = l[0].lower()
            if acc in name_dict:
                acc = name_dict[acc]
            if not acc in acc_dict:
                print "(%s) is missing in dictionary" % (acc)
            else:
                ets.append(acc_dict[acc][4])
                for pid in [1, 2, 3]:
                    phen_dict[pid]['values'].append(float(l[pid]))
        for pid in [1, 2, 3]:
            phen_dict[pid]['ecotypes'] = ets[:]

    with open(fn2) as f:
        ets = []
        header = f.next()
        phen_names = map(str.strip, header.split(',')[1:])
        for i, pn in zip([4, 5, 6], phen_names):
            phen_dict[i] = {'name': 'female_' + pn, 'values': []}
        for line in f:
            l = map(str.strip, line.split(','))
            acc = l[0].lower()
            if acc in name_dict:
                acc = name_dict[acc]
            if not acc in acc_dict:
                print "(%s) is missing in dictionary" % (acc)
            else:
                ets.append(acc_dict[acc][4])
                for pid in [4, 5, 6]:
                    phen_dict[pid]['values'].append(float(l[pid - 3]))
        for pid in [4, 5, 6]:
            phen_dict[pid]['ecotypes'] = ets[:]

    phed = pd.phenotype_data(phen_dict)
    phed.write_to_file(env['phen_dir'] + 'duszynska_data.csv')
Exemple #7
0
def load_skin_color_traits():
    dir_prefix = env['home_dir'] + 'Projects/data/skin_eye_color/'
    #dir_prefix = env.env['home_dir'] + 'Projects/Data/Skin_color/'
    filename = dir_prefix + 'CV685-skin_eye_color.txt'
    d = {
        1: {
            'name': 'skin_color',
            'ecotypes': [],
            'values': []
        },
        2: {
            'name': 'eye_color',
            'ecotypes': [],
            'values': []
        }
    }
    sc_vals = []
    ec_vals = []
    sc_iids = []  #individual IDs (ecotypes)
    with open(filename) as f:
        print f.next()
        for line in f:
            l = line.split()
            if int(float(l[2])) != -9:
                d[1]['values'].append(float(l[2]))
                d[1]['ecotypes'].append(l[1])
            if int(float(l[3])) != -9:
                d[2]['values'].append(float(l[3]))
                d[2]['ecotypes'].append(l[1])
    phed = pd.phenotype_data(d)
    phed.write_to_file(dir_prefix + 'phenotypes.csv')
    return phed
Exemple #8
0
def load_gene_expression_traits():
    """
    Parse a gene-expression TSV, keep only the 16C samples, drop genes
    with constant expression, and write the rest out as a phenotype file.
    """
    filename = '/Users/bjarni.vilhjalmsson/Projects/Data/rna_seq/gene_expression_table_20110208.tsv'
    ecotypes = []  # accession labels of the 16C samples
    et_ids = []  # 0-based positions of the 16C samples among the '#' header lines
    import scipy as sp
    with open(filename, "r") as f:
        i = 0
        # Header section: one '#' line per sample; keep samples grown at 16C.
        for line in f:
            if line[0] != '#': break
            l = line.split('\t')
            if l[7] == '16C':
                ecotypes.append(l[5])
                et_ids.append(i)
            i += 1
        # 'line' is now the first non-'#' line (the column/gene-id header).
        gene_ids = line.split('\t')
        print len(ecotypes), len(set(ecotypes))
        print ecotypes
        # Translate named accessions to numeric ecotype ids; others kept as-is.
        et_dict = {
            'Col-0': '6909',
            'Col': '6909',
            'Ler': '6932',
            'Ws-0': '6980'
        }
        ets = []
        for et in ecotypes:
            if et in et_dict:
                ets.append(et_dict[et])
            else:
                ets.append(et)
        print ets
        phen_dict = {}
        num_const_phen = 0  # genes whose expression is identical in every kept sample
        phen_i = 1
        for i, line in enumerate(f):  #For each gene
            l = line.split()
            phen_name = l[0]
            phen_vals = map(float, l[1:])
            # Keep only the 16C columns; assumes data columns are ordered like
            # the '#' header lines -- TODO confirm against the file format.
            phen_vals = [phen_vals[i] for i in et_ids]
            if len(phen_vals) != len(ets):
                raise Exception('Arrg')
            # Constant phenotypes carry no mappable signal; count and skip them.
            if len(sp.unique(phen_vals)) > 1:
                phen_dict[phen_i] = {
                    'name': phen_name,
                    'ecotypes': ets,
                    'values': phen_vals
                }
                phen_i += 1
            else:
                num_const_phen += 1
    print 'Total number of gene expressions was %d, of which %d were constant (removed), leaving %d gene expressions.' \
     % ((phen_i - 1) + num_const_phen, num_const_phen, phen_i - 1)

    phed = pd.phenotype_data(phen_dict)
    phed.write_to_file(env['phen_dir'] + 'rna_seq_020811_16C.csv')
Exemple #9
0
def load_phentoype_file_nc_resistance_3():
    """
    Parse a resistance phenotype CSV, map accession names to ecotype ids,
    and write a binary phenotype file (0 = 'R', 1 = otherwise).
    """
    filename = "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/20dd5_330.csv"
    with open(filename) as f:
        line = map(str.strip, f.next().split(','))
        # Only the last header column is kept as a phenotype name.
        phenotype_names = line[-1:]
        print phenotype_names
        phenotypes = []
        accession_names = []
        ecotypes = []
        full_accession_names = []
        for l in f:
            line = map(str.strip, l.split(','))
            accession_names.append(line[1].lower())
            ecotypes.append(line[0])
            full_accession_names.append(line[5].lower())
            phenotypes.append(line[4])

    print accession_names
    acc_dict = pd.get_accession_to_ecotype_id_dict(
        accession_names)  #+["n13","kno-10","kno-10","shahdara","nd-1"])
    #	acc_dict["cibc-5"] = 6908
    #	acc_dict["wa-1"] = 6978
    #	acc_dict["gu-0"] = 7149
    #	acc_dict['Rubezhnoe-1'] = 7323
    print len(acc_dict), acc_dict
    ets = []
    phen_vals = []
    for acc1, acc2, et, pt in zip(accession_names, full_accession_names,
                                  ecotypes, phenotypes):
        if acc1 in acc_dict:
            ecotype = acc_dict[acc1]

            # Warn when the file's ecotype id disagrees with the lookup;
            # otherwise trust the looked-up id.
            if str(ecotype) != et and et != 'NA':
                print "Ecotype mismatch.. %s, %s, %s, %s" % (unicode(
                    acc1, "latin-1"), unicode(acc2, "latin-1"), et, ecotype)
            else:
                et = ecotype
            if et != 'NA' and et != '':
                ets.append(et)
                # Unexpected phenotype codes are printed for inspection but
                # are still scored as 1 (non-'R') below.
                if not pt in ['R', 'S']: print pt
                phen_vals.append(0 if pt == 'R' else 1)
    print len(phen_vals)

    phen_dict = {
        1: {
            'name': 'resistance_20dd5',
            'ecotypes': ets,
            'values': phen_vals
        }
    }
    phed = pd.phenotype_data(phen_dict)
    phed.write_to_file('resistance_20dd5.csv', ',')
Exemple #10
0
def load_phentoype_file_riha():
    filename = env['phen_dir'] + 'telomere_lengths_192_raw.csv'
    f = open(filename, "r")
    phen_name = 'telomere_length'
    accession_names = []
    accession_ids = []
    parent_ids = []
    phen_vals = []
    print f.next()
    for line in f:
        l = map(str.strip, line.split(','))
        parent_ids.append(l[0])
        acc_l = l[1].split()
        acc_name = acc_l[0]
        if len(acc_l) > 1:
            acc_id = acc_l[1]
        else:
            acc_id = ''
        accession_names.append(acc_name.lower())
        accession_ids.append(acc_id)
        phen_vals.append(float(l[2]))

    f.close()
    print accession_names
    acc_dict = pd.get_250K_accession_to_ecotype_dict()
    acc_dict['buckhorn'] = [0, 0, 0, 0, 7033]
    acc_dict['shahdara'] = [0, 0, 0, 0, 6962]
    ecotypes = []
    uncertain_list = []
    new_phen_vals = []
    for acc, par_id, acc_id, phen_val in zip(accession_names, parent_ids,
                                             accession_ids, phen_vals):
        if not acc in acc_dict:
            print "(%s, %s, %s) is missing in dictionary" % (acc, par_id,
                                                             acc_id)
        else:
            ecotype = acc_dict[acc][4]
            ecotypes.append(ecotype)
            new_phen_vals.append(phen_val)

    print len(set(accession_names)), len(set(ecotypes))
    phen_dict = {
        1: {
            'name': phen_name,
            'ecotypes': ecotypes,
            'values': new_phen_vals
        }
    }

    phed = pd.phenotype_data(phen_dict)
    phed.write_to_file(env['phen_dir'] + 'telomere_lengths_192.csv')
Exemple #11
0
def load_total_expressions():
    """
    Parse a combined expression matrix and write the total-expression row
    out as two phenotypes, one per temperature (10C and 16C).
    """
    import scipy as sp
    filename = env['home_dir'] + \
      'Projects/Data/rna_seq/expression_matrix_upload_data_5_10_2011/mapping_files/' + \
      'all_expression_matrix_5_09_2011_flagged_removed_libID_bioreps_combined_cov_filter.txt'
    print 'Loading file:', filename
    ets = {'10C': [], '16C': []}  # ecotype ids per temperature
    i_map = {}  # column index -> temperature
    expressions_dict = {}
    with open(filename, "r") as f:
        i = 0
        line = f.next()
        l = map(str.strip, line.split())
        # Header columns look like '<x><ecotype>_<temperature>'; the first
        # character of the ecotype part is dropped.
        for i in range(2, len(l)):
            e_t_l = l[i].split('_')
            et = e_t_l[0][1:]
            t = e_t_l[1]
            i_map[i] = t
            ets[t].append(et)

        # NOTE(review): only the FIRST data row is read; the lookup of
        # expressions_dict['GENE'] below implies that row is named 'GENE'
        # (the total-expression row) -- confirm against the file layout.
        line = f.next()
        l = map(str.strip, line.split())
        gene_name = l[0]
        gene_type = l[1]
        d = {'gene_type': gene_type, '10C': [], '16C': []}
        for i in range(2, len(l)):
            t = i_map[i]
            val = float(l[i])
            d[t].append(val)
        expressions_dict[gene_name] = d
    print 'File was parsed, now constructing phenotype object'
    phen_dict = {}
    phen_i = 1
    for t in ['10C', '16C']:
        values = expressions_dict['GENE'][t]
        phen_dict[phen_i] = {
            'name': 'total_expression_%s' % t,
            'ecotypes': ets[t],
            'values': values
        }
        phen_i += 1

    phed = pd.phenotype_data(phen_dict)
    print 'Phenotype object constructed with %d phenotypes, now writing to phenotype file' % len(
        phen_dict)
    phed.write_to_file(env['phen_dir'] + 'rna_seq_061611_total.csv')
Exemple #12
0
def load_genome_size_factors():
    """
    Parse the gene-model measures TSV (one phenotype per column, first
    column is the ecotype id) and write it out as a phenotype CSV.
    """
    phen_file = env['phen_dir'] + 'measures_gene_model.tsv'
    ets = []
    with open(phen_file) as f:
        header_names = f.next().split()[1:]
        phen_dict = dict((idx + 1, {'values': [],
                                    'name': nm,
                                    'transformation': None})
                         for idx, nm in enumerate(header_names))
        for line in f:
            fields = line.split()
            ets.append(fields[0])
            for idx, raw in enumerate(fields[1:]):
                phen_dict[idx + 1]['values'].append(float(raw))
    for pid in phen_dict:
        phen_dict[pid]['ecotypes'] = ets[:]
        # Aliases the same list object, mirroring the original behaviour.
        phen_dict[pid]['raw_values'] = phen_dict[pid]['values']
    phed = pd.phenotype_data(phen_dict)
    phed.write_to_file(env['phen_dir'] + 'measures_gene_model.csv')
def parse_NFBC_traits():
	"""
	Parse the NFBC height and metabolite phenotype files into one
	phenotype object, write it to disk, and return it.
	"""
	phen_dict = {}
	height_file = env['data_dir'] + 'NFBC_20091001/pheno.Height'
	with open(height_file) as f:
		f.next()  # skip header line
		ets = []
		values = []
		for line in f:
			l = line.split()
			ets.append(int(l[0]))
			values.append(float(l[2]))
	phen_dict[1] = {'name':'height', 'ecotypes':ets, 'values':values}

	# Metabolite phenotypes start at pid 2, one per header column after
	# the first two id columns.
	metabolite_file = env['data_dir'] + 'NFBC_20091001/MetaboPheno.txt'
	with open(metabolite_file) as f:
		line = f.next()
		l = map(str.strip, line.split())
		phen_names = l[2:]
		for i, pname in enumerate(phen_names):
			phen_dict[i + 2] = {'name':pname, 'ecotypes':[], 'values':[]}
		ets = []
		values = []
		for line in f:
			l = line.split()
			et = int(l[0])
			for i, v in enumerate(l[2:]):
				try:
					val = int(v)
				except Exception:
					val = float(v)
				# NOTE(review): this keeps only non-integer (float) values that
				# are not -9; every integer-parseable entry is dropped.
				# Presumably integers / -9 encode missing data -- confirm.
				if val != -9 and type(val) != int:
					phen_dict[i + 2]['ecotypes'].append(et)
					phen_dict[i + 2]['values'].append(val)
	phed = pd.phenotype_data(phen_dict)
	# NOTE(review): extension is '.scv' -- possibly a typo for '.csv';
	# kept as-is since downstream code may expect this exact path.
	phed.write_to_file(env['data_dir'] + 'NFBC_20091001/phenotype.scv')
	return phed
def load_gene_expression_traits_2():
	"""
	Parse a normalized expression matrix and write two phenotype files,
	one per temperature (10C and 16C), skipping genes whose expression
	is constant at that temperature.
	"""
	import scipy as sp
	filename = env['home_dir'] + \
			'Projects/Data/rna_seq/expression_matrices_upload_8_01_2011/' + \
			'expression_matrix_wSNPmap_7_29_2011-bioreps_combined_cov_filter-normalized.txt'
	print 'Loading file:', filename
	ets = {'10C':[], '16C':[]}  # ecotype labels per temperature
	i_map = {}  # column index -> temperature
	expressions_dict = {}
	with open(filename, "r") as f:
		i = 0
		line = f.next()
		l = map(str.strip, line.split())
		# Header columns look like '<x><ecotype>_<temperature>'; the first
		# character is dropped and ids 2-5 are relabelled to ecotype/species
		# names.
		for i in range(2, len(l)):
			e_t_l = l[i].split('_')
			et = e_t_l[0][1:]
			t = e_t_l[1]
			i_map[i] = t
			if int(et) == 2:
				et = '6932'
			elif int(et) == 3:
				et = '6980'
			if int(et) == 4:
				et = 'ALyr'
			elif int(et) == 5:
				et = 'ACap'
			ets[t].append(et)

		# Data rows: first two columns are gene name and type, the rest are
		# expression values routed to their temperature via i_map.
		for line in f:
			l = map(str.strip, line.split())
			gene_name = l[0]
			gene_type = l[1]
			d = {'gene_type':gene_type, '10C':[], '16C':[]}
			for i in range(2, len(l)):
				t = i_map[i]
				val = float(l[i])
				d[t].append(val)
			expressions_dict[gene_name] = d
	print 'File was parsed, now constructing phenotype object'
	phen_dict_10C = {}
	phen_dict_16C = {}
	phen_i = 1
	# Constant genes are skipped, but phen_i still advances, so phenotype ids
	# stay aligned between the two temperature dictionaries (with gaps).
	for gene_name in expressions_dict:
		values = expressions_dict[gene_name]['10C']
		if len(sp.unique(values)) > 1:
			phen_dict_10C[phen_i] = {'name':gene_name, 'ecotypes':ets['10C'],
						'values':expressions_dict[gene_name]['10C']}
		phen_i += 1

	phen_i = 1
	for gene_name in expressions_dict:
		values = expressions_dict[gene_name]['16C']
		if len(sp.unique(values)) > 1:
			phen_dict_16C[phen_i] = {'name':gene_name, 'ecotypes':ets['16C'],
						'values':expressions_dict[gene_name]['16C']}
		phen_i += 1


	phed_10C = pd.phenotype_data(phen_dict_10C)
	print 'Phenotype object constructed with %d phenotypes, now writing to phenotype file' % len(phen_dict_10C)
	phed_10C.write_to_file(env['phen_dir'] + 'rna_seq_081411_10C.csv')
	phed_16C = pd.phenotype_data(phen_dict_16C)
	print 'Phenotype object constructed with %d phenotypes, now writing to phenotype file' % len(phen_dict_16C)
	phed_16C.write_to_file(env['phen_dir'] + 'rna_seq_081411_16C.csv')
Exemple #15
0
    def perform_stepwise_gwas(self, phen_name, dataset, transformation, analysis_method, result_name, chromosome, position,
                              call_method_id=75, kinship_method='ibs',progress_file_writer=None):

        """
        Performs GWAS and updates the datastructure.
        """

        #if analysis_method not in ['emmax','lm']:
        #    raise Exception("Step-Wise GWAS only possible with emmax or LM")
        snp = ((int(chromosome), int(position)))
        result_group = self.h5file.getNode('/phenotypes/%s/%s/%s/%s' % (phen_name, dataset, transformation, analysis_method))
        result = result_group._f_getChild(result_name)
        cofactors = result._v_attrs.cofactors[:]
        co_var_snps = [(int(factors['chr']), int(factors['pos'])) for factors in cofactors if 'chr' in factors and 'pos' in factors]
        if snp in co_var_snps:
            raise Exception('The SNP %s,%s is already in the result' % chromosome, position)
        co_var_snps.append(snp)
        co_var_snps = set(co_var_snps)
        #for avail_result in result_group._f_iterNodes(classname='Table'):
        #   if set(avail_result._v_attrs.cofactors) == co_var_snps:
        #      raise Exception("There is already a result with the selected snps") 

        new_result_name = "SW_%s" % result_group._v_nchildren
        name = "%s_%s" % (analysis_method, new_result_name)

        import bisect
        import gwa
        if analysis_method not in ['lm', 'emmax', 'kw']:
            raise Exception('analysis method %s not supported' % analysis_method)
        if analysis_method == 'kw':
            analysis_method = 'emmax'
        progress_file_writer.update_progress_bar(progress=0.0, task_status='Loading phenotype data')
        phen_dict = self.get_phenotype_values(phen_name,dataset, transformation) #Load phenotype
        phend = pd.phenotype_data({1:{'values':phen_dict['mean_value'], 'ecotypes':map(str, phen_dict['ecotype']), 'name':phen_name}})
        phend.convert_to_averages()
        progress_file_writer.update_progress_bar(task_status='Loading genotype data')
        sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary', min_mac=5) #Load SNPs data
        progress_file_writer.update_progress_bar(step=0.05, task_status='Coordinating genotype and phenotype data')
        sd.coordinate_w_phenotype_data(phend, 1)
        sd.filter_monomorphic_snps()
        phen_vals = phend.get_values(1)

        snps = sd.getSnps()
        positions = sd.getPositions()
        chromosomes = []
        progress_file_writer.set_step(0.03)
        for i, (s, c) in enumerate(itertools.izip(sd.snpsDataList, sd.chromosomes)):
            chromosomes.extend([c] * len(s.snps))
            progress_file_writer.update_progress_bar(task_status='Calculating MAFs and direct correlations for Chr %s/%s' %((i+1),len(sd.chromosomes)))
        maf_dict = sd.get_mafs()


        kwargs = {}
        if analysis_method == 'emmax':
            progress_file_writer.update_progress_bar(progress=0.40,task_status='Retrieving the kinship matrix')
            k = dp.load_kinship(call_method_id=75, data_format='binary', method='ibs', accessions=sd.accessions,
            scaled=True, min_mac=5, sd=sd)
            progress_file_writer.update_progress_bar(progress=0.42, task_status='Performing Step-Wise EMMAX')
            d = lm.emmax_step(phen_vals, sd, k, co_var_snps,progress_file_writer=progress_file_writer)
            progress_file_writer.update_progress_bar(0.95, 'Processing and saving results')
            res = d['res']
            stats_dict = d['stats']
        elif analysis_method == 'lm':
            res = lm.linear_model(snps, phen_vals)
        else:
            raise Exception('analysis method %s not supported' % analysis_method)

        if analysis_method in ['lm', 'emmax']:
            if 'betas' in res:
                betas = map(list, zip(*res['betas']))
            else:
                betas = [None, None]
            scores = map(lambda x:-math.log10(x), res['ps'])

            stats_dict['chr'] = snp[0]
            stats_dict['pos'] = snp[1]
            stats_dict['step'] = len(cofactors)
            cofactors.append(stats_dict)

            self.add_results(phen_name,dataset, analysis_method, name, chromosomes, positions, scores, maf_dict['marfs'],
                    maf_dict['mafs'], transformation=transformation,
                    genotype_var_perc=res['var_perc'], beta0=betas[0], beta1=betas[1],
                     cofactors=cofactors, result_name=new_result_name)
        print 'Done!'
        progress_file_writer.update_progress_bar(1.0, 'Done')
        return name
Exemple #16
0
    def perform_gwas(self, phen_name, dataset, transformation='raw', analysis_method='kw', call_method_id=75,
                     kinship_method='ibs', progress_file_writer=None):

        """
        Performs GWAS and updates the datastructure.

        Runs one of 'kw', 'lm' or 'emmax' on the stored phenotype values,
        reporting progress through progress_file_writer, and saves the
        resulting scores via self.add_results.  Returns analysis_method.

        Raises Exception for unsupported analysis methods.
        """

        import bisect
        import gwa
        step_wise = False
        if analysis_method not in ['lm', 'emmax', 'kw']:
            raise Exception('analysis method %s not supported' % analysis_method)

        progress_file_writer.update_progress_bar(progress=0.0, task_status='Loading phenotype data')
        phen_dict = self.get_phenotype_values(phen_name,dataset, transformation) #Load phenotype
        phend = pd.phenotype_data({1:{'values':phen_dict['mean_value'], 'ecotypes':map(str, phen_dict['ecotype']), 'name':phen_name}})
        phend.convert_to_averages()
        progress_file_writer.update_progress_bar(task_status='Loading genotype data')
        sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary', min_mac=5) #Load SNPs data
        progress_file_writer.update_progress_bar(step=0.05, task_status='Coordinating genotype and phenotype data')
        sd.coordinate_w_phenotype_data(phend, 1)
        progress_file_writer.update_progress_bar(progress=0.1,task_status='Filtering monomorphic SNPs')
        sd.filter_monomorphic_snps()
        phen_vals = phend.get_values(1)
        snps = sd.getSnps()
        positions = sd.getPositions()
        chromosomes = []
        progress_file_writer.set_step(0.03)
        # Expand per-chromosome SNP lists into one flat chromosome-label list
        # aligned with `positions`.
        for i, (s, c) in enumerate(itertools.izip(sd.snpsDataList, sd.chromosomes)):
            progress_file_writer.update_progress_bar(task_status='Calculating MAFs and direct correlations for Chr %s/%s' %((i+1),len(sd.chromosomes)))
            chromosomes.extend([c] * len(s.snps))
        maf_dict = sd.get_mafs()

        kwargs = {}
        if analysis_method == 'emmax':
            progress_file_writer.update_progress_bar(progress=0.40,task_status='Retrieving the kinship matrix')
            k = dp.load_kinship(call_method_id=75, data_format='binary', method='ibs', accessions=sd.accessions,
                                scaled=True, min_mac=5, sd=sd)
            progress_file_writer.update_progress_bar(progress=0.42, task_status='Performing EMMAX')
            # Empty cofactor list: this is the step-0 (plain) EMMAX scan.
            d = lm.emmax_step(phen_vals, sd, k, [], progress_file_writer=progress_file_writer)
            progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results')
            res = d['res']
            stats_dict = d['stats']
        elif analysis_method == 'lm':
            progress_file_writer.update_progress_bar(progress=0.3, task_status='Performing LM')
            res = lm.linear_model(snps, phen_vals)
            progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results')
        elif analysis_method == 'kw':
            progress_file_writer.update_progress_bar(progress=0.7, task_status='Performing KW')
            kw_res = util.kruskal_wallis(snps, phen_vals)
            progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results')
            # KW results are saved here directly; lm/emmax are saved below.
            scores = map(lambda x:-math.log10(x), kw_res['ps'])
            self.add_results(phen_name, dataset,analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'],
                    maf_dict['mafs'], transformation=transformation, statistics=kw_res['ds'])
        else:
            raise Exception('analysis method %s not supported' % analysis_method)

        if analysis_method in ['lm', 'emmax']:
            if 'betas' in res:
                betas = map(list, zip(*res['betas']))
            else:
                betas = [None, None]
            scores = map(lambda x:-math.log10(x), res['ps'])
            # This plain scan is step 0 of a potential step-wise sequence.
            stats_dict['step'] = 0
            cofactors = [stats_dict]
            self.add_results(phen_name, dataset, analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'],
                             maf_dict['mafs'], transformation=transformation,
                             genotype_var_perc=res['var_perc'], beta0=betas[0], beta1=betas[1],
                              cofactors=cofactors)
        progress_file_writer.update_progress_bar(progress=1.0, task_status='Done')
        print 'Done!'
        return analysis_method