def parse_phenotype_data(file_name='/tmp/test_phen.csv'):
    """
    Parse the hybrid-incompatibility phenotype file (six criteria per row,
    'NA' marks missing values) and write it out as a phenotype_data CSV.

    file_name -- path of the output phenotype CSV.
    """
    phen_dict = {}
    for pid in range(1, 7):
        phen_dict[pid] = {'name': 'criteria_%d' % pid, 'ecotypes': [],
                          'values': [], 'transformation': None}
    et_set = set()
    with open(env.env['data_dir'] + 'phenot_HybridIncompatibilities.csv') as f:
        f.next()  # skip header line
        for line in f:
            l = map(str.strip, line.split(','))
            # Ecotype key is built from the first two columns.
            et = '%s_%s' % (l[0], l[1])
            if not et in et_set:  # keep only the first record per ecotype
                et_set.add(et)
                for pid in range(1, 7):
                    val = l[pid + 1]
                    if val != 'NA':
                        phen_dict[pid]['values'].append(int(val))
                        phen_dict[pid]['ecotypes'].append(et)
    for pid in range(1, 7):
        # BUGFIX: copy the list -- 'raw_values' previously aliased the very
        # same list object as 'values', so mutating one silently mutated
        # the other.
        phen_dict[pid]['raw_values'] = phen_dict[pid]['values'][:]
    phend = pd.phenotype_data(phen_dict=phen_dict)
    phend.write_to_file(file_name)
def load_gene_expression_traits_3(temperature='10C'): import scipy as sp filename = env['home_dir'] + \ '/Projects/Data/rna_seq/expression_variance_stabilized_11_08_11/' + \ 'expr_%s_merged.csv' % temperature phen_dict = {} phen_i = 1 with open(filename) as f: header = (f.next().strip()).split(',') ets = map(lambda x: x[1:], header[1:]) for i in range(len(ets)): if int(ets[i]) == 2: ets[i] = '6932' elif int(ets[i]) == 3: ets[i] = '6980' if int(ets[i]) == 4: ets[i] = 'ALyr' elif int(ets[i]) == 5: ets[i] = 'ACap' print ets for l in f: line = (l.strip()).split(',') gene_name = line[0] vals = map(float, line[1:]) phen_dict[phen_i] = {'name':gene_name, 'ecotypes':ets, 'values':vals} phen_i += 1 phed = pd.phenotype_data(phen_dict) print 'Phenotype object constructed with %d phenotypes, now writing to phenotype file' % len(phen_dict) phed.write_to_file(env['phen_dir'] + 'rna_seq_vs_081411_%s.csv' % temperature)
def load_phentoype_file_duszynska(): fn1 = env['phen_dir'] + 'seed_size_2n.csv' fn2 = env['phen_dir'] + 'seed_size_3n_2x4.csv' fn3 = env['phen_dir'] + 'seed_size_3n_4x2.csv' fn4 = env['phen_dir'] + 'seed_size_spss.csv' fns = [fn1, fn2, fn3, fn4] phen_names = ['seed_size_2n', 'seed_size_3n_2x4', 'seed_size_3n_4x2', 'seed_size_spss'] phen_dict = {} for i, pn in enumerate(phen_names): phen_dict[i + 1] = {'name':pn } accs_list = [] for i, fn in enumerate(fns): f = open(fn, "r") acc_dict = pd.get_250K_accession_to_ecotype_dict() ecotypes = [] values = [] print f.next() for line in f: l = map(str.strip, line.split(',')) acc = l[0].lower() if not acc in acc_dict: print "(%s) is missing in dictionary" % (acc) else: ecotypes.append(acc_dict[acc][4]) values.append(float(l[1])) f.close() phen_dict[i + 1]['ecotypes'] = ecotypes phen_dict[i + 1]['values'] = values phed = pd.phenotype_data(phen_dict) phed.write_to_file(env['phen_dir'] + 'seed_size.csv')
def load_phentoype_file_dilkes(): filename = env['phen_dir'] + 'dilkes_metabolites.csv' f = open(filename, "r") print f.next() header = f.next() phen_names = map(str.strip, header.split(',')[2:]) print phen_names pids = range(len(phen_names)) accessions = [] phen_dict = {} for pid, name in zip(pids, phen_names): phen_dict[pid] = {'name':name, 'ecotypes':[], 'values':[]} for line in f: l = map(str.strip, line.split(',')) accessions.append(l[1].lower()) for i in pids: phen_dict[i]['values'].append(float(l[2 + i])) f.close() acc_dict = pd.get_250K_accession_to_ecotype_dict() # acc_dict['buckhorn'] = [0, 0, 0, 0, 7033] acc_dict['sakhdara'] = [0, 0, 0, 0, 6962] ecotypes = [] for acc in accessions: if not acc in acc_dict: print "%s is missing in dictionary" % acc else: ecotype = acc_dict[acc][4] ecotypes.append(ecotype) for pid, name in zip(pids, phen_names): phen_dict[pid]['ecotypes'] = ecotypes phed = pd.phenotype_data(phen_dict) phed.write_to_file(env['phen_dir'] + 'b_dilkes_metabolites.csv')
def load_duszynska_file4(): """ Loads the heterosis data. """ fn = env['home_dir'] + 'Projects/duszynska_data/seed_size_heterosis.csv' acc_dict = pd.get_250K_accession_to_ecotype_dict() phen_dict = {} name_dict = {'knox-18':'kno-18', 'knox-10':'kno-10', 'kas-1':'kas-2', 'pu-2-7':'pu2-7', 'cs22491':'n13', 'shahdara':'sha'} with open(fn) as f: ets = [] header = f.next() phen_names = map(str.strip, header.split(',')) et_indices = [0, 3, 6, 9] phen_names = [phen_names[1], phen_names[4], phen_names[7], phen_names[10]] for i, pn in zip([1, 2, 3, 4], phen_names): phen_dict[i] = {'name': pn, 'values':[], 'ecotypes':[]} for line in f: l = map(str.strip, line.split(',')) for e_i, pid in zip(et_indices, [1, 2, 3, 4]): acc = l[e_i].lower() if acc in name_dict: acc = name_dict[acc] if not acc in acc_dict: print "(%s) is missing in dictionary" % (acc) else: phen_dict[pid]['ecotypes'].append(acc_dict[acc][4]) phen_dict[pid]['values'].append(float(l[e_i + 1])) phed = pd.phenotype_data(phen_dict) phed.write_to_file(env['phen_dir'] + 'duszynska_heterosis_data.csv')
def load_duszynska_file2(): fn1 = env[ 'home_dir'] + 'Projects/duszynska_data/male_data_proportion_AN.csv' fn2 = env[ 'home_dir'] + 'Projects/duszynska_data/female_data_proportion_AN.csv' acc_dict = pd.get_250K_accession_to_ecotype_dict() phen_names = [] phen_dict = {} name_dict = { 'knox-18': 'kno-18', 'knox-10': 'kno-10', 'kas-1': 'kas-2', 'pu-2-7': 'pu2-7', 'cs22491': 'n13', 'shahdara': 'sha' } with open(fn1) as f: ets = [] header = f.next() phen_names = map(str.strip, header.split(',')[1:]) for i, pn in zip([1, 2, 3], phen_names): phen_dict[i] = {'name': 'male_' + pn, 'values': []} for line in f: l = map(str.strip, line.split(',')) acc = l[0].lower() if acc in name_dict: acc = name_dict[acc] if not acc in acc_dict: print "(%s) is missing in dictionary" % (acc) else: ets.append(acc_dict[acc][4]) for pid in [1, 2, 3]: phen_dict[pid]['values'].append(float(l[pid])) for pid in [1, 2, 3]: phen_dict[pid]['ecotypes'] = ets[:] with open(fn2) as f: ets = [] header = f.next() phen_names = map(str.strip, header.split(',')[1:]) for i, pn in zip([4, 5, 6], phen_names): phen_dict[i] = {'name': 'female_' + pn, 'values': []} for line in f: l = map(str.strip, line.split(',')) acc = l[0].lower() if acc in name_dict: acc = name_dict[acc] if not acc in acc_dict: print "(%s) is missing in dictionary" % (acc) else: ets.append(acc_dict[acc][4]) for pid in [4, 5, 6]: phen_dict[pid]['values'].append(float(l[pid - 3])) for pid in [4, 5, 6]: phen_dict[pid]['ecotypes'] = ets[:] phed = pd.phenotype_data(phen_dict) phed.write_to_file(env['phen_dir'] + 'duszynska_data.csv')
def load_skin_color_traits(): dir_prefix = env['home_dir'] + 'Projects/data/skin_eye_color/' #dir_prefix = env.env['home_dir'] + 'Projects/Data/Skin_color/' filename = dir_prefix + 'CV685-skin_eye_color.txt' d = { 1: { 'name': 'skin_color', 'ecotypes': [], 'values': [] }, 2: { 'name': 'eye_color', 'ecotypes': [], 'values': [] } } sc_vals = [] ec_vals = [] sc_iids = [] #individual IDs (ecotypes) with open(filename) as f: print f.next() for line in f: l = line.split() if int(float(l[2])) != -9: d[1]['values'].append(float(l[2])) d[1]['ecotypes'].append(l[1]) if int(float(l[3])) != -9: d[2]['values'].append(float(l[3])) d[2]['ecotypes'].append(l[1]) phed = pd.phenotype_data(d) phed.write_to_file(dir_prefix + 'phenotypes.csv') return phed
def load_gene_expression_traits():
    """
    Parse the 2011-02-08 RNA-seq gene-expression table, keep only the 16C
    samples, drop genes with constant expression, and write a phenotype CSV.

    File layout (inferred from the parsing below): the file begins with
    '#'-prefixed metadata lines, one per sample, where field 7 is the growth
    temperature and field 5 the ecotype name; the first non-'#' line is the
    gene-ID header; every following line is one gene with one expression
    value per sample column.
    """
    filename = '/Users/bjarni.vilhjalmsson/Projects/Data/rna_seq/gene_expression_table_20110208.tsv'
    ecotypes = []
    et_ids = []  # 0-based sample indices of the 16C samples
    import scipy as sp
    with open(filename, "r") as f:
        i = 0
        # Metadata pass: record which samples were grown at 16C.
        for line in f:
            if line[0] != '#':
                break  # first non-metadata line reached (the header)
            l = line.split('\t')
            if l[7] == '16C':
                ecotypes.append(l[5])
                et_ids.append(i)
            i += 1
        gene_ids = line.split('\t')  # the header row that broke the loop
        print len(ecotypes), len(set(ecotypes))
        print ecotypes
        # Map named ecotypes to numeric ecotype IDs; unknown names are
        # passed through unchanged.
        et_dict = {'Col-0': '6909', 'Col': '6909', 'Ler': '6932', 'Ws-0': '6980'}
        ets = []
        for et in ecotypes:
            if et in et_dict:
                ets.append(et_dict[et])
            else:
                ets.append(et)
        print ets
        phen_dict = {}
        num_const_phen = 0
        phen_i = 1
        # Iteration of f resumes where the metadata pass broke off, i.e. at
        # the first gene row.
        for i, line in enumerate(f):  #For each gene
            l = line.split('\t')
            phen_name = l[0]
            phen_vals = map(float, l[1:])
            # Keep only the 16C sample columns.  NOTE: the comprehension's
            # 'i' intentionally shadows the enumerate index here.
            phen_vals = [phen_vals[i] for i in et_ids]
            if len(phen_vals) != len(ets):
                raise Exception('Arrg')
            # Constant expression carries no mappable signal; drop it.
            if len(sp.unique(phen_vals)) > 1:
                phen_dict[phen_i] = {'name': phen_name, 'ecotypes': ets, 'values': phen_vals}
                phen_i += 1
            else:
                num_const_phen += 1
    print 'Total number of gene expressions was %d, of which %d were constant (removed), leaving %d gene expressions.' \
        % ((phen_i - 1) + num_const_phen, num_const_phen, phen_i - 1)
    phed = pd.phenotype_data(phen_dict)
    phed.write_to_file(env['phen_dir'] + 'rna_seq_020811_16C.csv')
def load_phentoype_file_nc_resistance_3():
    """
    Parse the 20dd5 resistance file (R/S scored), reconcile the file's
    ecotype IDs with the accession->ecotype dictionary, and write a binary
    phenotype CSV (R=0, anything else=1).
    """
    filename = "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/20dd5_330.csv"
    with open(filename) as f:
        line = map(str.strip, f.next().split(','))
        phenotype_names = line[-1:]  # only the last header column is used
        print phenotype_names
        phenotypes = []
        accession_names = []
        ecotypes = []
        full_accession_names = []
        for l in f:
            line = map(str.strip, l.split(','))
            accession_names.append(line[1].lower())
            ecotypes.append(line[0])
            full_accession_names.append(line[5].lower())
            phenotypes.append(line[4])
    print accession_names
    acc_dict = pd.get_accession_to_ecotype_id_dict(accession_names)  #+["n13","kno-10","kno-10","shahdara","nd-1"])
    # acc_dict["cibc-5"] = 6908
    # acc_dict["wa-1"] = 6978
    # acc_dict["gu-0"] = 7149
    # acc_dict['Rubezhnoe-1'] = 7323
    print len(acc_dict), acc_dict
    ets = []
    phen_vals = []
    for acc1, acc2, et, pt in zip(accession_names, full_accession_names, ecotypes, phenotypes):
        if acc1 in acc_dict:
            ecotype = acc_dict[acc1]
            # If the file's ecotype disagrees with the dictionary, report it
            # and keep the file's value; otherwise adopt the dictionary ID.
            # (unicode(..., "latin-1") guards against non-ASCII names in the
            # printed message -- Python 2.)
            if str(ecotype) != et and et != 'NA':
                print "Ecotype mismatch.. %s, %s, %s, %s" % (unicode(acc1, "latin-1"), unicode(acc2, "latin-1"), et, ecotype)
            else:
                et = ecotype
        if et != 'NA' and et != '':
            ets.append(et)
            if not pt in ['R', 'S']:
                print pt  # unexpected phenotype code (still scored as 1 below)
            # R maps to 0; S -- and any unexpected code -- maps to 1.
            phen_vals.append(0 if pt == 'R' else 1)
    print len(phen_vals)
    phen_dict = {1: {'name': 'resistance_20dd5', 'ecotypes': ets, 'values': phen_vals}}
    phed = pd.phenotype_data(phen_dict)
    phed.write_to_file('resistance_20dd5.csv', ',')
def load_phentoype_file_riha(): filename = env['phen_dir'] + 'telomere_lengths_192_raw.csv' f = open(filename, "r") phen_name = 'telomere_length' accession_names = [] accession_ids = [] parent_ids = [] phen_vals = [] print f.next() for line in f: l = map(str.strip, line.split(',')) parent_ids.append(l[0]) acc_l = l[1].split() acc_name = acc_l[0] if len(acc_l) > 1: acc_id = acc_l[1] else: acc_id = '' accession_names.append(acc_name.lower()) accession_ids.append(acc_id) phen_vals.append(float(l[2])) f.close() print accession_names acc_dict = pd.get_250K_accession_to_ecotype_dict() acc_dict['buckhorn'] = [0, 0, 0, 0, 7033] acc_dict['shahdara'] = [0, 0, 0, 0, 6962] ecotypes = [] uncertain_list = [] new_phen_vals = [] for acc, par_id, acc_id, phen_val in zip(accession_names, parent_ids, accession_ids, phen_vals): if not acc in acc_dict: print "(%s, %s, %s) is missing in dictionary" % (acc, par_id, acc_id) else: ecotype = acc_dict[acc][4] ecotypes.append(ecotype) new_phen_vals.append(phen_val) print len(set(accession_names)), len(set(ecotypes)) phen_dict = { 1: { 'name': phen_name, 'ecotypes': ecotypes, 'values': new_phen_vals } } phed = pd.phenotype_data(phen_dict) phed.write_to_file(env['phen_dir'] + 'telomere_lengths_192.csv')
def load_total_expressions():
    """
    Parse the total-expression row of the RNA-seq expression matrix and
    write one 'total_expression' phenotype per temperature (10C, 16C).

    NOTE(review): only ONE data row is read after the header (the single
    f.next() below), and the final loop indexes expressions_dict['GENE'] --
    this assumes the 'GENE' summary row is the first data line of the file.
    Confirm against the matrix format; any other first row raises KeyError.
    """
    import scipy as sp
    filename = env['home_dir'] + \
        'Projects/Data/rna_seq/expression_matrix_upload_data_5_10_2011/mapping_files/' + \
        'all_expression_matrix_5_09_2011_flagged_removed_libID_bioreps_combined_cov_filter.txt'
    print 'Loading file:', filename
    ets = {'10C': [], '16C': []}  # ecotype code lists, keyed by temperature
    i_map = {}  # column index -> temperature
    expressions_dict = {}
    with open(filename, "r") as f:
        i = 0
        line = f.next()  # header: sample columns look like '<X><code>_<temp>'
        l = map(str.strip, line.split())
        for i in range(2, len(l)):
            e_t_l = l[i].split('_')
            et = e_t_l[0][1:]  # drop the leading marker character
            t = e_t_l[1]
            i_map[i] = t
            ets[t].append(et)
        line = f.next()  # first data row (expected to be the 'GENE' totals)
        l = map(str.strip, line.split())
        gene_name = l[0]
        gene_type = l[1]
        d = {'gene_type': gene_type, '10C': [], '16C': []}
        # Route each value column to its temperature bucket.
        for i in range(2, len(l)):
            t = i_map[i]
            val = float(l[i])
            d[t].append(val)
        expressions_dict[gene_name] = d
    print 'File was parsed, now constructing phenotype object'
    phen_dict = {}
    phen_i = 1
    for t in ['10C', '16C']:
        values = expressions_dict['GENE'][t]  # see NOTE in the docstring
        phen_dict[phen_i] = {'name': 'total_expression_%s' % t, 'ecotypes': ets[t], 'values': values}
        phen_i += 1
    phed = pd.phenotype_data(phen_dict)
    print 'Phenotype object constructed with %d phenotypes, now writing to phenotype file' % len(phen_dict)
    phed.write_to_file(env['phen_dir'] + 'rna_seq_061611_total.csv')
def load_genome_size_factors():
    """
    Parse the gene-model measures TSV (first column: ecotype ID, one column
    per measure) into phenotypes sharing one ecotype list, and write a
    phenotype CSV.
    """
    phen_file = env['phen_dir'] + 'measures_gene_model.tsv'
    phen_dict = {}
    ets = []
    with open(phen_file) as f:
        line = f.next()
        names = line.split()[1:]  # skip the ecotype-ID column header
        for i, name in enumerate(names):
            phen_dict[i + 1] = {'values': [], 'name': name, 'transformation': None}
        for line in f:
            l = line.split()
            ets.append(l[0])
            for i, v in enumerate(l[1:]):
                phen_dict[i + 1]['values'].append(float(v))
    for i in phen_dict:
        phen_dict[i]['ecotypes'] = ets[:]
        # BUGFIX: copy the list -- 'raw_values' previously aliased the very
        # same list object as 'values', so mutating one silently mutated
        # the other.
        phen_dict[i]['raw_values'] = phen_dict[i]['values'][:]
    phed = pd.phenotype_data(phen_dict)
    phed.write_to_file(env['phen_dir'] + 'measures_gene_model.csv')
def parse_NFBC_traits():
    """
    Parse the NFBC height and metabolite phenotype files into a single
    phenotype_data object and write it out.

    Returns the phenotype_data object.
    """
    phen_dict = {}
    height_file = env['data_dir'] + 'NFBC_20091001/pheno.Height'
    with open(height_file) as f:
        f.next()  # header line
        ets = []
        values = []
        for line in f:
            l = line.split()
            ets.append(int(l[0]))
            values.append(float(l[2]))
    phen_dict[1] = {'name': 'height', 'ecotypes': ets, 'values': values}
    metabolite_file = env['data_dir'] + 'NFBC_20091001/MetaboPheno.txt'
    with open(metabolite_file) as f:
        line = f.next()
        l = map(str.strip, line.split())
        phen_names = l[2:]
        for i, pname in enumerate(phen_names):
            phen_dict[i + 2] = {'name': pname, 'ecotypes': [], 'values': []}
        # (Removed unused 'ets'/'values' locals that shadowed the height
        # lists but were never appended to.)
        for line in f:
            l = line.split()
            et = int(l[0])
            for i, v in enumerate(l[2:]):
                try:
                    val = int(v)
                except Exception:
                    val = float(v)
                # Keep only genuine float measurements: values that parse as
                # plain integers (e.g. the -9 missing code) are skipped, as
                # are float-valued -9.0 entries.
                if val != -9 and type(val) != int:
                    phen_dict[i + 2]['ecotypes'].append(et)
                    phen_dict[i + 2]['values'].append(val)
    phed = pd.phenotype_data(phen_dict)
    # BUGFIX: the output extension was misspelled '.scv'.
    phed.write_to_file(env['data_dir'] + 'NFBC_20091001/phenotype.csv')
    return phed
def load_gene_expression_traits_2(): import scipy as sp filename = env['home_dir'] + \ 'Projects/Data/rna_seq/expression_matrices_upload_8_01_2011/' + \ 'expression_matrix_wSNPmap_7_29_2011-bioreps_combined_cov_filter-normalized.txt' print 'Loading file:', filename ets = {'10C':[], '16C':[]} i_map = {} expressions_dict = {} with open(filename, "r") as f: i = 0 line = f.next() l = map(str.strip, line.split()) for i in range(2, len(l)): e_t_l = l[i].split('_') et = e_t_l[0][1:] t = e_t_l[1] i_map[i] = t if int(et) == 2: et = '6932' elif int(et) == 3: et = '6980' if int(et) == 4: et = 'ALyr' elif int(et) == 5: et = 'ACap' ets[t].append(et) for line in f: l = map(str.strip, line.split()) gene_name = l[0] gene_type = l[1] d = {'gene_type':gene_type, '10C':[], '16C':[]} for i in range(2, len(l)): t = i_map[i] val = float(l[i]) d[t].append(val) expressions_dict[gene_name] = d print 'File was parsed, now constructing phenotype object' phen_dict_10C = {} phen_dict_16C = {} phen_i = 1 for gene_name in expressions_dict: values = expressions_dict[gene_name]['10C'] if len(sp.unique(values)) > 1: phen_dict_10C[phen_i] = {'name':gene_name, 'ecotypes':ets['10C'], 'values':expressions_dict[gene_name]['10C']} phen_i += 1 phen_i = 1 for gene_name in expressions_dict: values = expressions_dict[gene_name]['16C'] if len(sp.unique(values)) > 1: phen_dict_16C[phen_i] = {'name':gene_name, 'ecotypes':ets['16C'], 'values':expressions_dict[gene_name]['16C']} phen_i += 1 phed_10C = pd.phenotype_data(phen_dict_10C) print 'Phenotype object constructed with %d phenotypes, now writing to phenotype file' % len(phen_dict_10C) phed_10C.write_to_file(env['phen_dir'] + 'rna_seq_081411_10C.csv') phed_16C = pd.phenotype_data(phen_dict_16C) print 'Phenotype object constructed with %d phenotypes, now writing to phenotype file' % len(phen_dict_16C) phed_16C.write_to_file(env['phen_dir'] + 'rna_seq_081411_16C.csv')
def perform_stepwise_gwas(self, phen_name, dataset, transformation, analysis_method, result_name, chromosome, position, call_method_id=75, kinship_method='ibs',progress_file_writer=None): """ Performs GWAS and updates the datastructure. """ #if analysis_method not in ['emmax','lm']: # raise Exception("Step-Wise GWAS only possible with emmax or LM") snp = ((int(chromosome), int(position))) result_group = self.h5file.getNode('/phenotypes/%s/%s/%s/%s' % (phen_name, dataset, transformation, analysis_method)) result = result_group._f_getChild(result_name) cofactors = result._v_attrs.cofactors[:] co_var_snps = [(int(factors['chr']), int(factors['pos'])) for factors in cofactors if 'chr' in factors and 'pos' in factors] if snp in co_var_snps: raise Exception('The SNP %s,%s is already in the result' % chromosome, position) co_var_snps.append(snp) co_var_snps = set(co_var_snps) #for avail_result in result_group._f_iterNodes(classname='Table'): # if set(avail_result._v_attrs.cofactors) == co_var_snps: # raise Exception("There is already a result with the selected snps") new_result_name = "SW_%s" % result_group._v_nchildren name = "%s_%s" % (analysis_method, new_result_name) import bisect import gwa if analysis_method not in ['lm', 'emmax', 'kw']: raise Exception('analysis method %s not supported' % analysis_method) if analysis_method == 'kw': analysis_method = 'emmax' progress_file_writer.update_progress_bar(progress=0.0, task_status='Loading phenotype data') phen_dict = self.get_phenotype_values(phen_name,dataset, transformation) #Load phenotype phend = pd.phenotype_data({1:{'values':phen_dict['mean_value'], 'ecotypes':map(str, phen_dict['ecotype']), 'name':phen_name}}) phend.convert_to_averages() progress_file_writer.update_progress_bar(task_status='Loading genotype data') sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary', min_mac=5) #Load SNPs data progress_file_writer.update_progress_bar(step=0.05, task_status='Coordinating genotype and 
phenotype data') sd.coordinate_w_phenotype_data(phend, 1) sd.filter_monomorphic_snps() phen_vals = phend.get_values(1) snps = sd.getSnps() positions = sd.getPositions() chromosomes = [] progress_file_writer.set_step(0.03) for i, (s, c) in enumerate(itertools.izip(sd.snpsDataList, sd.chromosomes)): chromosomes.extend([c] * len(s.snps)) progress_file_writer.update_progress_bar(task_status='Calculating MAFs and direct correlations for Chr %s/%s' %((i+1),len(sd.chromosomes))) maf_dict = sd.get_mafs() kwargs = {} if analysis_method == 'emmax': progress_file_writer.update_progress_bar(progress=0.40,task_status='Retrieving the kinship matrix') k = dp.load_kinship(call_method_id=75, data_format='binary', method='ibs', accessions=sd.accessions, scaled=True, min_mac=5, sd=sd) progress_file_writer.update_progress_bar(progress=0.42, task_status='Performing Step-Wise EMMAX') d = lm.emmax_step(phen_vals, sd, k, co_var_snps,progress_file_writer=progress_file_writer) progress_file_writer.update_progress_bar(0.95, 'Processing and saving results') res = d['res'] stats_dict = d['stats'] elif analysis_method == 'lm': res = lm.linear_model(snps, phen_vals) else: raise Exception('analysis method %s not supported' % analysis_method) if analysis_method in ['lm', 'emmax']: if 'betas' in res: betas = map(list, zip(*res['betas'])) else: betas = [None, None] scores = map(lambda x:-math.log10(x), res['ps']) stats_dict['chr'] = snp[0] stats_dict['pos'] = snp[1] stats_dict['step'] = len(cofactors) cofactors.append(stats_dict) self.add_results(phen_name,dataset, analysis_method, name, chromosomes, positions, scores, maf_dict['marfs'], maf_dict['mafs'], transformation=transformation, genotype_var_perc=res['var_perc'], beta0=betas[0], beta1=betas[1], cofactors=cofactors, result_name=new_result_name) print 'Done!' progress_file_writer.update_progress_bar(1.0, 'Done') return name
def perform_gwas(self, phen_name, dataset,transformation='raw', analysis_method='kw', call_method_id=75, kinship_method='ibs', progress_file_writer=None): """ Performs GWAS and updates the datastructure. """ import bisect import gwa step_wise = False if analysis_method not in ['lm', 'emmax', 'kw']: raise Exception('analysis method %s not supported' % analysis_method) progress_file_writer.update_progress_bar(progress=0.0, task_status='Loading phenotype data') phen_dict = self.get_phenotype_values(phen_name,dataset, transformation) #Load phenotype phend = pd.phenotype_data({1:{'values':phen_dict['mean_value'], 'ecotypes':map(str, phen_dict['ecotype']), 'name':phen_name}}) phend.convert_to_averages() progress_file_writer.update_progress_bar(task_status='Loading genotype data') sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary', min_mac=5) #Load SNPs data progress_file_writer.update_progress_bar(step=0.05, task_status='Coordinating genotype and phenotype data') sd.coordinate_w_phenotype_data(phend, 1) progress_file_writer.update_progress_bar(progress=0.1,task_status='Filtering monomorphic SNPs') sd.filter_monomorphic_snps() phen_vals = phend.get_values(1) snps = sd.getSnps() positions = sd.getPositions() chromosomes = [] progress_file_writer.set_step(0.03) for i, (s, c) in enumerate(itertools.izip(sd.snpsDataList, sd.chromosomes)): progress_file_writer.update_progress_bar(task_status='Calculating MAFs and direct correlations for Chr %s/%s' %((i+1),len(sd.chromosomes))) chromosomes.extend([c] * len(s.snps)) maf_dict = sd.get_mafs() kwargs = {} if analysis_method == 'emmax': progress_file_writer.update_progress_bar(progress=0.40,task_status='Retrieving the kinship matrix') k = dp.load_kinship(call_method_id=75, data_format='binary', method='ibs', accessions=sd.accessions, scaled=True, min_mac=5, sd=sd) progress_file_writer.update_progress_bar(progress=0.42, task_status='Performing EMMAX') d = lm.emmax_step(phen_vals, sd, k, [], 
progress_file_writer=progress_file_writer) progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results') res = d['res'] stats_dict = d['stats'] elif analysis_method == 'lm': progress_file_writer.update_progress_bar(progress=0.3, task_status='Performing LM') res = lm.linear_model(snps, phen_vals) progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results') elif analysis_method == 'kw': progress_file_writer.update_progress_bar(progress=0.7, task_status='Performing KW') kw_res = util.kruskal_wallis(snps, phen_vals) progress_file_writer.update_progress_bar(progress=0.95, task_status='Processing and saving results') scores = map(lambda x:-math.log10(x), kw_res['ps']) self.add_results(phen_name, dataset,analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'], maf_dict['mafs'], transformation=transformation, statistics=kw_res['ds']) else: raise Exception('analysis method %s not supported' % analysis_method) if analysis_method in ['lm', 'emmax']: if 'betas' in res: betas = map(list, zip(*res['betas'])) else: betas = [None, None] scores = map(lambda x:-math.log10(x), res['ps']) stats_dict['step'] = 0 cofactors = [stats_dict] self.add_results(phen_name, dataset, analysis_method, analysis_method, chromosomes, positions, scores, maf_dict['marfs'], maf_dict['mafs'], transformation=transformation, genotype_var_perc=res['var_perc'], beta0=betas[0], beta1=betas[1], cofactors=cofactors) progress_file_writer.update_progress_bar(progress=1.0, task_status='Done') print 'Done!' return analysis_method