def load_a_thaliana_genotypes():
    """
    Loads A. thaliana genotypes (Horton et al., 2012) and returns a snps_data object.
    """
    import dataParsers as dp
    sd = dp.parse_snp_data('at_data/all_chromosomes_binary.csv')
    return sd
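# A hedged usage sketch (not part of the original pipeline): load the
# Horton et al. (2012) genotypes and report basic dimensions.  Assumes the
# at_data/ directory is reachable from the working directory, as in the
# loader above.
def _example_load_a_thaliana_():
    sd = load_a_thaliana_genotypes()
    print 'Loaded %d accessions' % len(sd.accessions)
    print 'Total markers: %d' % len(sd.getChrPosList())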
def _insert_markers_into_db_():
    import bisect
    import dbutils
    import dataParsers
    sd_192 = dataParsers.parse_snp_data(
        '/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_192_043009.csv')
    sd_t57 = dataParsers.parse_snp_data(
        '/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_merged_2010_250K_w_FLC_seq.csv')
    cpl_192 = sd_192.getChrPosList()
    cpsl_t57 = sd_t57.getChrPosSNPList()
    conn = dbutils.connect_to_papaya()
    cursor = conn.cursor()
    for c, p, snp in cpsl_t57:
        if c < 5:  # Only chromosome 5 markers are of interest here.
            continue
        i = bisect.bisect(cpl_192, (c, p))
        if cpl_192[i - 1] != (c, p):
            # The marker is not in the 192 data; check if it is already in the DB.
            alleles = list(set(snp))
            sql_statement = ("SELECT id FROM stock_250k.snps WHERE chromosome=%d AND position=%d "
                             "AND (allele1='%s' OR allele1='%s');" % (c, p, alleles[0], alleles[1]))
            print sql_statement
            cursor.execute(sql_statement)
            row = cursor.fetchone()
            if row:
                print row
            else:
                # Insert the SNP into the DB.
                snp_name = '%d_%d_%s_%s' % (c, p, alleles[0], alleles[1])
                sql_statement = ("INSERT INTO stock_250k.snps (name, chromosome, position, allele1, allele2) "
                                 "VALUES ('%s',%d,%d,'%s','%s');" % (snp_name, c, p, alleles[0], alleles[1]))
                print sql_statement
                try:
                    cursor.execute(sql_statement)
                    print "Committing transaction (making changes permanent)."
                    conn.commit()
                except Exception, err_str:
                    print "Insert failed (%s)... moving on." % str(err_str)
    # Close the connection.
    cursor.close()
    conn.close()
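# The bisect-based membership test above is the recurring idiom in this
# module for asking whether a (chromosome, position) pair occurs in a
# sorted list.  A self-contained sketch of the idiom on toy data:
def _example_bisect_lookup_():
    import bisect
    sorted_chr_pos = [(1, 100), (1, 250), (2, 50)]  # Must be sorted.
    for query in [(1, 250), (2, 51)]:
        i = bisect.bisect(sorted_chr_pos, query)
        # bisect returns the insertion point to the right of any equal
        # element, so the element at i - 1 equals the query iff present.
        found = i > 0 and sorted_chr_pos[i - 1] == query
        print query, 'found:', found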
def generate_384_snps_illumina_file():
    import csv
    import dataParsers as dp
    import sequences
    #sd = dp.parse_snp_data("/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_data_t43_081009.csv")
    sd = dp.parse_snp_data("/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_192_043009.csv")
    locus_names = []
    locus_sequences = []
    allele_1s = []
    allele_2s = []
    chromosomes = []
    positions = []
    col_alleles = []
    for ci, chromosome in enumerate([1, 2, 3, 4, 5]):
        col = sequences.get_col_sequence(chromosome)
        seq_len = len(col.seq)
        snpsd = sd.snpsDataList[ci]
        for i, pos in enumerate(snpsd.positions):
            start_pos = max(0, pos - 61)
            end_pos = min(seq_len, pos + 60)
            snp = list(set(snpsd.snps[i]))
            if len(snp) == 2 and pos < seq_len and '-' not in snp:
                col_allele = col.seq[pos - 1]  # Positions are 1-based.
                if col_allele == str(snp[0]):
                    other_allele = str(snp[1])
                else:
                    other_allele = str(snp[0])
                    if col_allele != str(snp[1]):
                        raise Exception('Col-0 allele matches neither SNP allele.')
                allele_1s.append(col.seq[pos - 3:pos + 2])
                allele_2s.append(col.seq[pos - 3:pos - 1] + other_allele + col.seq[pos:pos + 2])
                snp_str = '[' + str(snp[0]) + '/' + str(snp[1]) + ']'
                local_seq = col.seq[start_pos:pos - 1] + snp_str + col.seq[pos:end_pos]
                locus_names.append('c' + str(chromosome) + '_p' + str(pos))
                positions.append(pos)
                locus_sequences.append(local_seq)
                chromosomes.append(chromosome)
                if col.seq[pos - 1] not in snp:
                    print col.seq[pos - 1], snp, chromosome, pos
                col_alleles.append(col.seq[pos - 1])
    w = csv.writer(open("/Users/bjarnivilhjalmsson/tmp/test.csv", 'w'))
    # w.writerow(['Locus_Name', 'Target_Type', 'Sequence', 'Chromosome', 'Coordinate', 'Genome_Build_Version',
    #             'Source', 'Source_Version', 'Sequence_Orientation', 'Plus_Minus'])
    w.writerow(['Chromosome', 'Coordinate', 'Allele_1', 'Allele_2',
                'Genome_Build_Version', 'Sequence_Orientation', 'Plus_Minus'])
    # for (ln, ls, c, p) in zip(locus_names, locus_sequences, chromosomes, positions):
    #     w.writerow([ln, 'SNP', ls, c, p, 'TAIR8', 'TAIR', '8', 'Forward', 'Plus'])
    for (ln, a1, a2, c, p) in zip(locus_names, allele_1s, allele_2s, chromosomes, positions):
        # 'Plus' added so rows match the seven-column header above.
        w.writerow([c, p, a1, a2, 'TAIR8', 'Forward', 'Plus'])
    print locus_sequences[100:110]
    print col_alleles[100:110]
def _get_genotype_data_(p_dict):
    if p_dict['data_file']:
        sd = dataParsers.parse_snp_data(p_dict['data_file'], format=p_dict['data_format'],
                                        filter=p_dict['debug_filter'])
    else:
        cm_id = p_dict['call_method_id']
        df = p_dict['data_format']
        #df = df if not cm_id in [78, 79] else 'diploid_int'
        sd = dataParsers.load_snps_call_method(cm_id, data_format=df,
                                               debug_filter=p_dict['debug_filter'])
    return sd
def _insert_merged_data_in_db_():
    """
    Ad hoc fix: restore array IDs on the merged t57 data (using the t54 data
    as the ecotype-to-array map) and write it out in Yu's format.
    """
    sd_t54 = dataParsers.parse_snp_data(
        '/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_t54.csv', filter=0.001)
    # Map ecotype IDs to array IDs using the t54 data.
    d = {}
    for eid, aid in zip(sd_t54.accessions, sd_t54.array_ids):
        d[eid] = aid
    sd_t57 = dataParsers.parse_snp_data(
        '/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_merged_2010_250K_w_FLC_seq.csv')
    aids = [d[eid] for eid in sd_t57.accessions]
    # for sd in sd_t57.snpsDataList:
    #     sd.arrayIds = aids
    sd_t57.arrayIds = aids
    sd_t57.write_to_file_yu_format(
        '/Users/bjarnivilhjalmsson/Projects/Data/250k/call_method_57.tsv')
def write_simple_toomaijan_file(filename="/Users/bjarnivilhjalmsson/tmp/test.csv", window=25):
    import csv
    import dataParsers as dp
    import sequences
    sd = dp.parse_snp_data("/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_t54.csv")
    locus_names = []
    locus_sequences = []
    allele_1s = []
    allele_2s = []
    chromosomes = []
    positions = []
    col_alleles = []
    for ci, chromosome in enumerate([1, 2, 3, 4, 5]):
        col = sequences.get_col_sequence(chromosome)
        seq_len = len(col.seq)
        snpsd = sd.snpsDataList[ci]
        for i, pos in enumerate(snpsd.positions):
            start_pos = max(0, pos - window - 1)
            end_pos = min(seq_len, pos + window)
            snp = list(set(snpsd.snps[i]))
            if len(snp) == 2 and pos < seq_len and '-' not in snp:
                col_allele = col.seq[pos - 1]  # Positions are 1-based.
                if col_allele == str(snp[0]):
                    other_allele = str(snp[1])
                else:
                    other_allele = str(snp[0])
                    if col_allele != str(snp[1]):
                        raise Exception('Col-0 allele matches neither SNP allele.')
                allele_1s.append(col.seq[pos - 3:pos + 2])
                allele_2s.append(col.seq[pos - 3:pos - 1] + other_allele + col.seq[pos:pos + 2])
                snp_str = '[' + col_allele + '/' + other_allele + ']'
                local_seq = col.seq[start_pos:pos - 1] + snp_str + col.seq[pos:end_pos]
                locus_names.append('c' + str(chromosome) + '_p' + str(pos))
                positions.append(pos)
                locus_sequences.append(local_seq)
                chromosomes.append(chromosome)
                if col.seq[pos - 1] not in snp:
                    print col.seq[pos - 1], snp, chromosome, pos
                col_alleles.append(col.seq[pos - 1])
    w = csv.writer(open(filename, 'w'))
    w.writerow(['Chromosome', 'Coordinate', 'Sequence', 'Genome_Build_Version',
                'Sequence_Orientation'])
    for (ls, c, p) in zip(locus_sequences, chromosomes, positions):
        w.writerow([c, p, ls, 'TAIR8', 'Forward'])
    print locus_sequences[100:110]
    print col_alleles[100:110]
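# Both file writers above slice flanking sequence around a SNP using
# 1-based genome coordinates against 0-based Python strings, so
# col.seq[pos - 1] is the base at coordinate pos.  A self-contained sketch
# of that indexing on a toy sequence (names and values are illustrative):
def _example_flanking_window_(seq='ACGTACGTACGT', pos=6, window=3):
    start = max(0, pos - window - 1)   # window bases to the left
    end = min(len(seq), pos + window)  # window bases to the right
    allele = seq[pos - 1]              # the base at 1-based coordinate pos
    context = seq[start:pos - 1] + '[' + allele + ']' + seq[pos:end]
    print allele, context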
def lotus_data_analysis(phenotype_id=1,
                        result_files_prefix='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lmm_results',
                        manhattan_plot_file='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lmm_manhattan.png',
                        qq_plot_file_prefix='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lmm_qq'):
    """
    Lotus GWAS (data from Stig U. Andersen).
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    import dataParsers as dp
    import phenotypeData as pd

    # Load genotypes.
    print 'Parsing genotypes'
    sd = dp.parse_snp_data(
        '/Users/bjarnivilhjalmsson/Dropbox/Lotus_GWAS/20140603_NonRep.run2.vcf.matrix.ordered.csv')

    # Load phenotypes.
    print 'Parsing phenotypes'
    phend = pd.parse_phenotype_file(
        '/Users/bjarnivilhjalmsson/Dropbox/Lotus_GWAS/141007_FT_portal_upd.csv')
    print 'Box-cox'
    phend.box_cox_transform(1)

    # Coordinate the phenotype of interest and the genotypes.  This filters
    # both datasets, leaving only accessions (individuals) present in both,
    # and SNPs that are polymorphic in the resulting subset.
    print 'Coordinating data'
    sd.coordinate_w_phenotype_data(phend, phenotype_id)

    # Calculate kinship (IBS/IBD).
    # print 'Calculating kinship'
    # K = kinship.calc_ibd_kinship(sd.get_snps())
    # print K

    # Perform a local vs. global mixed-model scan (the plain EMMAX and MLMM
    # calls are kept commented out below).
    print 'Performing mixed model GWAS'
    # mm_results = lm.emmax(sd.get_snps(), phend.get_values(phenotype_id), K)
    # mlmm_results = lm.mlmm(phend.get_values(phenotype_id), K, sd=sd,
    #                        num_steps=10, file_prefix=result_files_prefix,
    #                        save_pvals=True, pval_file_prefix=result_files_prefix)
    lg_results = lm.local_vs_global_mm_scan(phend.get_values(phenotype_id), sd,
                                            file_prefix='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lotus_FT_loc_glob_0.1Mb',
                                            window_size=100000, jump_size=50000,
                                            kinship_method='ibd', global_k=None)

    # Construct a results object.
    print 'Processing results'
def _impute_FLC_192_():
    phed = pd.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv")
    d250k_file = env.home_dir + "Projects/Data/250k/250K_192_043009.csv"
    d250k_sd = dataParsers.parse_snp_data(d250k_file)
    d250k_sd.filter_accessions(phed.accessions)
    d250k_sd.filter_maf_snps(0.05)
    # data_dir is assumed to be defined at module level.
    seq_snpsd = dataParsers.parseCSVData(data_dir + "/flc_seqs_aln_imputed_snps_012710.csv")
    seq_snpsd.onlyBinarySnps()
    # Compare and merge the FLC sequence SNPs into chromosome 5 of the 250K data.
    d250k_sd.snpsDataList[4].compareWith(seq_snpsd)
    d250k_sd.snpsDataList[4].merge_data(seq_snpsd)
def removing_imputed_snps():
    from bisect import bisect
    import dataParsers as dp
    chr_pos_list, quality_scores = remove_overlapping_snps()
    sd = dp.parse_snp_data("/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_t52.csv")
    sd.filter_na_snps()
    sd_chr_pos = sd.getChrPosList()
    new_qs = []
    new_chr_pos = []
    for i, (chr, pos) in enumerate(chr_pos_list):
        j = bisect(sd_chr_pos, (chr, pos))
        if sd_chr_pos[j - 1] != (chr, pos):
            # The SNP is not already on the array; keep it if its quality is high.
            if quality_scores[i] > 0.8:
                new_chr_pos.append((chr, pos))
                new_qs.append(quality_scores[i])
    print len(new_chr_pos), len(new_qs)
    return new_chr_pos, new_qs
def remove_overlapping_snps():
    from bisect import bisect
    import dataParsers as dp
    # Load Perlegen data.
    sd = dp.parse_snp_data("/Users/bjarnivilhjalmsson/Projects/Data/perlegen/perlegen_011609.csv")
    perl_chr_pos = sd.getChrPosList()
    chr_pos_list, quality_scores = load_illumina_results()
    in_perlegen = []
    nearby_snp_counts = []
    for i, (chr, pos) in enumerate(chr_pos_list):
        j = bisect(perl_chr_pos, (chr, pos))
        in_perlegen.append(perl_chr_pos[j - 1] == (chr, pos))
        # Count Perlegen SNPs within 60 bp on either side (with bounds checks
        # so the scan cannot run off the ends of the list).
        n_count = 0
        k = j - 2
        while k >= 0:
            (n_chr, n_pos) = perl_chr_pos[k]
            if n_chr != chr or pos - n_pos >= 61:
                break
            n_count += 1
            k -= 1
        k = j
        while k < len(perl_chr_pos):
            (n_chr, n_pos) = perl_chr_pos[k]
            if n_chr != chr or n_pos - pos >= 61:
                break
            n_count += 1
            k += 1
        nearby_snp_counts.append(n_count)
        if i % (len(chr_pos_list) / 10) == 0:
            print '%d%% done.' % (((i + 1.0) / len(chr_pos_list)) * 100)
    qc = zip(nearby_snp_counts, chr_pos_list, in_perlegen, quality_scores)
    qc.sort()
    k = bisect(qc, (1, (0, 0), False, 0))
    good_snp_chr_pos = []
    good_q_scores = []
    for n_count, chr_pos, in_perl, q_score in qc[:k]:
        if n_count == 0 and in_perl:
            good_snp_chr_pos.append(chr_pos)
            good_q_scores.append(q_score)
    print len(good_snp_chr_pos)
    return good_snp_chr_pos, good_q_scores
def lotus_mixed_model_gwas(phenotype_id=4,
                           phen_file='/home/bjarni/LotusGenome/cks/Lotus31012019/20181113_136LjAccessionData.csv',
                           gt_file='/home/bjarni/LotusGenome/cks/Lotus31012019/all_chromosomes_binary.csv',
                           pvalue_file='mm_results.pvals',
                           manhattan_plot_file='mm_manhattan.png',
                           qq_plot_file_prefix='mm_qq'):
    """
    Perform mixed model (EMMAX) GWAS for Lotus data.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    import dataParsers as dp
    import phenotypeData as pd

    # Load genotypes.
    sd = dp.parse_snp_data(gt_file)

    # Load phenotypes.
    phend = pd.parse_phenotype_file(phen_file, with_db_ids=False)

    # Coordinate the phenotype of interest and the genotypes.  This filters
    # both datasets, leaving only accessions (individuals) present in both,
    # and SNPs that are polymorphic in the resulting subset.
    sd.coordinate_w_phenotype_data(phend, phenotype_id)

    # Calculate kinship (IBS).
    K = kinship.calc_ibs_kinship(sd.get_snps())

    # Perform mixed model GWAS.
    mm_results = lm.emmax(sd.get_snps(), phend.get_values(phenotype_id), K)

    # Construct a results object.
    res = gr.Result(scores=mm_results['ps'], snps_data=sd)

    # Save p-values to file.
    res.write_to_file(pvalue_file)

    # Plot a Manhattan plot.
    res.plot_manhattan(png_file=manhattan_plot_file, percentile=90,
                       plot_bonferroni=True, neg_log_transform=True)

    # Plot a QQ-plot.
    res.plot_qq(qq_plot_file_prefix)
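# A hedged example invocation of the EMMAX pipeline above; the file names
# here are placeholders (the defaults in the signature point at the
# original analysis machine).
def _example_lotus_gwas_run_():
    lotus_mixed_model_gwas(phenotype_id=4,
                           phen_file='20181113_136LjAccessionData.csv',
                           gt_file='all_chromosomes_binary.csv',
                           pvalue_file='mm_results.pvals',
                           manhattan_plot_file='mm_manhattan.png',
                           qq_plot_file_prefix='mm_qq')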
def _generate_250K_2010_FLC_data_(impute=True):
    """
    Create a combined version of the 250K data, overlapping with the FLC
    phenotypes.  Then merge with the 2010 data (including indels), then with
    the FLC sequences.  Impute missing SNPs and write the result to file.
    """
    import phenotypeData as pd
    import env
    phed = pd.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv")
    d2010_file = env.home_dir + "Projects/Data/2010/2010_imputed_012610.csv"
    d2010_sd = dataParsers.parse_snp_data(d2010_file, id="2010_data")
    d2010_sd.filter_accessions(phed.accessions)
    d2010_sd.filter_na_snps()
    d2010_sd.filter_maf_snps(0.05)

    #d250k_file = env.home_dir + "Projects/Data/250k/250K_t54.csv"
    d250k_file = env.home_dir + "Projects/Data/250k/250K_192_043009.csv"
    d250k_sd = dataParsers.parse_snp_data(d250k_file)
    d250k_sd.filter_accessions(phed.accessions)
    d250k_sd.filter_maf_snps(0.05)

    d250k_sd.merge_snps_data(d2010_sd)
    d250k_sd.filter_na_accessions()
    d250k_sd.filter_na_snps(0.7)
    d250k_sd.filter_monomorphic_snps()

    ref_seq_name = "raw_ref_col-0"
    ref_start = 3170501
    ref_chr = 5
    seq_file = env.home_dir + "Projects/FLC_analysis/flc_seqs_aln_merged_050410.fasta"
    ad = sequences.readFastaAlignment(seq_file, ref_seq_name=ref_seq_name,
                                      ref_start=ref_start, ref_chr=ref_chr,
                                      alignment_type="muscle", ref_direction=1)
    # ref_start = 3170500
    # ad2 = sequences.readFastaAlignment(seq_file, ref_seq_name=ref_seq_name, ref_start=ref_start,
    #                                    ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
    # ref_start = 3170502
    # ad3 = sequences.readFastaAlignment(seq_file, ref_seq_name=ref_seq_name, ref_start=ref_start,
    #                                    ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
    r = ad.get_snps(type=0)
    seq_snpsd1 = r['snpsd']
    seq_snpsd1.merge_data(r['indels'], error_threshold=0.0)
    # r2 = ad2.get_snps(type=0)
    # seq_snpsd2 = r2['snpsd']
    # seq_snpsd2.merge_data(r2['indels'], error_threshold=0.0)
    # r3 = ad3.get_snps(type=0)
    # seq_snpsd3 = r3['snpsd']
    # seq_snpsd3.merge_data(r3['indels'], error_threshold=0.0)
    print "Now merging data.."
    d250k_sd.snpsDataList[4].compareWith(seq_snpsd1)
    # d250k_sd.snpsDataList[4].compareWith(seq_snpsd2)
    # d250k_sd.snpsDataList[4].compareWith(seq_snpsd3)
    d250k_sd.snpsDataList[4].merge_data(seq_snpsd1, union_accessions=False)
    d250k_sd.filter_na_accessions()
    d250k_sd.filter_na_snps(0.7)
    d250k_sd.filter_monomorphic_snps()
    d250k_sd.snpsDataList[4].impute_data()
    d250k_sd.writeToFile("/tmp/test.csv")
    print "YEAH!"
def load_phentoype_file_wilczek():
    import csv
    import env
    import dataParsers
    import phenotypeData as pd
    filename = "/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/PhenotypeDataWilczek.csv"
    f = open(filename, "r")
    reader = csv.reader(f)
    phenotype_names = reader.next()[2:]
    for i in range(len(phenotype_names)):
        phenotype_names[i] = phenotype_names[i].replace(" ", "_")
    print phenotype_names
    accession_names = []
    accession_ID = []
    for row in reader:
        accession_names.append(row[1].split()[0].lower())
        accession_ID.append(row[0])
    f.close()
    print accession_names
    acc_dict = pd._getAccessionToEcotypeIdDict_(accession_names)  # + ["n13", "kno-10", "kno-10", "shahdara", "nd-1"]
    acc_dict["cibc-5"] = 6908
    acc_dict["wa-1"] = 6978
    acc_dict["gu-0"] = 7149
    acc_dict['Rubezhnoe-1'] = 7323
    print len(acc_dict), acc_dict
    d250k_file = env.home_dir + "Projects/Data/250k/250K_t54.csv"
    d250k_sd = dataParsers.parse_snp_data(d250k_file)
    # Write a key file mapping unique IDs to ecotype IDs.
    ecotypes = []
    key_file = "/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/unique_id_to_ecotype_id.csv"
    f = open(key_file, "w")
    f.write("unique_id, accession_name, ecotype_id, in_250k_data\n")
    for acc, acc_id in zip(accession_names, accession_ID):
        if acc not in acc_dict or acc_id == 'karl27' or acc_id == 'karl05':
            print "(%s, %s) is missing" % (acc, acc_id)
        else:
            ecotype = acc_dict[acc]
            ecotypes.append(ecotype)
            f.write("%s,%s,%s,%s\n" % (acc_id, acc, str(ecotype),
                                       str(str(ecotype) in d250k_sd.accessions)))
    f.close()
    #phenotype_names = reader.next()[2:]
    phenotype_indices = range(2, len(phenotype_names) + 2)
    phenotypes = []  # Indexed as [accession][phenotype]
    f = open(filename, "r")
    reader = csv.reader(f)
    reader.next()  # Skip the header row.
    for row in reader:
        if row[1].split()[0].lower() in acc_dict:
            phen_vals = []
            for pv in row[2:]:
                if pv == "":
                    pv = 'NA'
                else:
                    pv = float(pv)
                phen_vals.append(pv)
            phenotypes.append(phen_vals)
        else:
            print "Missing:", row[1]
    phed = pd.PhenotypeData(ecotypes, phenotype_names, phenotypes)
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/phen_wilzcek_050710.tsv",
                     delimiter='\t')
    phed.writeToFile("/Users/bjarnivilhjalmsson/Projects/Amity_Wilczek/phen_wilzcek_050710.csv",
                     delimiter=',')
def analyzeSNPs():
    import KW, phenotype_parsers, phenotypeData
    import Emma
    import dataParsers
    import copy
    result_id = "filtered_imputed"
    data_dir = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/"
    #ref_seq_name = "2010_Col-0"
    ref_seq_name = "raw_ref_col-0"
    ref_start = 3170501
    ref_chr = 5
    #ad_2010 = sequences.readFastaAlignment(data_dir + "FLC_full_edited_merged.aln.fasta", ref_seq_name=ref_seq_name,
    #                                       ref_start=ref_start, ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
    #ad_2010 = sequences.readFastaAlignment(data_dir + "FLC_full_merged.aln.fasta", ref_seq_name=ref_seq_name,
    #                                       ref_start=ref_start, ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
    #ad = sequences.readFastaAlignment(data_dir + "flc_seqs_aln_merged_011810.fasta", ref_seq_name=ref_seq_name,
    #                                  ref_start=ref_start, ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
    #r = ad.get_snps(type=1)
    #seq_snpsd = r['snpsd']
    #seq_snpsd = seq_snpsd.getSnpsData(missingVal='NA')
    #seq_snpsd.onlyBinarySnps()
    #i_snpsd = r['indels']
    #i_snpsd = i_snpsd.getSnpsData(missingVal='NA')
    #print zip(i_snpsd.positions, i_snpsd.snps)
    #print i_snpsd.accessions

    seq_snpsd = dataParsers.parseCSVData(data_dir + "/flc_seqs_aln_imputed_snps_012510.csv")[0]
    seq_snpsd = seq_snpsd.getSnpsData(missingVal='NA')

    #d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_073009.csv"
    d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_imputed_012610.csv"
    d2010_sd = dataParsers.parse_snp_data(d2010_file, id="2010_data")
    #d2010_sd.filter_na_accessions()
    d2010_sd.filter_na_snps()
    d2010_sd.convert_2_binary()
    d2010_sd.filter_maf_snps(0.05)
    #kinship_2010 = Emma.calcKinship(d2010_sd.getSnps(0.05))
    d2010_sd = d2010_sd.get_region_snpsd(5, 3140000, 3220000)
    d2010_sd.remove_redundant_snps(w_missing=True)

    d250k_file = "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_data_t43_081009.csv"
    snpsd = dataParsers.parse_snp_data(d250k_file)
    snpsd.filter_accessions(seq_snpsd.accessions)
    snpsd.convert_2_binary()
    snpsd.filter_maf_snps(0.05)
    #kinship_250k = Emma.calcKinship(snpsd.getSnps(0.02))
    snpsd = snpsd.get_region_snpsd(5, 3140000, 3220000)
    snpsd.remove_redundant_snps()

    seq_snpsd.remove_accessions(snpsd.accessions)
    seq_snpsd.snpsFilterRare(0.05)
    seq_snpsd.onlyBinarySnps()
    # Order the sequence accessions to match the 250K accessions.
    acc_map = []
    for i, acc in enumerate(seq_snpsd.accessions):
        acc_map.append((i, snpsd.accessions.index(acc)))
    seq_snpsd.orderAccessions(acc_map)
    seq_snpsd.remove_redundant_snps(w_missing=True)
    #snpsd.mergeDataUnion(d2010_sd, priority=2, unionType=3)
    #ad.compare_with_snps_data(snpsd)  # Something missing here, snpsd...?
    #snpsd.mergeDataUnion(d250k_sd, unionType=3, verbose=True)

    # Now perform GWAS and plot the results.
    phend = phenotypeData.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv")
    #phend = phenotype_parsers.load_phentoype_file("/Users/bjarnivilhjalmsson/Projects/FLC_analysis/data_102509/FLC_soil_data_102509.csv")
    results_colors = ['blue', 'green', 'red']
    #kinship_matrices = [kinship_250k, kinship_250k, kinship_2010]
    snpsds = [snpsd, seq_snpsd, d2010_sd]
    phenotypeIndices = phend.phenIds
    log_transforms = [1, 2]
    import analyzePhenotype as ap
    import analyzeSNPResult as asr

    # Commented-out EMMA analysis:
    # for i in phenotypeIndices:
    #     #ap.drawHistogram(phend, i, pdfFile="/Users/bjarnivilhjalmsson/tmp/hist_" + str(phend.getPhenotypeName(i)) + ".pdf")
    #     #if i in log_transforms:
    #     phend.logTransform(i)
    #     results = []
    #     filtered_sds = []
    #     for sd, k in zip(snpsds, kinship_matrices):
    #         new_sd = copy.deepcopy(sd)
    #         res = Emma.run_emma_w_missing_data(new_sd, phend, i, 5, k)
    #         res.negLogTransform()
    #         snps_indices_to_keep = res.filterMARF(minMaf=0.1)
    #         print "Got", len(res.scores), len(res.positions), "p-values from Emma."
    #         results.append(res)
    #         new_sd.filter_snp_indices(snps_indices_to_keep)
    #         filtered_sds.append(new_sd)
    #     import regionPlotter as rp
    #     reg_plotter = rp.RegionPlotter()
    #     reg_plotter.plot_small_result(results, results_colors=results_colors,
    #                                   pdf_file="/Users/bjarnivilhjalmsson/tmp/seqences_250k_" + result_id + "_emma_gwas_" + str(phend.getPhenotypeName(i)) + ".pdf")
    #     for j, (r, sd) in enumerate(zip(results, filtered_sds)):
    #         r_i = r.scores.index(max(r.scores))
    #         phend.plot_marker_box_plot(i, sd, r_i,
    #                                    pdf_file="/Users/bjarnivilhjalmsson/tmp/box_plot_emma_" + str(phend.getPhenotypeName(i)) + "_" + results_colors[j] + ".pdf",
    #                                    marker_score=r.scores[r_i])

    phend = phenotypeData.readPhenotypeFile(
        "/Users/bjarnivilhjalmsson/Projects/Data/phenotypes/FLC_phenotypes_011710.tsv")
    for i in phenotypeIndices:
        results = []
        filtered_sds = []
        for sd in snpsds:
            new_sd = copy.deepcopy(sd)
            res, f_sd = KW.run_kw(new_sd, phend, i, 5)
            filtered_sds.append(f_sd)
            res.negLogTransform()
            print "Got", len(res.scores), len(res.positions), "p-values from KW."
            results.append(res)
        import regionPlotter as rp
        reg_plotter = rp.RegionPlotter()
        reg_plotter.plot_small_result(results, results_colors=results_colors,
                                      pdf_file="/Users/bjarnivilhjalmsson/tmp/seqences_250k_" + result_id + "_gwas_" + str(phend.getPhenotypeName(i)) + ".pdf")
        for j, (r, sd) in enumerate(zip(results, filtered_sds)):
            if len(r.scores) != len(sd.snps):
                print "Lengths not equal? %d, %d" % (len(r.scores), len(sd.snps))
            r_i = r.scores.index(max(r.scores))
            phend.plot_marker_box_plot(i, sd, r_i,
                                       pdf_file="/Users/bjarnivilhjalmsson/tmp/box_plot_kw_" + str(phend.getPhenotypeName(i)) + "_" + results_colors[j] + ".pdf",
                                       marker_score=r.scores[r_i])
def plot_local_tree():
    import dataParsers
    import sequences
    import snpsdata
    data_dir = "/Users/bjarnivilhjalmsson/Projects/FLC_analysis/"
    accs_to_keep = _read_tree_accession_file_()
    ref_seq_name = "raw_ref_col-0"
    ref_start = 3170501
    ref_end = 3183000
    ref_chr = 5
    intron_start = 3175600
    intron_stop = 3179100
    #ad_2010 = sequences.readFastaAlignment(data_dir + "FLC_full_edited_merged.aln.fasta", ref_seq_name=ref_seq_name,
    #                                       ref_start=ref_start, ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
    #ad_2010 = sequences.readFastaAlignment(data_dir + "FLC_full_merged.aln.fasta", ref_seq_name=ref_seq_name,
    #                                       ref_start=ref_start, ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
    ad = sequences.readFastaAlignment(data_dir + "flc_seqs_aln_merged_011810.fasta",
                                      ref_seq_name=ref_seq_name, ref_start=ref_start,
                                      ref_chr=ref_chr, alignment_type="muscle", ref_direction=1)
    #ref_seq_name = "ref_2_Col-0"
    #ref_start = 3170001
    #ref_end = 3184000
    #r = ad_2010.get_snps(type=1, min_called_fraction=0.1)
    #seq_sd = r['snpsd']  # Raw SNPs data
    #seq_sd.id = "Sequences"
    #seq_sd.remove_accessions(accs_to_keep, True)
    #seq_sd.filterMonoMorphicSnps()
    #snpsd = seq_sd.getSnpsData(missingVal='NA')
    r = ad.get_snps(type=1)
    seq_snpsd = r['snpsd']
    seq_snpsd.remove_accessions(accs_to_keep, True)
    seq_snpsd.filterMonoMorphicSnps()
    print len(seq_snpsd.snps)
    #i_snpsd = r['indels']

    # Trees and haplotypes.
    import analyzeHaplotype as ah
    start_stop_list = [(3170500, 3183000), (3172000, 3181000), (3172000, 3175000),
                       (3175000, 3178000), (3178000, 3181000), (3176000, 3181000),
                       (intron_start, intron_stop)]
    for start, stop in start_stop_list:
        snpsd = seq_snpsd.get_region_snpsd(start, stop)
        tree_file = "/Users/bjarnivilhjalmsson/tmp/aln_tree_" + str(start) + "_" + str(stop) + ".pdf"
        ah.plot_tree(snpsd, tree_file, verbose=False)

    # 250K
    d250k_file = "/Users/bjarnivilhjalmsson/Projects/Data/250k/250K_t43_192.csv"
    d250k = dataParsers.parse_snp_data(d250k_file)
    temp_d250k = snpsdata.RawSnpsData(snps=d250k.getSnps(0.05), accessions=d250k.accessions)
    tree_file = "/Users/bjarnivilhjalmsson/tmp/250k_full_data_tree.pdf"
    ah.plot_tree(temp_d250k, tree_file, verbose=True)
    d250k_sd = d250k.get_region_snpsd(5, 3140000, 3220000)
    #d250k_sd = dataParsers.parse_snp_data_region(d250k_file, ref_chr, 3140000, 3220000, id="250K_data")
    start_stop_list = [(3140000, 3220000), (3150000, 3210000), (3170501, 3183000),
                       (3172000, 3181000), (3172000, 3175000), (3175000, 3178000),
                       (3178000, 3181000), (3176000, 3181000), (intron_start, intron_stop)]
    for start, stop in start_stop_list:
        snpsd = d250k_sd.get_region_snpsd(start, stop)
        tree_file = "/Users/bjarnivilhjalmsson/tmp/250k_tree_" + str(start) + "_" + str(stop) + ".pdf"
        ah.plot_tree(snpsd, tree_file, verbose=False)
    seq_snpsd.mergeDataUnion(d250k_sd, unionType=1, verbose=True)

    # 2010
    d2010_file = "/Users/bjarnivilhjalmsson/Projects/Data/2010/2010_073009.csv"
    d2010_sd = dataParsers.parse_snp_data_region(d2010_file, ref_chr, 3140000, 3220000,
                                                 id="2010_data")
    d2010_sd.filterMissingSnps(50)
    d2010_sd._convert_to_tg_ecotypes_()
    d2010_sd.mergeDataUnion(d250k_sd, unionType=1, verbose=True)
    d250k_sd.remove_accessions(accs_to_keep, True)
    for start, stop in start_stop_list:
        snpsd = d250k_sd.get_region_snpsd(start, stop)
        tree_file = "/Users/bjarnivilhjalmsson/tmp/250k_filtered_tree_" + str(start) + "_" + str(stop) + ".pdf"
        ah.plot_tree(snpsd, tree_file, verbose=False)
    d250k = dataParsers.parse_snp_data(d250k_file)
    d250k.filter_accessions(accs_to_keep, True)
    d250k.filter_monomorphic_snps()
    snps = d250k.getSnps(0.05)
    temp_d250k = snpsdata.RawSnpsData(snps=snps, accessions=d250k.accessions)
    tree_file = "/Users/bjarnivilhjalmsson/tmp/250k_full_data_filtered_tree.pdf"
    ah.plot_tree(temp_d250k, tree_file, verbose=False)

    # Perlegen
    perlegen_file = "/Users/bjarnivilhjalmsson/Projects/Data/perlegen/perlegen_073009.csv"
    perlegen_sd = dataParsers.parse_snp_data_region(perlegen_file, ref_chr, 3140000, 3220000,
                                                    id="perlegen_data")
    perlegen_sd._convert_to_tg_ecotypes_()
    perlegen_sd.filterMissingSnps(10)
    d2010_sd.mergeDataUnion(perlegen_sd, priority=2, unionType=1, verbose=True)
    seq_snpsd.mergeDataUnion(perlegen_sd, priority=2, unionType=1, verbose=True)

    # 250K, 2010, Perlegen tree.
    d2010_sd.filter_accessions_by_NAs(0.9)
    d2010_sd.filterMissingSnps(180)
    d2010_sd.filterMonoMorphicSnps()
    for start, stop in start_stop_list:
        # d2010_sd now holds the merged 250K/2010/Perlegen data.
        snpsd = d2010_sd.get_region_snpsd(start, stop)
        tree_file = "/Users/bjarnivilhjalmsson/tmp/250k_2010_perlegen_tree_" + str(start) + "_" + str(stop) + ".pdf"
        ah.plot_tree(snpsd, tree_file, verbose=False)

    # 250K, 2010, sequences, Perlegen tree.
    seq_snpsd.filterMonoMorphicSnps()
    seq_snpsd.filter_accessions_by_NAs(0.9)
    seq_snpsd.filterMissingSnps(180)
    start_stop_list = [(3170500, 3183000), (3172000, 3181000), (3172000, 3175000),
                       (3175000, 3178000), (3178000, 3181000), (3176000, 3181000),
                       (intron_start, intron_stop)]
    for start, stop in start_stop_list:
        snpsd = seq_snpsd.get_region_snpsd(start, stop)
        tree_file = "/Users/bjarnivilhjalmsson/tmp/Seq_250k_2010_perlegen_tree_" + str(start) + "_" + str(stop) + ".pdf"
        ah.plot_tree(snpsd, tree_file, verbose=False)
def map_phenotype(p_i, phed, snps_data_file, mapping_method, trans_method, p_dict):
    phenotype_name = phed.getPhenotypeName(p_i)
    phen_is_binary = phed.isBinary(p_i)
    file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.getPhenotypeName(p_i),
                                    mapping_method, trans_method, p_dict['remove_outliers'])
    result_name = "%s_%s_%s" % (phenotype_name, mapping_method, trans_method)
    res = None
    sd = dataParsers.parse_snp_data(snps_data_file, format=p_dict['data_format'],
                                    filter=p_dict['debug_filter'])
    num_outliers = gwa.prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'])
    if p_dict['remove_outliers']:
        assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."
    phen_vals = phed.getPhenVals(p_i)
    snps = sd.getSnps()
    if mapping_method in ['emmax']:
        # Retrieve the kinship matrix K.
        sys.stdout.write("Retrieving the kinship matrix K.\n")
        sys.stdout.flush()
        k_file = env['data_dir'] + "kinship_matrix_cm" + str(p_dict['call_method_id']) + ".pickled"
        kinship_file = p_dict['kinship_file']
        if not kinship_file and os.path.isfile(k_file):
            # A cached kinship file for this call method is available.
            kinship_file = k_file
        if kinship_file:
            # A kinship file was somehow supplied.
            print 'Loading supplied kinship'
            k = lm.load_kinship_from_file(kinship_file, sd.accessions)
        else:
            print "No kinship file was found. Generating kinship file:", k_file
            sd = dataParsers.parse_snp_data(snps_data_file, format=p_dict['data_format'])
            snps = sd.getSnps()
            k_accessions = sd.accessions[:]
            if p_dict['debug_filter']:
                import random
                snps = random.sample(snps, int(p_dict['debug_filter'] * len(snps)))
            k = lm.calc_kinship(snps)
            f = open(k_file, 'w')
            cPickle.dump([k, sd.accessions], f)
            f.close()
            num_outliers = gwa.prepare_data(sd, phed, p_i, trans_method, p_dict['remove_outliers'])
            k = lm.filter_k_for_accessions(k, k_accessions, sd.accessions)
        sys.stdout.flush()
        sys.stdout.write("Done!\n")
        if p_dict['remove_outliers']:
            assert num_outliers != 0, "No outliers were removed, so it makes no sense to go on and perform GWA."

    # Check whether results already exist.
    if p_dict['use_existing_results']:
        print "\nChecking for existing results."
        result_file = file_prefix + ".pvals"
        if os.path.isfile(result_file):
            res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
            pvals = True
        else:
            result_file = file_prefix + ".scores"
            if os.path.isfile(result_file):
                res = gwaResults.Result(result_file=result_file, name=result_name, snps=snps)
                pvals = False
        if res:
            print "Found existing results.. (%s)" % (result_file)
            sys.stdout.flush()

    if not res:
        # No existing results were found, so perform the GWA.
        sys.stdout.write("Finished loading and handling data!\n")
        print "FIRST STEP: Applying %s to data." % (mapping_method)
        sys.stdout.flush()
        kwargs = {}
        additional_columns = []
        if mapping_method in ['emmax']:
            res = lm.emmax(snps, phen_vals, k)
        elif mapping_method in ['lm']:
            res = lm.linear_model(snps, phen_vals)
        else:
            print "Mapping method", mapping_method, 'was not found.'
            sys.exit(2)
        if mapping_method in ['lm', 'emmax']:
            kwargs['genotype_var_perc'] = res['var_perc']
            betas = map(list, zip(*res['betas']))
            kwargs['beta0'] = betas[0]
            kwargs['beta1'] = betas[1]
            additional_columns.append('genotype_var_perc')
            additional_columns.append('beta0')
            additional_columns.append('beta1')
            pvals = res['ps']
            sys.stdout.write("Done!\n")
            sys.stdout.flush()

        kwargs['correlations'] = calc_correlations(snps, phen_vals)
        additional_columns.append('correlations')
        res = gwaResults.Result(scores=pvals, snps_data=sd, name=result_name, **kwargs)
        if mapping_method in ['emmax', 'lm']:
            result_file = file_prefix + ".pvals"
        else:
            result_file = file_prefix + ".scores"
        res.write_to_file(result_file, additional_columns)

        # Plot the genome-wide results.
        print "Generating a GW plot."
        sys.stdout.flush()
        png_file = file_prefix + "_gwa_plot.png"
        #png_file_max30 = file_prefix + "_gwa_plot_max30.png"
        if mapping_method in ['lm', 'emmax']:
            res.neg_log_trans()
        if mapping_method in ['kw', 'ft']:  # or p_dict['data_format'] != 'binary':
            #res.plot_manhattan(png_file=png_file_max30, percentile=90, type="pvals",
            #                   ylab="$-$log$_{10}(p)$", plot_bonferroni=True, max_score=30)
            res.plot_manhattan(png_file=png_file, percentile=90, type="pvals",
                               ylab="$-$log$_{10}(p)$", plot_bonferroni=True)
        else:
            if res.filter_attr("mafs", p_dict['mac_threshold']) > 0:
                #res.plot_manhattan(png_file=png_file_max30, percentile=90, type="pvals",
                #                   ylab="$-$log$_{10}(p)$", plot_bonferroni=True, max_score=30)
                res.plot_manhattan(png_file=png_file, percentile=90, type="pvals",
                                   ylab="$-$log$_{10}(p)$", plot_bonferroni=True)

        print "Plotting histogram."
        hist_file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phenotype_name,
                                             trans_method, p_dict['remove_outliers'])
        hist_png_file = hist_file_prefix + "_hist.png"
        phed.plot_histogram(p_i, pngFile=hist_png_file)
    else:
        res.neg_log_trans()
        assert res.filter_attr("mafs", p_dict['mac_threshold']), 'All SNPs have MAC smaller than the threshold.'

    print "SECOND STEP:"
    res.filter_top_snps(p_dict['second_step_number'])
    snps = res.snps
    positions = res.positions
    chromosomes = res.chromosomes
    # Check whether second-step results already exist.
    file_prefix = _get_file_prefix_(p_dict['run_id'], p_i, phed.getPhenotypeName(p_i),
                                    mapping_method, trans_method, p_dict['remove_outliers'],
                                    p_dict['second_step_number'])
    res_file = file_prefix + '_res.cpickled'
    if p_dict['use_existing_results'] and os.path.isfile(res_file):
        print 'Found existing results for the second step... loading.'
        f = open(res_file, 'rb')
        second_res = cPickle.load(f)
        f.close()
    else:
        if mapping_method == 'lm':
            second_res = lm.linear_model_two_snps(snps, phen_vals)
        if mapping_method == 'emmax':
            second_res = lm.emmax_two_snps(snps, phen_vals, k)
        # Pickle the results.
        print 'Saving results as pickled file:', res_file
        f = open(res_file, 'wb')
        cPickle.dump(second_res, f, protocol=2)
        f.close()

    # Second-step scatter plots.
    score_array = -sp.log10(second_res['ps'])
    p3_score_array = -sp.log10(second_res['p3_ps'])
    p4_score_array = -sp.log10(second_res['p4_ps'])
    import plotResults as pr
    pr.plot_snp_pair_result(chromosomes, positions, score_array, file_prefix + '_scatter')
    pr.plot_snp_pair_result(chromosomes, positions, p3_score_array, file_prefix + '_p3_scatter')
    pr.plot_snp_pair_result(chromosomes, positions, p4_score_array, file_prefix + '_p4_scatter')

    if p_dict['region_plots']:
        import regionPlotter as rp
        regions_results = res.get_top_region_results(p_dict['region_plots'])
        plotter = rp.RegionPlotter()
        print "Starting region plots..."
        for reg_res in regions_results:
            chromosome = reg_res.chromosomes[0]
            caption = phenotype_name + "_c" + str(chromosome) + "_" + mapping_method
            png_file = file_prefix + "_reg_plot_c" + str(chromosome) + "_s" + str(reg_res.positions[0]) \
                + "_e" + str(reg_res.positions[-1]) + ".png"
            tair_file = file_prefix + "_reg_plot_c" + str(chromosome) + "_s" + str(reg_res.positions[0]) \
                + "_e" + str(reg_res.positions[-1]) + "_tair_info.txt"
            plotter.plot_small_result([reg_res], png_file=png_file, highlight_gene_ids=tair_ids,
                                      caption=caption, tair_file=tair_file)
            # Box plot for the top SNP in the region.
            png_file = file_prefix + "_reg_plot_c" + str(chromosome) + "_s" + str(reg_res.positions[0]) \
                + "_e" + str(reg_res.positions[-1]) + "_box_plot.png"
            (marker, score, chromosome, pos) = reg_res.get_max_snp()
            marker_accessions = sd.accessions
            phed.plot_marker_box_plot(p_i, marker=marker, marker_accessions=marker_accessions,
                                      png_file=png_file, title="c" + str(chromosome) + "_p" + str(pos),
                                      marker_score=score, marker_missing_val=sd.missing_val)
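# map_phenotype receives its options bundled in p_dict.  The following is a
# hedged sketch (not from the original source) listing the keys the function
# actually reads; the values are illustrative placeholders only.
def _example_map_phenotype_p_dict_():
    p_dict = {
        'run_id': 'example_run',        # used in output file prefixes
        'data_format': 'binary',        # passed to dataParsers.parse_snp_data
        'debug_filter': 1.0,            # fraction of SNPs kept (for debugging)
        'remove_outliers': 0,           # outlier-removal flag
        'call_method_id': 54,           # selects the cached kinship file
        'kinship_file': None,           # explicit kinship file, if any
        'use_existing_results': False,  # reuse .pvals/.scores files if present
        'mac_threshold': 15,            # minor-allele-count filter (placeholder value)
        'second_step_number': 100,      # number of top SNPs for the two-SNP scan
        'region_plots': 0,              # number of top regions to plot (0 = skip)
    }
    return p_dict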