def calc_tree(alignment):
    """Build a PhyML tree from a coding alignment after dropping its stop codon.

    Args:
        alignment: Alignment source handed unchanged to ``egglib.Align``.

    Returns:
        tuple: ``(a, tree)`` where ``a`` is the alignment without its last
        three columns and ``tree`` is the first item returned by
        ``wrappers.phyml``. ``tree`` is ``None`` when the alignment is empty,
        is not a multiple of three columns long, or holds fewer than three
        sequences (in the empty case ``a`` is returned untrimmed).
    """
    a = egglib.Align(alignment)
    if a.ns() == 0:
        # Nothing to trim and nothing to build a tree from.
        return (a, None)
    # Row 0 is used as the length probe: it exists as soon as the alignment
    # has one sequence, whereas probing row 1 failed on single-sequence input
    # before the "fewer than three sequences" guard could run.
    a = a.extract(0, len(a.sequence(0)) - 3)  # remove stop codon
    if len(a.sequence(0)) % 3 != 0:
        print(alignment, " not in frame")
        return (a, None)
    if a.ns() < 3:
        return (a, None)
    for i in range(a.ns()):
        # Upper-case every sequence in place before calling PhyML.
        a.sequence(i, sequence=a.sequence(i).upper())
    tree, _ = wrappers.phyml(a)  # second item (log-likelihood) is not used
    return (a, tree)
def withinGeneLD(directory):
    """Write pairwise linkage-disequilibrium statistics for every alignment.

    One tab-separated row per pair of polymorphic sites is written to
    ``LDstats.txt`` in the current working directory (the file is
    overwritten). The gene label is the part of the file's base name before
    the first underscore.

    Args:
        directory: Directory whose entries, as returned by
            ``listdir_fullpath``, are each loaded with ``egglib.Align``.
    """
    # The context manager closes (and flushes) the report even when egglib
    # raises part-way through the directory.
    with open("LDstats.txt", "w") as outfile:
        outfile.write("Gene\tPosition1\tPosition2\tDistance\tD'\tR2\n")
        for f in listdir_fullpath(directory):
            a = egglib.Align(f)
            gene = os.path.basename(f).split("_")[0]
            siteIndices = a.polymorphism()['siteIndices']
            ldStats = a.matrixLD()
            nSites = len(siteIndices)
            for i in range(nSites):
                pos1 = siteIndices[i]
                # Visit each unordered pair of sites exactly once.
                for j in range(i + 1, nSites):
                    pos2 = siteIndices[j]
                    distance = ldStats['d'][pos1][pos2]
                    dPrime = ldStats['Dp'][pos1][pos2]
                    rSquared = ldStats['r2'][pos1][pos2]
                    outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                        gene, pos1, pos2, distance, dPrime, rSquared))
def get_seqid_str(ref_genome, chrom):
    """Build the region string used for the gffutils ``limit`` parameter.

    The reference FASTA is scanned until a record named ``chrom`` is found.

    Returns:
        tuple: ``(region, holder)``. ``region`` is ``"<chrom>:1-<length>"``,
        or ``None`` when no record matches. ``holder`` is an ``egglib.Align``
        containing the matching record, or left empty when none matches.
    """
    region = None
    holder = egglib.Align()
    with egglib.io.fasta_iter(ref_genome) as records:
        for record in records:
            if record.name != chrom:
                continue
            holder.add_sample(record.name, record.sequence.str())
            region = chrom + ':1-%s' % record.ls
            break  # only the first matching record is used
    return region, holder
def calc_pi(alignment, sequence):
    """Return the names of the sequences closest to a reference sequence.

    Each other sequence of the alignment is paired with the reference and
    the ``'Pi'`` value of that two-sequence alignment is taken as their
    distance. All names that reach the minimum are returned.

    Args:
        alignment: Alignment source handed unchanged to ``egglib.Align``.
        sequence: Name of the reference, looked up with
            ``find(..., strict=False)``.

    Returns:
        list: Names whose pairwise Pi against the reference is minimal;
        empty when the alignment contains no sequence besides the reference.

    Raises:
        ValueError: If ``sequence`` is not found in the alignment.
    """
    a = egglib.Align(alignment)
    for i in range(a.ns()):
        # Normalise case in place before comparing sequences.
        a.sequence(i, sequence=a.sequence(i).upper())

    seqIndex = a.find(sequence, strict=False)
    if seqIndex is None:
        # Fail with a clear message rather than with an obscure indexing
        # error on a[None] further down.
        raise ValueError(
            "sequence %r not found in alignment %r" % (sequence, alignment))

    piDict = {}
    for i in range(a.ns()):
        if i == seqIndex:
            continue
        tempAlign = egglib.Align.create([a[seqIndex], a[i]])
        piDict[a.name(i)] = float(tempAlign.polymorphism()['Pi'])

    if not piDict:
        # The reference is the only sequence: there is nothing to rank.
        return []

    # The minimum is taken over the values actually kept per name, so the
    # result can never be empty when a name was overwritten by a duplicate.
    minPi = min(piDict.values())
    return [strain for strain in piDict if piDict[strain] == minPi]
def calc_stats(alignment, outgroup):
    """Compute polymorphism statistics for one alignment.

    The keys ``'theta'``, ``'pi'`` and ``'tajimaD'`` are always filled from
    ``polymorphism()``. When the module-level ``args.frame`` is truthy, the
    keys ``'piN'``, ``'piS'``, ``'NS'`` and ``'S'`` are added from
    ``polymorphismBPP(dataType=4)``, and for every outgroup name ``o`` the
    keys ``'MK_' + o`` and ``'NI_' + o`` are added.

    Args:
        alignment: Alignment source handed unchanged to ``egglib.Align``.
        outgroup: ``None`` or a list of outgroup names (it is copied with
            ``[:]`` and edited with ``.remove``).

    Returns:
        dict: The statistics described above, or an empty dict when
        ``args.frame`` is set and the alignment length is not a multiple of 3.

    Exits the process (``sys.exit()``) when setting the group of an outgroup
    raises ``IndexError``.
    """
    statDict = {}
    a = egglib.Align(alignment)
    for i in range(a.ns()):
        # Upper-case each sequence in place.
        a.sequence(i, sequence=a.sequence(i).upper())
        if args.frame:
            # replace_stop is defined elsewhere in the file; presumably it
            # masks in-frame stop codons -- TODO confirm.
            a.sequence(i, sequence=replace_stop(a.sequence(i)))
    polyDict = a.polymorphism()
    statDict['theta'] = polyDict['thetaW']
    statDict['pi'] = polyDict['Pi']
    statDict['tajimaD'] = polyDict['D']
    if args.frame:
        # Row 1 is used as the length probe, so at least two sequences are
        # required here.
        if len(a.sequence(1)) % 3 != 0:
            print("The following alignment is not in frame:")
            print(alignment)
            return {}
        # NOTE(review): dataType=4 is presumably the codon data type --
        # confirm against the egglib documentation.
        polyDictBPP = a.polymorphismBPP(dataType=4)
        statDict['piN'] = polyDictBPP['PiNS']
        statDict['piS'] = polyDictBPP['PiS']
        statDict['NS'] = polyDictBPP['NSsites']
        statDict['S'] = polyDictBPP['Ssites']
        # NOTE(review): this section is assumed to be nested under
        # `if args.frame:` because it also relies on codon statistics;
        # verify against the original indentation.
        if outgroup is not None:
            for o in outgroup:
                temp_a = a.extract(0, len(a.sequence(1)) - 3)  # remove stop codon
                # Drop every other outgroup so that only `o` remains as the
                # outgroup of this comparison.
                otherOutgroups = outgroup[:]
                otherOutgroups.remove(o)
                for otherOutgroup in otherOutgroups:
                    index = temp_a.find(otherOutgroup, strict=False)
                    if index is not None:
                        del temp_a[index]
                try:
                    # Tag the outgroup sequence with group label 999.
                    temp_a.group(temp_a.find(o, strict=False), group=999)
                except IndexError:
                    print("The following outgroup is not present in alignment")
                    print(o, a.find(o, strict=False))
                    sys.exit()
                polyDictBPP = temp_a.polymorphismBPP(dataType=4)
                statDict['MK_' + o] = polyDictBPP['MK']
                statDict['NI_' + o] = polyDictBPP['NI']
    return statDict
# GP2=metal print "SCAFFOLD\tPAIR\tSTART\tEND\tDxy\tDa\tPi1\tPi2" scafs = [ "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", "47" ] #listed by doing ls HiQ_Sb_woNsSin_woSb32Sb7_indivscafs_fasta_concat/ | while read f; do g=`echo $f|sed 's/HiQ_Sb_woNsSin_woSb32Sb7_scaffold_//'| sed 's/.fasta//'`; echo -n $g"\",\""; done #for fasta in os.listdir(path+"/"+scaffolds): for scaf in scafs: fasta = fastas + scaf + ".fasta" data = egglib.Align(path + fasta, groups=False) popdata = {} for pop in pop_ids: popdata[pop] = [] for seq in data: seq.name = seq.name.replace(scaf + '_', '') for pop in pop_ids: if seq.name in pops[pop]: popdata[pop].append(seq) for pair in itertools.combinations(pop_ids, 2): if sorted(pair) not in [ sorted(['WAca', 'CAWAca']), sorted(['WAnm', 'NMWAnm']), sorted(['AKEU', 'AK']) ]: #*****CHECK THIS!!!! align1 = egglib.Align.create(popdata[pair[0]])
# Python 2 script fragment. It relies on names defined outside this view:
# `GP1`, `path`, `dir`, `file` and `egglib`.
# Sample ids that make up population GP2.
GP2 = [
    "502931_1151576", "502931_1151572", "502931_1143066", "502931_1143067",
    "502931_1143068", "502931_1151574", "502931_1143052", "502931_1143050",
    "502931_1143051", "502931_1143078", "502931_1151580", "502931_1151581",
    "502931_1151582", "502931_1143070", "502931_1143073", "502931_1151559",
    "502931_1151560", "502931_1151561", "502931_1151562", "502931_1151564"
]
# Population name -> list of sample ids, plus the order in which the
# populations are processed.
pops = {'GP1': GP1, 'GP2': GP2}
pop_ids = ['GP1', 'GP2']
# Column header of the per-window report written to stdout.
print "scaffold\twindow\tgroup\tS\tthetaW\tPi\tD\tHe\tK"
# Data-set prefix: the directory name without "concatscafs_fasta".
dataset = dir.replace("concatscafs_fasta", "")
# for file in os.listdir(path + dir):
# print file
data = egglib.Align(path + dir + "/" + file, groups=False)
# Bucket the sequences of this file by population.
popdata = {}
for pop in pop_ids:
    popdata[pop] = []
# Derive the scaffold id from the file name by removing the data-set prefix
# and the ".fasta" suffix.
scaf = file.replace(dataset + "_scaffold_", "")
scaf = scaf.replace(".fasta", "")
print scaf
for seq in data:
    # Strip the "scaffold_<id>_" text from the name so that it can be matched
    # against the sample ids listed in `pops`.
    seq.name = (seq.name).replace("scaffold_" + scaf + "_", "")
    # print seq.name
    for pop in pop_ids:
        if seq.name in pops[pop]:
            popdata[pop].append(seq)
# for name,seq in popdata['GP1']:
# print name,seq
def step(ext, dirname, names):
    """Subsample every matching FASTA alignment to a few individuals per group.

    The ``(arg, dirname, names)`` signature matches the visitor expected by
    Python 2's ``os.path.walk``. For each entry of ``names`` whose
    lower-cased name ends with ``ext``, the alignment is loaded, two
    individuals are drawn at random from each hard-coded group (``der`` and
    ``par12`` are taken whole), and the picks are written next to the input
    as ``<file>.fa`` (FASTA) and ``<file>.nexus`` (NEXUS).

    Args:
        ext: File extension to match, case-insensitively.
        dirname: Directory the entries of ``names`` belong to. May be empty
            or ``None``, in which case the names are used as given.
        names: File names to examine.
    """
    import os  # local import: only needed to resolve names against dirname

    ext = ext.lower()  # this will make sure the extension is lowercase
    num_of_inds_per_sps = 2  # set the number to select here
    taken_whole = ('der', 'par12')  # groups used as-is, without a random draw

    for name in names:
        if not name.lower().endswith(ext):
            continue

        # Entries of `names` are relative to `dirname`; resolve them so that
        # files outside the current working directory are found too.
        path = os.path.join(dirname, name) if dirname else name
        my_align = egglib.io.from_fasta(path)

        # Zero-based positions of the samples of each group in the input
        # alignment, listed in the order in which the random draws are made
        # (kept unchanged so that seeded runs stay reproducible).
        # NOTE(review): `par12_list` is not defined in this function and the
        # other groups already cover indices 0-70; unless it is a module-level
        # name this raises NameError -- confirm or remove the `par12` entry.
        groups = [
            ('brau', [0, 47]),
            ('chlo', [1, 2]),
            ('clar', [3, 4]),
            ('defi', [5, 6, 39]),
            ('der', [7]),
            ('irano', [69, 70]),
            ('mix1', [9, 12, 13, 14]),
            ('mix2', [10, 11]),
            ('par', [17, 41, 66]),
            ('par12', par12_list),
            ('por1', [19, 21]),
            ('por2', [18, 22, 23]),
            ('praB', [25, 28]),
            ('praC', [24, 26, 27]),
            ('rad1', [15, 16, 20, 29, 30, 31, 32, 33, 35, 36, 37]),
            ('rad2', [34, 38, 40]),
            ('rud1', [45, 46, 56]),
            ('rud2', [42, 43, 44, 68]),
            ('sax', [8, 48]),
            ('stei', [49, 50, 51]),
            ('val', [52, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 53, 54]),
        ]

        picks = {}
        for label, members in groups:
            if label in taken_whole:
                picks[label] = members
            else:
                picks[label] = random.sample(members, num_of_inds_per_sps)

        # Output order: same as the draw order, except that `irano` goes last.
        output_order = [label for label, _ in groups if label != 'irano']
        output_order.append('irano')

        # to make a new file I will create a new alignment
        rand_align = egglib.Align()
        for label in output_order:
            rand_align.add_samples(my_align.subset(picks[label]))

        rand_align.to_fasta(fname=path + '.fa')
        nex = rand_align.nexus()
        # The context manager guarantees the NEXUS text is flushed and the
        # handle released; the handle used to be left open.
        with open(path + '.nexus', 'w') as nexfile:
            nexfile.write(nex)
statDict['theta'] = polyDict['thetaW'] statDict['pi'] = polyDict['Pi'] statDict['tajimaD'] = polyDict['D'] statDict['FayWuH'] = polyDict['H'] return statDict alignment, winWidth, winStep, outgroup = get_arguments(sys.argv[1:]) if alignment is None: usage() sys.exit() outfile = open('windowStats_' + os.path.splitext(alignment)[0] + '.txt', 'w') outfile.write("Start\tStop\tTheta\tPi\tTajimasD\tFay&WuH\n") align = egglib.Align(alignment) for i in range(align.ns()): align.sequence(i, sequence=align.sequence(i).upper()) if outgroup is not None: align.group(align.find(outgroup, strict=False), group=999) start = 0 stop = winWidth location = [] TD = [] for window in align.slider(winWidth, winStep): stats = calc_stats(window) start += winStep stop += winStep outfile.write("%i\t%i\t%s\t%s\t%s\t%s\n" %
def extract_cds_align(vcf, min_dp, max_dp, sample_list, gff, gene_id, cds_dict, filtered=True):
    """Build the CDS alignment of one gene from the VCF records of its region.

    Two rows (``<sample>_1`` and ``<sample>_2``) are created per sample and
    pre-filled with ``'N'``. Every VCF record that falls in a CDS block and
    passes the filters below is written into its alignment column with
    ``call_to_bases``; all other columns stay ``'N'``.

    Args:
        vcf: VCF source handed to ``extract_gene``.
        min_dp: Minimum mean genotype depth for a site to be used.
        max_dp: Maximum mean genotype depth for a site to be used.
        sample_list: Sample names; defines the rows of the alignment.
        gff: Annotation source handed to ``extract_gene``.
        gene_id: Key of the gene in ``cds_dict``.
        cds_dict: ``{gene_id: {transcript: entries}}`` where tuple entries
            are inclusive ``(start, end)`` CDS blocks and ``entries[-2]`` is
            the strand. Only the first transcript of the gene is used.
        filtered: When true, polymorphic SNPs are used only if their FILTER
            field equals ``'PASS'``.

    Returns:
        tuple: ``(align, positions)`` -- the alignment without its last three
        columns (stop codon) and the genomic positions of the CDS with the
        last three entries removed. Both are reverse-oriented for genes on
        the ``'-'`` strand.

    Exits the process with an error message when a record that survives the
    filters is neither monomorphic, an indel, nor a SNP.
    """
    # Only the first transcript is used. list(...) rather than .keys()[0]
    # keeps this working on both Python 2 and Python 3.
    transcript = list(cds_dict[gene_id])[0]

    cds_pos_list = []
    for coords in cds_dict[gene_id][transcript]:
        # Non-tuple entries of the transcript record are not coordinates.
        if type(coords) is tuple:
            cds_pos_list += range(coords[0], coords[1] + 1)
    # Set copy for O(1) membership tests; the list itself is tested once per
    # VCF record otherwise.
    cds_pos_set = set(cds_pos_list)

    len_bp = len(cds_pos_list)
    cds_align = egglib.Align(nsit=len_bp)
    for sample in sample_list:
        cds_align.add_sample(sample + '_1', data='N'*len_bp)
        cds_align.add_sample(sample + '_2', data='N'*len_bp)

    # NOTE(review): align_pos advances once per CDS record, so the VCF
    # presumably holds exactly one record per CDS position -- confirm.
    align_pos = -1
    indel_end = 0  # last position covered by the most recent deletion
    gene_region = extract_gene(vcf, gff, gene_id)

    for site in gene_region:
        if site.POS not in cds_pos_set:  # only extract genic sites in CDS blocks
            continue
        align_pos += 1

        if site.REF == 'N':
            continue

        if site.is_indel and site.aaf != 0.0:
            # Segregating indel: never called. When REF is longer than ALT,
            # remember where it ends so the positions it covers are skipped.
            if len(site.REF) > len(site.ALT):
                indel_end = site.POS + len(site.REF) - 1
            continue

        if len(site.ALT) >= 1 and site.ALT[-1] == '*':  # SNPs at spanning deletion
            continue

        all_dp = [x['DP'] for x in site.samples]  # get the genotype depths
        if None in all_dp:  # Exclude sites where a genotype DP field is set to '.'
            continue

        mean_dp = np.mean(all_dp)
        if mean_dp < min_dp or mean_dp > max_dp:  # depth filter
            continue

        if 0 in all_dp or site.call_rate < 1.0:  # only consider sites where all samples have coverage
            continue

        if site.FILTER is not None and (
                site.FILTER == "REPEAT" or "REPEAT" in site.FILTER):
            # exclude sites in repeat regions
            continue

        if site.is_monomorphic:
            if site.POS > indel_end:
                call_to_bases(site, align_pos, cds_align, sample_list)
                indel_end = 0
            continue

        if site.is_indel and site.aaf == 0.0:
            # Indel record at which no sample carries the alternate allele.
            if site.POS > indel_end:
                call_to_bases(site, align_pos, cds_align, sample_list)
                indel_end = 0
            continue

        if not site.is_snp:
            problem_site = site.CHROM + '\t' + str(site.POS)
            error_message = 'Could not assign ' + problem_site + ' to site type'
            sys.exit(error_message)

        if len(site.ALT) > 1:  # multiallelic SNP
            continue

        if site.aaf == 1.0 or site.aaf == 0.0:
            # SNP record that is fixed in this sample: called without
            # looking at FILTER (indel_end is deliberately left untouched,
            # as before).
            if site.POS > indel_end:
                call_to_bases(site, align_pos, cds_align, sample_list)
            continue

        # SNP polymorphic in this sample.
        if filtered:
            if site.FILTER == 'PASS' and site.POS > indel_end:
                call_to_bases(site, align_pos, cds_align, sample_list)
        elif site.POS > indel_end:
            call_to_bases(site, align_pos, cds_align, sample_list)

    if cds_dict[gene_id][transcript][-2] == '-':  # reverse-complement when on '-' strand
        cds_align = rc_align(cds_align)
        cds_pos_list.reverse()

    cds_align_nostop = cds_align.extract(0, cds_align.ls - 3)  # drop the stop codon at the end
    return cds_align_nostop, cds_pos_list[0:-3]