Example #1
0
def calc_tree(alignment):
    """Load an alignment, trim its last codon and infer a tree with PhyML.

    The last three columns (the stop codon) are removed first.  The
    trimmed alignment is always returned; the tree is ``None`` when the
    trimmed alignment is not a multiple of three long or holds fewer than
    three sequences.

    Returns:
        tuple: ``(trimmed_alignment, tree_or_None)``
    """
    loaded = egglib.Align(alignment)
    coding_end = len(loaded.sequence(1)) - 3  # remove stop codon
    trimmed = loaded.extract(0, coding_end)

    # Length is measured on the sequence at index 1, as in the trimming step.
    if len(trimmed.sequence(1)) % 3 != 0:
        print(alignment, " not in frame")
        return (trimmed, None)

    if trimmed.ns() < 3:
        return (trimmed, None)

    # Upper-case every sequence in place before handing it to PhyML.
    for idx in range(trimmed.ns()):
        upper_cased = trimmed.sequence(idx).upper()
        trimmed.sequence(idx, sequence=upper_cased)

    tree, _loglk = wrappers.phyml(trimmed)
    return (trimmed, tree)
Example #2
0
def withinGeneLD(directory):
    """Write pairwise LD statistics for every alignment file in a directory.

    Each path returned by ``listdir_fullpath(directory)`` is loaded as an
    alignment.  For every unordered pair of polymorphic sites, one
    tab-separated row (gene, the two site indices, distance, D' and r2)
    is written to ``LDstats.txt`` in the current working directory.  The
    gene name is the part of the file's base name before the first
    underscore.

    The output file is opened in a ``with`` block so that it is flushed
    and closed even when an alignment fails to load or to be analysed.
    """
    with open("LDstats.txt", "w") as outfile:
        outfile.write("Gene\tPosition1\tPosition2\tDistance\tD'\tR2\n")
        for f in listdir_fullpath(directory):
            a = egglib.Align(f)
            gene = os.path.basename(f).split("_")[0]
            siteIndices = a.polymorphism()['siteIndices']
            ldStats = a.matrixLD()
            nSites = len(siteIndices)
            for i in range(nSites):
                pos1 = siteIndices[i]
                # Only j > i: each unordered pair is reported once.
                for j in range(i + 1, nSites):
                    pos2 = siteIndices[j]
                    distance = ldStats['d'][pos1][pos2]
                    dPrime = ldStats['Dp'][pos1][pos2]
                    rSquared = ldStats['r2'][pos1][pos2]
                    outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                        gene, pos1, pos2, distance, dPrime, rSquared))
Example #3
0
File: pNpS.py Project: hui-liu/pNpS
def get_seqid_str(ref_genome, chrom):
    """Build the region string passed to the gffutils ``limit`` parameter.

    Scans the FASTA file ``ref_genome`` for the first record whose name
    equals ``chrom``.  When found, the record is copied into a fresh
    alignment and the string ``"<chrom>:1-<record length>"`` is built.

    Returns:
        tuple: ``(seqid_str, seq)``; ``seqid_str`` is ``None`` and ``seq``
        is an empty alignment when no record matches.
    """
    seq = egglib.Align()

    with egglib.io.fasta_iter(ref_genome) as records:
        for record in records:
            if record.name == chrom:
                seq.add_sample(record.name, record.sequence.str())
                # Only the first matching record is used.
                return chrom + ':1-%s' % record.ls, seq

    return None, seq
def calc_pi(alignment, sequence):
    """Return the names of the sequences with the lowest pairwise Pi.

    Every sequence in ``alignment`` is upper-cased, then each one other
    than the reference (looked up by name with ``find(sequence,
    strict=False)``) is paired with the reference and the ``'Pi'`` value
    of that two-sequence alignment is recorded.

    Returns:
        list: names whose Pi equals the minimum observed (ties included).
    """
    aln = egglib.Align(alignment)
    for idx in range(aln.ns()):
        aln.sequence(idx, sequence=aln.sequence(idx).upper())

    ref_index = aln.find(sequence, strict=False)

    pi_by_name = {}
    pi_values = []
    for idx in range(aln.ns()):
        if idx == ref_index:
            continue
        pair = egglib.Align.create([aln[ref_index], aln[idx]])
        pair_pi = pair.polymorphism()['Pi']
        pi_by_name[aln.name(idx)] = pair_pi
        pi_values.append(float(pair_pi))

    lowest = min(pi_values)
    return [name for name in pi_by_name
            if float(pi_by_name[name]) == lowest]
Example #5
0
def calc_stats(alignment, outgroup):
    """Compute diversity statistics (and optional MK results) for a file.

    Sequences are upper-cased; when ``args.frame`` is set they are also
    passed through ``replace_stop`` and the coding statistics piN, piS, NS
    and S are added.  For every name in ``outgroup`` an MK table and NI
    value are computed on a copy of the alignment that has its last three
    columns removed and all the *other* outgroups deleted.

    Args:
        alignment: path of the alignment file to load.
        outgroup: list of outgroup names, or ``None`` to skip the MK step.

    Returns:
        dict: statistic name -> value.  An empty dict is returned when
        ``args.frame`` is set and the alignment length is not a multiple
        of three.  The process exits if grouping an outgroup raises
        ``IndexError``.
    """
    aln = egglib.Align(alignment)
    for idx in range(aln.ns()):
        aln.sequence(idx, sequence=aln.sequence(idx).upper())
        if args.frame:
            aln.sequence(idx, sequence=replace_stop(aln.sequence(idx)))

    summary = aln.polymorphism()
    statDict = {}
    for stat_name, source_key in (('theta', 'thetaW'),
                                  ('pi', 'Pi'),
                                  ('tajimaD', 'D')):
        statDict[stat_name] = summary[source_key]

    if args.frame:
        if len(aln.sequence(1)) % 3 != 0:
            print("The following alignment is not in frame:")
            print(alignment)
            return {}
        coding = aln.polymorphismBPP(dataType=4)
        for stat_name, source_key in (('piN', 'PiNS'),
                                      ('piS', 'PiS'),
                                      ('NS', 'NSsites'),
                                      ('S', 'Ssites')):
            statDict[stat_name] = coding[source_key]

    if outgroup is None:
        return statDict

    for o in outgroup:
        trimmed = aln.extract(0, len(aln.sequence(1)) - 3)  # remove stop codon

        # Drop every outgroup except the one currently being tested.
        rivals = outgroup[:]
        rivals.remove(o)
        for rival in rivals:
            rival_index = trimmed.find(rival, strict=False)
            if rival_index is None:
                continue
            del trimmed[rival_index]

        try:
            trimmed.group(trimmed.find(o, strict=False), group=999)
        except IndexError:
            print("The following outgroup is not present in alignment")
            print(o, aln.find(o, strict=False))
            sys.exit()

        mk_stats = trimmed.polymorphismBPP(dataType=4)
        statDict['MK_' + o] = mk_stats['MK']
        statDict['NI_' + o] = mk_stats['NI']

    return statDict
Example #6
0
# GP2=metal

# Header row for the tab-separated per-window divergence table on stdout.
print "SCAFFOLD\tPAIR\tSTART\tEND\tDxy\tDa\tPi1\tPi2"

# Scaffold identifiers ("1" .. "47"); each one maps to a file named
# fastas + <id> + ".fasta" (see the loop below).
scafs = [
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14",
    "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
    "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38",
    "39", "40", "41", "42", "43", "44", "45", "46", "47"
]
# The list above was generated with:
#   ls HiQ_Sb_woNsSin_woSb32Sb7_indivscafs_fasta_concat/ | while read f; do g=`echo $f|sed 's/HiQ_Sb_woNsSin_woSb32Sb7_scaffold_//'| sed 's/.fasta//'`; echo -n $g"\",\""; done

# Earlier variant that listed the directory instead of using `scafs`:
#for fasta in os.listdir(path+"/"+scaffolds):
for scaf in scafs:
    # `fastas`, `path`, `pop_ids` and `pops` are not defined in this
    # excerpt; presumably set earlier in the script -- TODO confirm.
    fasta = fastas + scaf + ".fasta"
    data = egglib.Align(path + fasta, groups=False)
    # population id -> list of sequences of this scaffold in that population
    popdata = {}
    for pop in pop_ids:
        popdata[pop] = []
    for seq in data:
        # Strip the "<scaffold>_" text from the name so that it can be
        # matched against the sample names stored in `pops`.
        seq.name = seq.name.replace(scaf + '_', '')
        for pop in pop_ids:
            if seq.name in pops[pop]:
                popdata[pop].append(seq)
    # Visit every unordered pair of populations except the three pairs
    # listed below (compared as sorted lists, so order does not matter).
    for pair in itertools.combinations(pop_ids, 2):
        if sorted(pair) not in [
                sorted(['WAca', 'CAWAca']),
                sorted(['WAnm', 'NMWAnm']),
                sorted(['AKEU', 'AK'])
        ]:  # original author's note: *****CHECK THIS!!!!
            # Alignment holding only the first population of the pair.
            align1 = egglib.Align.create(popdata[pair[0]])
Example #7
0
# Sample names belonging to population GP2.
GP2 = [
    "502931_1151576", "502931_1151572", "502931_1143066", "502931_1143067",
    "502931_1143068", "502931_1151574", "502931_1143052", "502931_1143050",
    "502931_1143051", "502931_1143078", "502931_1151580", "502931_1151581",
    "502931_1151582", "502931_1143070", "502931_1143073", "502931_1151559",
    "502931_1151560", "502931_1151561", "502931_1151562", "502931_1151564"
]
# population id -> list of sample names.  GP1 is not defined in the
# visible lines; presumably declared just above -- TODO confirm.
pops = {'GP1': GP1, 'GP2': GP2}
pop_ids = ['GP1', 'GP2']
# Header row for the tab-separated per-window statistics table on stdout.
print "scaffold\twindow\tgroup\tS\tthetaW\tPi\tD\tHe\tK"

# Dataset prefix: the directory name with "concatscafs_fasta" removed.
# NOTE(review): `dir` (and `file` below) shadow Python builtins.
dataset = dir.replace("concatscafs_fasta", "")

# One alignment file per scaffold inside path + dir.
for file in os.listdir(path + dir):
    # print file
    data = egglib.Align(path + dir + "/" + file, groups=False)
    # population id -> list of sequences of this file in that population
    popdata = {}
    for pop in pop_ids:
        popdata[pop] = []
    # Reduce the file name to the scaffold id by removing the
    # "<dataset>_scaffold_" text and the ".fasta" extension.
    scaf = file.replace(dataset + "_scaffold_", "")
    scaf = scaf.replace(".fasta", "")
    print scaf
    for seq in data:
        # Strip "scaffold_<id>_" so names match the entries in `pops`.
        seq.name = (seq.name).replace("scaffold_" + scaf + "_", "")
        # print seq.name
        for pop in pop_ids:
            if seq.name in pops[pop]:
                popdata[pop].append(seq)

# Debugging aid left by the original author:
#     for name,seq in popdata['GP1']:
#         print name,seq
def step(ext, dirname, names):
    """Write a randomly sub-sampled copy of every matching FASTA alignment.

    For each entry of ``names`` whose lower-cased name ends with ``ext``
    the alignment is loaded, two individuals are drawn at random from each
    predefined group of sample indices, and the selected sequences are
    written in a fixed group order to ``<name>.fa`` (FASTA) and
    ``<name>.nexus`` (NEXUS).

    Args:
        ext: file extension to match (case-insensitive).
        dirname: unused.  NOTE(review): the ``(arg, dirname, names)``
            signature looks like an ``os.path.walk`` visitor; if so,
            ``name`` is relative to ``dirname`` but is opened relative to
            the current directory -- confirm with the caller.
        names: candidate file names.
    """
    # Lower-case the extension so the comparison below is case-insensitive.
    ext = ext.lower()

    num_of_inds_per_sps = 2  # individuals drawn per group

    # Groups that are randomly sub-sampled, listed in the order in which
    # the draws are made (this keeps the random stream reproducible for a
    # given seed).  Indices are 0-based positions of the samples in the
    # alignment, i.e. "position in the list - 1".
    sampled_groups = [
        ('brau', [0, 47]),
        ('chlo', [1, 2]),
        ('clar', [3, 4]),
        ('defi', [5, 6, 39]),
        ('irano', [69, 70]),
        ('mix1', [9, 12, 13, 14]),
        ('mix2', [10, 11]),
        ('par', [17, 41, 66]),
        ('por1', [19, 21]),
        ('por2', [18, 22, 23]),
        ('praB', [25, 28]),
        ('praC', [24, 26, 27]),
        ('rad1', [15, 16, 20, 29, 30, 31, 32, 33, 35, 36, 37]),
        ('rad2', [34, 38, 40]),
        ('rud1', [45, 46, 56]),
        ('rud2', [42, 43, 44, 68]),
        ('sax', [8, 48]),
        ('stei', [49, 50, 51]),
        ('val', [52, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 53, 54]),
    ]
    # Single-individual group: used whole, never sampled.
    der_list = [7]

    # Order in which the groups are appended to the output alignment.
    # NOTE(review): 'irano' is sampled fifth but written last -- confirm
    # that this ordering is intentional.
    output_order = [
        'brau', 'chlo', 'clar', 'defi', 'der', 'mix1', 'mix2', 'par',
        'par12', 'por1', 'por2', 'praB', 'praC', 'rad1', 'rad2', 'rud1',
        'rud2', 'sax', 'stei', 'val', 'irano',
    ]

    for name in names:
        if not name.lower().endswith(ext):
            continue

        my_align = egglib.io.from_fasta(name)

        selected = {}
        for label, indices in sampled_groups:
            selected[label] = random.sample(indices, num_of_inds_per_sps)
        selected['der'] = der_list
        # NOTE(review): par12_list is not defined inside this function;
        # presumably a module-level list.  If it is not, this line raises
        # NameError -- TODO confirm and supply the indices.
        selected['par12'] = par12_list

        # Assemble the sub-sampled alignment group by group.
        rand_align = egglib.Align()
        for label in output_order:
            rand_align.add_samples(my_align.subset(selected[label]))

        rand_align.to_fasta(fname=name + '.fa')
        nex = rand_align.nexus()
        # Context manager guarantees the NEXUS file is flushed and closed.
        with open(name + '.nexus', 'w') as nexfile:
            nexfile.write(nex)
Example #9
0
    statDict['theta'] = polyDict['thetaW']
    statDict['pi'] = polyDict['Pi']
    statDict['tajimaD'] = polyDict['D']
    statDict['FayWuH'] = polyDict['H']
    return statDict


# Script entry: parse the command line into the alignment path, the
# sliding-window width and step, and the optional outgroup name.
alignment, winWidth, winStep, outgroup = get_arguments(sys.argv[1:])

# Without an alignment there is nothing to do: show the usage and stop.
if alignment is None:
    usage()
    sys.exit()

# Results go to "windowStats_<alignment path without extension>.txt".
outfile = open('windowStats_' + os.path.splitext(alignment)[0] + '.txt', 'w')
outfile.write("Start\tStop\tTheta\tPi\tTajimasD\tFay&WuH\n")
align = egglib.Align(alignment)
# Upper-case every sequence in place.
for i in range(align.ns()):
    align.sequence(i, sequence=align.sequence(i).upper())
if outgroup is not None:
    # Assign the outgroup sequence to group 999 (presumably EggLib's
    # outgroup label -- TODO confirm).
    align.group(align.find(outgroup, strict=False), group=999)
# Coordinates of the first window; advanced by winStep per window below.
start = 0
stop = winWidth

# Not filled in within the visible lines; presumably per-window
# accumulators used further down -- TODO confirm.
location = []
TD = []

for window in align.slider(winWidth, winStep):
    stats = calc_stats(window)
    start += winStep
    stop += winStep
    outfile.write("%i\t%i\t%s\t%s\t%s\t%s\n" %
Example #10
0
File: pNpS.py Project: hui-liu/pNpS
def extract_cds_align(vcf, min_dp, max_dp, sample_list, gff, gene_id, cds_dict, filtered=True):

    """
    Walk site by site through the VCF records of a gene and build an
    alignment of its coding sequence (CDS), two haplotypes per sample.

    Only the first transcript listed for ``gene_id`` in ``cds_dict`` is
    used.  Every haplotype starts as a run of 'N' as long as the CDS;
    sites that pass all filters are written in with ``call_to_bases``.
    Sites are skipped when REF is 'N', when they are a polymorphic indel
    or a spanning deletion ('*'), when any genotype depth is missing or
    zero, when the mean depth is outside ``[min_dp, max_dp]``, when the
    call rate is below 1, when they are flagged "REPEAT", when they are
    multiallelic SNPs, or when they lie inside a previously seen deletion.
    On the '-' strand the alignment is reverse-complemented and the
    position list reversed.  The last three columns (stop codon) are
    dropped from both return values.

    Args:
        vcf, gff: passed through to ``extract_gene``.
        min_dp, max_dp: inclusive bounds on the mean genotype depth.
        sample_list: sample names; each contributes ``<name>_1``/``_2``.
        gene_id: key into ``cds_dict``.
        cds_dict: ``gene -> transcript -> entries``; tuple entries are
            inclusive ``(start, end)`` CDS blocks and the second-to-last
            entry is the strand.
        filtered: when true, segregating SNPs must have FILTER == 'PASS'.

    Returns:
        tuple: ``(cds_alignment_without_stop, cds_positions_without_stop)``

    Exits the process if a site is neither monomorphic, an indel nor a SNP.
    """

    # for transcript in cds_dict[gene_id]:  (only the first one is used)
    transcript = list(cds_dict[gene_id].keys())[0]
    transcript_entries = cds_dict[gene_id][transcript]

    cds_pos_list = []
    for coords in transcript_entries:
        if isinstance(coords, tuple):
            cds_pos_list.extend(range(coords[0], coords[1] + 1))

    # Set copy for O(1) membership tests in the per-site loop below.
    cds_pos_set = set(cds_pos_list)

    len_bp = len(cds_pos_list)
    cds_align = egglib.Align(nsit=len_bp)

    for sample in sample_list:
        cds_align.add_sample(sample + '_1', data='N'*len_bp)
        cds_align.add_sample(sample + '_2', data='N'*len_bp)

    align_pos = -1  # column of the current CDS site in cds_align
    indel_end = 0   # last reference position covered by a deletion

    gene_region = extract_gene(vcf, gff, gene_id)

    for site in gene_region:

        if site.POS not in cds_pos_set:  # only extract genic sites in CDS blocks
            continue

        # Advanced for every CDS site met, whether it is kept or not.
        align_pos += 1

        if site.REF == 'N':
            continue

        if site.is_indel and site.aaf != 0.0:
            # NOTE(review): this compares the REF string length with
            # len(site.ALT); if ALT is a list of alleles that is an allele
            # count rather than an allele length -- confirm intent.
            if len(site.REF) > len(site.ALT):
                indel_end = site.POS + len(site.REF) - 1
            continue

        if len(site.ALT) >= 1 and site.ALT[-1] == '*':  # SNPs at spanning deletion
            continue

        all_dp = [x['DP'] for x in site.samples]  # get the genotype depths

        if None in all_dp:  # Exclude sites where a genotype DP field is set to '.'
            continue

        mean_dp = np.mean(all_dp)

        if mean_dp < min_dp or mean_dp > max_dp:  # depth filter
            continue

        if 0 in all_dp or site.call_rate < 1.0:  # only consider sites where all samples have coverage
            continue

        if site.FILTER is not None:
            if site.FILTER == "REPEAT" or "REPEAT" in site.FILTER:  # exclude sites in repeat regions
                continue

        # Sites inside a previously recorded deletion are never called.
        clear_of_deletion = site.POS > indel_end

        # Invariant sites: monomorphic records and indels absent from the sample.
        if site.is_monomorphic or (site.is_indel and site.aaf == 0.0):
            if clear_of_deletion:
                call_to_bases(site, align_pos, cds_align, sample_list)
                indel_end = 0
            continue

        if not site.is_snp:
            problem_site = site.CHROM + '\t' + str(site.POS)
            error_message = 'Could not assign ' + problem_site + ' to site type'
            sys.exit(error_message)

        if len(site.ALT) > 1:  # multiallelic SNP
            continue

        if site.aaf == 1.0 or site.aaf == 0.0:  # SNP record that is fixed in our sample
            if clear_of_deletion:
                call_to_bases(site, align_pos, cds_align, sample_list)
            continue

        # Segregating SNP: must pass the VCF filter when `filtered` is set.
        if (not filtered or site.FILTER == 'PASS') and clear_of_deletion:
            call_to_bases(site, align_pos, cds_align, sample_list)

    if transcript_entries[-2] == '-':  # reverse-complement when on '-' strand
        cds_align = rc_align(cds_align)
        cds_pos_list.reverse()

    cds_align_nostop = cds_align.extract(0, cds_align.ls - 3)  # drop the stop codon at the end

    return cds_align_nostop, cds_pos_list[0:-3]