コード例 #1
0
def resampling_f(fastadict, seq, n, k):
    """Resample bases of one sequence and score each resampled string.

    The sequence stored under ``seq`` has every ``N``/``n`` removed, and the
    cleaned string is written back into ``fastadict`` (the caller's mapping
    is modified).  ``n * k`` characters are then drawn with replacement and
    the sampled array's raw bytes are reinterpreted as fixed-width ``k``-byte
    strings.

    NOTE(review): the byte reinterpretation yields ``n`` strings of ``k``
    bases only if each sampled character occupies exactly one byte --
    confirm for the interpreter/numpy in use.

    Returns a list holding, for every resampled string, the element at
    index 2 of ``heterozygosity(...)`` (presumably the heterozygosity value
    itself -- verify against ``heterozygosity``).
    """
    cleaned = fastadict[seq].replace("N", "").replace("n", "")
    fastadict[seq] = cleaned

    # Draw all bases in one call, then regroup them into k-wide strings.
    drawn = np.random.choice(tuple(cleaned), size=(n * k, ), replace=True)
    resampled = drawn.view('S{}'.format(k))

    return [list(heterozygosity(piece))[2] for piece in resampled]
コード例 #2
0
    return intergenicDict


intergenDict = intergenicCoord(intergenic)

if winSize == 0 and step == 0:
    # Region mode (no sliding window): print one tab-separated line per
    # intergenic region with the scaffold, both coordinates and the first
    # three fields returned by heterozygosity().
    new_fastdict = {}
    for scaffold, coordinates in intergenDict.items():
        # Regions on scaffolds missing from the FASTA dictionary are skipped.
        if scaffold not in fastaDict:
            continue
        scaffold_seq = fastaDict[scaffold]
        for coordinate in coordinates:
            # NOTE(review): the coordinates are used directly as a Python
            # slice, i.e. start is 0-based and end is exclusive -- confirm
            # this matches the coordinate file's convention.
            intron = scaffold_seq[int(coordinate[0]):int(coordinate[1])]
            # Score the region once and reuse the result for all columns.
            het = heterozygosity(intron)
            print('\t'.join([scaffold,
                             str(coordinate[0]),
                             str(coordinate[1]),
                             str(het[0]),
                             str(het[1]),
                             str(het[2])]))

# Get a sequence in fasta that lies within the intergenic sequence -- V2
# (taken whenever winSize and step are not both 0; presumably the
# sliding-window mode -- the rest of this branch is outside this excerpt).
else:
    # Print the tab-separated column header for the per-window report
    print("Scaffold" + "\t" + "Orientation" + "\t" + "Window" + "\t" +
          "Window_Start" + "\t" + "Window_End" + "\t" + "Distance_from_Gene" +
          "\t" + "Intergenic_start" + "\t" + "Intergenic_end" + "\t" +
          "Num_SNPs" + "\t" + "Num_bases" + "\t" + "Het")
    # NOTE(review): new_fastdict is assigned here but never read in the
    # visible code -- confirm whether it is still needed.
    new_fastdict = {}
    for scaffold in intergenDict.keys():
        # Counter is reset for every scaffold
        counter = 0
        # Only scaffolds that also exist in the FASTA dictionary are handled
        if scaffold in fastaDict.keys():
            # Debug aid (disabled): print scaffold
コード例 #3
0
#!/usr/bin/python
from __future__ import division
import sys
from fasta import readfasta
from het import heterozygosity

# Path of the FASTA file, taken from the first command-line argument.
fasta = sys.argv[1]

# Load the FASTA file into a dictionary via readfasta().
with open(fasta, 'r') as handle:
    fasta_dict = readfasta(handle)

# One tab-separated output line per dictionary entry: its key followed by
# the first three fields returned by heterozygosity().
for name, sequence in fasta_dict.items():
    result = heterozygosity(sequence)
    columns = (name, str(result[0]), str(result[1]), str(result[2]))
    print("\t".join(columns))
コード例 #4
0
#**********************************************
# Group the tab-separated coordinate records by their first column: each
# key maps to a list with the remaining columns of every matching line.
intron_dict = {}
for record in intron_coord:
    fields = record.strip('\n').split('\t')
    intron_dict.setdefault(fields[0], []).append(fields[1:])

#**************************************************************************
# Extract the intronic sequences from fasta and read them into a dictionary
#**************************************************************************
for scaf, coordinates in intron_dict.items():
    # Only the part of the key before the first underscore is used to look
    # up the sequence in the FASTA dictionary.
    scaffold = scaf.split('_', 1)[0]
    if scaffold not in fastaseq:
        continue
    scaffold_seq = fastaseq[scaffold]
    for coordinate in coordinates:
        # The coordinates are used directly as a Python slice: for ABCD and
        # coordinates 1 and 3 this yields "BC" (start index 1, end index 3
        # exclusive).
        # NOTE(review): if the coordinate file is 1-based inclusive, the
        # start needs to be shifted by -1 -- confirm the convention.
        intron_seq = scaffold_seq[int(coordinate[0]):int(coordinate[1])]
        # Score the intron once and reuse the result for all three columns.
        het = heterozygosity(intron_seq)
        print('\t'.join([scaf,
                         str(coordinate[0]),
                         str(coordinate[1]),
                         str(het[0]),
                         str(het[1]),
                         str(het[2])]))

intron_coord.close()
fasta.close()
コード例 #5
0
        for coordinate in intergenDict.get(scaffold):
            # Bounds of the intergenic region; the slice below also includes
            # the base at the end coordinate (hence the + 1).
            region_start = int(coordinate[0])
            region_end = int(coordinate[1])
            new_scaffold = fastaDict.get(scaffold)[region_start:region_end + 1]

            # Split the region at its midpoint (rounded up for odd lengths).
            # The right half is reversed, so its windows are walked from the
            # region's end back towards the middle.
            midpoint = (len(new_scaffold) + 1) // 2
            sequence_left = new_scaffold[0:midpoint]
            sequence_right = new_scaffold[:(midpoint - 1):-1]

            # NOTE(review): window coordinates are derived from
            # counter * winSize, not from step, so they only line up with
            # the windows returned by slidingWindow if consecutive windows
            # start winSize apart -- confirm against slidingWindow.

            # Left half: windows are numbered from 1 and measured from the
            # region start.  Output is one space-separated line per window;
            # the last field is the whole heterozygosity() result.
            for counter, chunk in enumerate(
                    slidingWindow(sequence_left, winSize, step), 1):
                end = region_start + int(counter * winSize)
                start = end - winSize
                distance = abs(region_start - end)
                print(" ".join(str(field) for field in (
                    scaffold, "left", counter, start, end, distance,
                    coordinate[0], coordinate[1], heterozygosity(chunk))))

            # Right half: same report, measured backwards from the region end.
            for counter, chunk in enumerate(
                    slidingWindow(sequence_right, winSize, step), 1):
                end = region_end - int(counter * winSize)
                start = end + winSize
                distance = region_end - end
                print(" ".join(str(field) for field in (
                    scaffold, "right", counter, start, end, distance,
                    coordinate[0], coordinate[1], heterozygosity(chunk))))

# Close the two input handles once the report is complete (presumably the
# FASTA file and the intergenic-coordinate file -- they are opened outside
# this excerpt).
fastaseq.close()
intergenic.close()
コード例 #6
0
# Read the tab-separated window file and group its records by the first
# column: each key maps to a list with the remaining columns of every
# matching line.
with open(window, 'r') as windowfile:
    for record in windowfile:
        fields = record.strip('\n').split('\t')
        window_dict.setdefault(fields[0], []).append(fields[1:])
# print window_dict

# For every window, cut the matching piece out of the scaffold sequence.
# The first coordinate is shifted by -1 before slicing and the second is
# used as the (exclusive) slice end.  Pieces are stored per scaffold in the
# same order as the windows in window_dict.
fasta_win = {}
for scaffold, windows in window_dict.items():
    for element in windows:
        first = int(element[0]) - 1
        last = int(element[1])
        piece = fasta_dict[scaffold][first:last]
        fasta_win.setdefault(scaffold, []).append(piece)

# print fasta_win

# Print one space-separated line per window: scaffold, the first three
# window columns and the first three fields returned by heterozygosity().
for scaffold in fasta_win:
    windows = window_dict[scaffold]
    # Pair each sequence with its window by position.  Looking the position
    # up with list.index() returns the first equal element, so windows with
    # identical sequences would all be reported with the first window's
    # columns (and the lookup is quadratic).
    for index, sequence in enumerate(fasta_win[scaffold]):
        window = windows[index]
        # Score the window once and reuse the result for all three columns.
        het = heterozygosity(sequence)
        print(" ".join(str(field) for field in (
            scaffold, window[0], window[1], window[2],
            het[0], het[1], het[2])))