예제 #1
0
def get_2d_best_separation(sequences1, sequences2, max_distance, length_range):
    """Score how well windowed AT content separates two sets of sequences.

    For every prefix length ``d`` in ``range(max(length_range), max_distance)``
    and every window ``length`` in ``length_range``, each sequence is cut to
    its first ``d`` items and summarised by the maximum ``get_at_content``
    over its sliding windows, multiplied by 1000 and truncated to ``int``.
    The two resulting lists are passed to ``separation_score``.

    Args:
        sequences1: First collection of sliceable sequences.
        sequences2: Second collection of sliceable sequences.
        max_distance: Exclusive upper bound of the prefix lengths tested.
        length_range: Re-iterable collection of window lengths.

    Returns:
        A tuple ``(arr, data)``. ``arr[i, j]`` is the score returned by
        ``separation_score`` for the i-th prefix length and j-th window
        length; ``data[(i, j)]`` is
        ``(d, length, threshold, passed1, passed2)`` for the same cell.
    """

    def peak_scores(seqs, length):
        # One integer per sequence: its highest windowed AT content, x1000.
        return [
            int(
                max([get_at_content(w)
                     for w in sliding_window(s, length)]) * 1000)
            for s in seqs
        ]

    prefix_lengths = range(max(length_range), max_distance)
    arr = np.zeros((len(list(prefix_lengths)), len(list(length_range))))
    data = {}

    for row, d in enumerate(prefix_lengths):
        trimmed1 = [s[:d] for s in sequences1]
        trimmed2 = [s[:d] for s in sequences2]
        for col, length in enumerate(length_range):
            threshold, score, passed1, passed2 = separation_score(
                peak_scores(trimmed1, length), peak_scores(trimmed2, length))
            arr[row, col] = score
            data[(row, col)] = (d, length, threshold, passed1, passed2)

    return arr, data
예제 #2
0
def get_pam_sequences(seqrecord, chrname):
    """Print FASTA-style entries for sites that start with a ``CC`` pair.

    The forward pass scans ``seqrecord`` with a window of two; whenever the
    window equals ``("C", "C")`` the slice of ``PAM_LENGTH + 2`` items that
    starts there is printed with strand ``+``.  The reverse pass scans the
    reverse complement the same way, converts the hit position back to
    forward coordinates and prints the reverse complement of that forward
    slice with strand ``-``.  Slices whose string form is not exactly
    ``PAM_LENGTH + 2`` characters long are skipped.

    Args:
        seqrecord: Sequence object supporting ``len``, slicing and
            ``reverse_complement()``.
        chrname: Name written into each header line.
    """
    site_length = PAM_LENGTH + 2
    lseq = len(seqrecord)
    pam = ("C", "C")

    # Forward strand: the hit position is the site start.
    for pos, dn in enumerate(sliding_window(seqrecord, 2)):
        if dn != pam:
            continue
        start = pos
        stop = start + 2 + PAM_LENGTH
        seq = str(seqrecord[start:stop])
        if len(seq) == site_length:
            print(">%s|%d|%d|+\n%s" % (chrname, start, stop, seq))

    # Reverse strand: map the hit on the reverse complement back to
    # forward-strand coordinates before slicing.
    reverse = seqrecord.reverse_complement()
    for pos, dn in enumerate(sliding_window(reverse, 2)):
        if dn != pam:
            continue
        start = lseq - pos - 2 - PAM_LENGTH
        stop = start + 2 + PAM_LENGTH
        seq = str(seqrecord[start:stop].reverse_complement())
        if len(seq) == site_length:
            print(">%s|%d|%d|-\n%s" % (chrname, start, stop, seq))
예제 #3
0
def get_at_flanks_max(interval, genome, flank, window):
    """Return the mean of the peak windowed AT content of both flanks.

    The upstream flank is ``[interval.start - flank, interval.start)`` and
    the downstream flank is ``[interval.stop, interval.stop + flank)`` on
    ``genome[interval.chrom]``.  Each flank is scanned with sliding windows of
    ``2 * window + 1`` items and reduced to its maximum ``get_at_content``.

    Args:
        interval: Object with ``chrom``, ``start`` and ``stop`` attributes.
        genome: Mapping from chromosome name to a sliceable record whose
            slices expose ``.seq``.
        flank: Number of positions taken on each side of the interval.
        window: Half-width of the sliding window.

    Returns:
        The average of the two per-flank maxima.
    """
    size = 2 * window + 1
    chrom = genome[interval.chrom]

    def peak_at(lower, upper):
        # Highest AT content over all windows inside chrom[lower:upper].
        return max([
            get_at_content(w)
            for w in sliding_window(chrom[lower:upper].seq, size)
        ])

    upstream = peak_at(interval.start - flank, interval.start)
    downstream = peak_at(interval.stop, interval.stop + flank)
    return (upstream + downstream) / 2
예제 #4
0
파일: peaks.py 프로젝트: afilipch/afp
def detect_peaks(signal):
    """Collect ``(start, top, end, height)`` tuples for peaks in ``signal``.

    The signal is scanned with windows of three values.  For each window, in
    this order:

    1. a local minimum in the middle closes an open peak at ``c + 1`` and
       moves the candidate start to ``c + 1``;
    2. a sign change between the middle and the right value closes an open
       peak at ``c + 2``, or otherwise moves the candidate start to ``c + 2``;
    3. a positive local maximum in the middle opens (or re-tops) a peak with
       ``top = c + 1`` and ``height`` equal to the middle value.

    ``c`` is the window counter from ``enumerate``; it equals the index of
    the window's first value only if ``sliding_window`` yields consecutive
    step-1 windows (presumed, not verified here).  A peak still open when the
    signal ends is not reported.

    Returns:
        List of ``(start, top, end, height)`` tuples in order of detection.
    """
    locs = []
    in_peak = False
    start = 0

    for idx, (left, mid, right) in enumerate(sliding_window(signal, 3)):
        centre = idx + 1
        after = idx + 2

        # Local minimum: finish any open peak, next peak starts here.
        if left > mid < right:
            if in_peak:
                locs.append((start, top, centre, height))
                in_peak = False
            start = centre

        # Sign change between the middle and right values.
        if mid * right < 0:
            if in_peak:
                locs.append((start, top, after, height))
                in_peak = False
            else:
                start = after

        # Positive local maximum: remember it as the current peak top.
        if left < mid > right and mid > 0:
            top = centre
            height = mid
            in_peak = True

    return locs
예제 #5
0
def get_at_profile(regions, genome):
    """Return the position-wise mean AT profile over ``regions``.

    Each region is extended by 10 positions on both sides, upper-cased and
    scanned with sliding windows of 20; the per-region lists of
    ``get_at_content`` values are stacked into an array and averaged along
    axis 0.

    Args:
        regions: Iterable of objects with ``chrom``, ``start`` and ``stop``.
        genome: Mapping from chromosome name to a sliceable record whose
            slices expose ``.seq``.

    Returns:
        Result of ``np.array(profiles).mean(axis=0)``.
    """
    profiles = []
    for region in regions:
        record = genome[region.chrom]
        lower, upper = region.start - 10, region.stop + 10
        padded = str(record[lower:upper].seq.upper())
        profiles.append(
            [get_at_content(w) for w in sliding_window(padded, 20)])

    return np.array(profiles).mean(axis=0)
def get_peak_at_content_max(regions, genome, at_length):
    """Return the maximum windowed AT content of every region.

    Args:
        regions: Iterable of objects with ``chrom``, ``start`` and ``stop``.
        genome: Mapping from chromosome name to a sliceable record whose
            slices expose ``.seq``.
        at_length: Size of the sliding window.

    Returns:
        List with one value per region: the largest ``get_at_content`` over
        all sliding windows of the upper-cased region sequence.
    """

    def region_peak(region):
        record = genome[region.chrom]
        text = str(record[region.start:region.stop].seq.upper())
        return max(
            [get_at_content(w) for w in sliding_window(text, at_length)])

    return [region_peak(region) for region in regions]
예제 #7
0
def transcript_content(interval, genome, window):
    """Return the strand-aware AT profile of a transcript, resized to 100.

    The sequence of ``genome[interval.chrom][interval.start:interval.stop]``
    is upper-cased (and reverse-complemented first for the ``'-'`` strand),
    scanned with sliding windows of ``window`` items, and the resulting
    ``get_at_content`` values are passed to ``array2fixed_length(..., 100)``.

    Args:
        interval: Object with ``chrom``, ``start``, ``stop`` and ``strand``.
        genome: Mapping from chromosome name to a sliceable record whose
            slices expose ``.seq``.
        window: Size of the sliding window.

    Returns:
        Whatever ``array2fixed_length(profile, 100)`` returns.

    Raises:
        ValueError: If ``interval.strand`` is neither ``'+'`` nor ``'-'``.
            Previously this case failed later with an unbound local variable.
    """
    strand = interval.strand
    # Validate up front so that an unstranded interval gives a clear error
    # instead of an UnboundLocalError on ``seq`` further down.
    if strand not in ('+', '-'):
        raise ValueError("unsupported strand %r: expected '+' or '-'" %
                         (strand, ))

    region = genome[interval.chrom][interval.start:interval.stop].seq
    if strand == '-':
        region = region.reverse_complement()
    seq = str(region.upper())

    profile = [get_at_content(x) for x in sliding_window(seq, window)]
    return array2fixed_length(profile, 100)
예제 #8
0
def get_at_dict(genome, window, mask):
    """Build per-chromosome arrays of windowed AT content with masked ends.

    For each chromosome, windows of ``2 * window + 1`` items are slid over
    the sequence with ``mask - window`` items trimmed from both ends, and the
    resulting ``get_at_content`` values are padded with ``mask`` zeros on
    each side.

    Args:
        genome: Mapping from chromosome name to a sliceable sequence that
            supports ``len``.
        window: Half-width of the sliding window.
        mask: Number of zero-filled positions at each end of the profile.
            Presumably expected to be ``>= window``; smaller values are not
            handled specially.

    Returns:
        Dict mapping chromosome name to a ``numpy`` array of AT values.
    """
    at_dict = {}
    masked = mask - window
    frame = window * 2 + 1
    for chrom, seq in genome.items():
        # Use an explicit end index: ``seq[masked:-masked]`` collapses to an
        # empty slice when ``masked == 0`` (i.e. mask == window), because
        # ``-0`` is just ``0``.
        inner = seq[masked:len(seq) - masked]
        at = [get_at_content(x) for x in sliding_window(inner, frame)]
        at = [0] * mask + at + [0] * mask
        at_dict[chrom] = np.array(at)
    return at_dict
예제 #9
0
def transcript2upstream(interval, genome, window, lookup):
    """Return the windowed AT profile of the region upstream of a transcript.

    For the ``'+'`` strand the region is
    ``[interval.start - lookup - window, interval.start)`` and is
    reverse-complemented; for the ``'-'`` strand it is
    ``[interval.stop, interval.stop + lookup + window)`` and used as is.

    Args:
        interval: Object with ``chrom``, ``start``, ``stop`` and ``strand``.
        genome: Mapping from chromosome name to a sliceable record that
            supports ``len`` and whose slices expose ``.seq``.
        window: Size of the sliding window.
        lookup: Number of positions to look at beyond the window size.

    Returns:
        List of ``get_at_content`` values, one per sliding window, or
        ``None`` when the strand is neither ``'+'`` nor ``'-'``, when the
        region would start below 0 or reach the chromosome length, or when
        the extracted sequence is empty.
    """
    chrom = genome[interval.chrom]

    if interval.strand == '+':
        start = interval.start - lookup - window
        if start < 0:
            return None
        upstream = chrom[start:interval.start].seq.reverse_complement()
    elif interval.strand == '-':
        end = interval.stop + lookup + window
        if end >= len(chrom):
            return None
        upstream = chrom[interval.stop:end].seq
    else:
        return None

    seq = str(upstream.upper())
    if not seq:
        return None
    return [get_at_content(w) for w in sliding_window(seq, window)]
예제 #10
0
def max_at_mindistance_length(sequences, max_distance, length_range):
    """Tabulate the mean peak AT content by start offset and window length.

    For every offset ``d`` in ``range(0, max_distance - max(length_range))``
    and every window ``length`` in ``length_range``, each sequence is cut to
    ``seq[d:]`` and reduced to its maximum ``get_at_content`` over sliding
    windows; the mean over all sequences is stored in the result.

    Args:
        sequences: Collection of sliceable sequences.
        max_distance: Upper bound used to derive the range of offsets.
        length_range: Re-iterable collection of window lengths.

    Returns:
        2-D ``numpy`` array indexed by ``[offset index, length index]``.
    """
    offsets = range(0, max_distance - max(length_range))
    arr = np.zeros((len(list(offsets)), len(list(length_range))))

    for row, d in enumerate(offsets):
        suffixes = [s[d:] for s in sequences]
        for col, length in enumerate(length_range):
            peaks = [
                max([get_at_content(w) for w in sliding_window(s, length)])
                for s in suffixes
            ]
            arr[row, col] = np.mean(peaks)
    return arr
예제 #11
0
def get_at_rich_stretches(seq, anchor_length, minlength, maxgc_num,
                          minat_fraction):
    """Yield stretches of ``seq`` grown from anchor windows free of G and C.

    Windows of ``anchor_length`` items are slid over ``seq``.  A window that
    contains neither ``'G'`` nor ``'C'`` and whose position is not below
    ``upper_limit`` (the end of the last yielded stretch) is handed to
    ``get_extensions`` together with ``maxgc_num`` and ``minat_fraction``.
    If the returned span is at least ``minlength`` long it is yielded and
    ``upper_limit`` moves to its end.

    NOTE(review): the exact extension rules live in ``get_extensions``, which
    is not visible here; ``maxgc_num`` / ``minat_fraction`` are only passed
    through.

    Yields:
        Tuples ``(start, end, stretch, at_fraction, gc_count)`` where
        ``start``/``end`` are coordinates in ``seq``, ``stretch`` is
        ``seq[start:end]``, ``at_fraction`` is ``get_at_fraction(stretch)``
        and ``gc_count`` is the number of ``'G'`` plus ``'C'`` in it.
    """
    # End coordinate of the most recently yielded stretch; anchors that start
    # before it are ignored.
    upper_limit = 0
    for position, window in enumerate(sliding_window(seq, anchor_length)):
        if (position >= upper_limit and 'G' not in window
                and 'C' not in window):
            # get_extensions works on the remainder of the sequence, so the
            # anchor position is passed relative to upper_limit.
            start, end = get_extensions(seq[upper_limit:],
                                        position - upper_limit, anchor_length,
                                        maxgc_num, minat_fraction)
            # Clamp a negative start to the beginning of the remainder.
            start = max(start, 0)
            if (end - start >= minlength):
                # Convert back to coordinates of the full sequence.
                adstart = start + upper_limit
                adend = end + upper_limit
                upper_limit = adend
                lseq = seq[adstart:adend]
                yield (adstart, adend, lseq, get_at_fraction(lseq),
                       lseq.count('G') + lseq.count('C'))
예제 #12
0
파일: peaks.py 프로젝트: afilipch/afp
def local_generator(extended, wsize, scaled):
    """Yield ``(window, scaled)`` for each sliding window of ``extended``.

    Args:
        extended: Sequence passed to ``sliding_window``.
        wsize: Size of the sliding window.
        scaled: Value paired unchanged with every window.
    """
    yield from ((chunk, scaled) for chunk in sliding_window(extended, wsize))
예제 #13
0
def count_nmers(sequences, length):
    """Count in how many sequences each n-mer occurs.

    Each sequence contributes at most once per distinct n-mer, because its
    sliding windows are de-duplicated with ``set`` before counting.

    Args:
        sequences: Iterable of sequences passed to ``sliding_window``.
        length: Size of the n-mers.

    Returns:
        List of ``(nmer, count)`` pairs sorted by descending count.
    """
    occurrences = defaultdict(int)
    for sequence in sequences:
        distinct = set(sliding_window(sequence, length))
        for nmer in distinct:
            occurrences[nmer] += 1

    def by_count(pair):
        return pair[1]

    return sorted(occurrences.items(), key=by_count, reverse=True)
예제 #14
0
def get_gene_at_profile(interval, genome, window):
    """Return the windowed AT profile of a gene extended by half a window.

    The interval is widened by ``window // 2`` positions on both sides,
    upper-cased, and scanned with sliding windows of ``window + 1`` items.

    Args:
        interval: Object with ``chrom``, ``start`` and ``stop`` attributes.
        genome: Mapping from chromosome name to a sliceable record whose
            slices expose ``.seq``.
        window: Base window size.

    Returns:
        List of ``get_at_content`` values, one per sliding window.
    """
    half = window // 2
    record = genome[interval.chrom]
    lower = interval.start - half
    upper = interval.stop + half
    text = str(record[lower:upper].seq.upper())
    return [get_at_content(w) for w in sliding_window(text, window + 1)]
예제 #15
0

def get_flanks(window):
    """Yield the prefixes of ``window`` with lengths 1, 3, 5, ...

    One prefix is produced for each of the ``len(window) // 2`` leading
    steps, so the longest prefix is always shorter than ``window`` itself.
    """
    stop = len(window) // 2 * 2
    yield from (window[:end] for end in range(1, stop, 2))


def get_flanks_backward(window):
    """Yield the suffixes of ``window`` that start at offsets 2, 4, 6, ...

    One suffix is produced for each of the ``len(window) // 2`` steps; the
    last one may be empty when ``len(window)`` is even.
    """
    last_offset = len(window) // 2 * 2
    yield from (window[begin:] for begin in range(2, last_offset + 1, 2))


# Write one "<chrom>\t<position>\t<AT content>" line per position of every
# FASTA record: growing prefixes of the first window cover the leading edge,
# full windows cover the interior, and shrinking suffixes of the last window
# cover the trailing edge.
for seqrec in SeqIO.parse(args.path, 'fasta'):
    chrom = seqrec.id
    seq = seqrec.seq.upper()
    position = 0
    # Reset per record: if the record yields no windows (e.g. it is shorter
    # than ``wlen``), the trailing-flank step below must not reuse the last
    # window of the previous record or fail on an undefined name.
    window = None
    for count, window in enumerate(sliding_window(seq, wlen)):
        if count == 0:
            # Leading edge: partial windows before the first full one.
            for flank in get_flanks(window):
                sys.stdout.write("%s\t%d\t%1.5f\n" %
                                 (chrom, position, get_at_content(flank)))
                position += 1
        sys.stdout.write("%s\t%d\t%1.5f\n" %
                         (chrom, position, get_at_content(window)))
        position += 1

    if window is not None:
        # Trailing edge: partial windows after the last full one.
        for flank in get_flanks_backward(window):
            sys.stdout.write("%s\t%d\t%1.5f\n" %
                             (chrom, position, get_at_content(flank)))
            position += 1
예제 #16
0
                    required=True,
                    type=str,
                    help="Path to the output directory")
args = parser.parse_args()

kmer_range = range(args.kmer[0], args.kmer[1] + 1)

for size in kmer_range:
    kmers_count = defaultdict(int)
    norma = 0
    with open(os.path.join(args.outdir, 'kmer_%d.tsv' % size), 'w') as f:
        for seqrecord in SeqIO.parse(args.path, 'fasta'):
            seqlength = len(seqrecord)
            norma += 1
            curset = set()
            for kmer in sliding_window(seqrecord.seq, size):
                curset.add(kmer)
            if (args.reverse):
                for kmer in sliding_window(seqrecord.seq.reverse_complement(),
                                           size):
                    curset.add(kmer)
            for el in curset:
                kmers_count[el] += 1
        meanfr = (seqlength - size + 1) / (4**size)
        if (args.reverse):
            meanfr *= 2

        kmers_count = [(x[0], x[1] / norma) for x in kmers_count.items()]
        kmers_count.sort(key=lambda x: x[1], reverse=True)
        for kmer, fraction in kmers_count[:args.top]:
            f.write("%s\t%f\t%1.2f\n" %
예제 #17
0
def smooth_coverage(coverage, flen):
    """Smooth ``coverage`` with a moving mean and pad it at both ends.

    Means are taken over sliding windows of ``2 * flen + 1`` items; the first
    and the last mean are each repeated ``flen`` times at the respective end.

    Args:
        coverage: Sequence of numeric values passed to ``sliding_window``.
        flen: Half-width of the smoothing window.

    Returns:
        ``numpy`` array of the padded window means.
    """
    width = 2 * flen + 1
    means = [np.mean(chunk) for chunk in sliding_window(coverage, width)]
    left_pad = [means[0]] * flen
    right_pad = [means[-1]] * flen
    return np.array(left_pad + means + right_pad)