def get_2d_best_separation(sequences1, sequences2, max_distance, length_range):
    """Score how well two sequence sets separate over a 2D parameter grid.

    The grid is spanned by a prefix length ``d`` taken from
    ``range(max(length_range), max_distance)`` and a window ``length`` taken
    from ``length_range``. For each cell every sequence is cut to its first
    ``d`` items and summarised by the largest ``get_at_content`` value over
    its ``sliding_window(seq, length)`` windows, multiplied by 1000 and
    truncated with ``int``. The two summaries are passed to
    ``separation_score``.

    Returns:
        ``(arr, data)`` where ``arr[i, j]`` holds the score for the i-th
        prefix length and j-th window length, and ``data[(i, j)]`` is the
        tuple ``(d, length, threshold, passed1, passed2)``.
    """

    def scaled_maxima(prefixes, width):
        # One integer per sequence: the best window value scaled by 1000.
        return [
            int(max([get_at_content(w)
                     for w in sliding_window(prefix, width)]) * 1000)
            for prefix in prefixes
        ]

    # Starting at the largest window length keeps every prefix at least as
    # long as any window that will be slid over it.
    distances = range(max(length_range), max_distance)
    scores = np.zeros((len(distances), len(list(length_range))))
    details = {}
    for row, cutoff in enumerate(distances):
        prefixes1 = [s[:cutoff] for s in sequences1]
        prefixes2 = [s[:cutoff] for s in sequences2]
        for col, width in enumerate(length_range):
            values1 = scaled_maxima(prefixes1, width)
            values2 = scaled_maxima(prefixes2, width)
            threshold, score, passed1, passed2 = separation_score(
                values1, values2)
            scores[row, col] = score
            details[(row, col)] = (cutoff, width, threshold, passed1, passed2)
    return scores, details
def get_pam_sequences(seqrecord, chrname):
    """Print a FASTA-style record for every "CC"-anchored site on both strands.

    The forward pass looks for the pair ``("C", "C")`` among the 2-item
    windows of ``seqrecord`` and prints the slice of ``PAM_LENGTH + 2`` items
    starting at the hit. The reverse pass does the same on
    ``seqrecord.reverse_complement()``, maps the hit back to coordinates of
    the original sequence and prints the reverse complement of that slice.
    Slices that come out shorter than ``PAM_LENGTH + 2`` (hits too close to
    an end) are skipped.

    Each record is printed as ``>chrname|start|stop|strand`` followed by the
    sequence on the next line. Nothing is returned.
    """
    site_length = PAM_LENGTH + 2
    total_length = len(seqrecord)
    pam = ("C", "C")

    # Forward strand: the site begins at the hit itself.
    for offset, dinucleotide in enumerate(sliding_window(seqrecord, 2)):
        if dinucleotide != pam:
            continue
        begin = offset
        finish = begin + site_length
        site = str(seqrecord[begin:finish])
        if len(site) == site_length:
            print(">%s|%d|%d|+\n%s" % (chrname, begin, finish, site))

    # Reverse strand: offsets are measured on the reverse complement, so they
    # are converted back to coordinates on the original sequence.
    flipped = seqrecord.reverse_complement()
    for offset, dinucleotide in enumerate(sliding_window(flipped, 2)):
        if dinucleotide != pam:
            continue
        begin = total_length - offset - site_length
        finish = begin + site_length
        site = str(seqrecord[begin:finish].reverse_complement())
        if len(site) == site_length:
            print(">%s|%d|%d|-\n%s" % (chrname, begin, finish, site))
def get_at_flanks_max(interval, genome, flank, window):
    """Average the best windowed ``get_at_content`` value of the two flanks.

    The left flank is the ``flank`` items before ``interval.start`` and the
    right flank is the ``flank`` items after ``interval.stop``, both taken
    from ``genome[interval.chrom]``. Each flank is scanned with windows of
    ``2 * window + 1`` items and its maximum value is kept.

    Returns:
        The mean of the two per-flank maxima.
    """
    width = 2 * window + 1
    chromosome = genome[interval.chrom]

    def flank_peak(begin, finish):
        # Highest window value inside chromosome[begin:finish].
        segment = chromosome[begin:finish].seq
        return max([get_at_content(w) for w in sliding_window(segment, width)])

    left_peak = flank_peak(interval.start - flank, interval.start)
    right_peak = flank_peak(interval.stop, interval.stop + flank)
    return (left_peak + right_peak) / 2
def detect_peaks(signal):
    """Collect ``(start, top, end, height)`` tuples for positive local maxima.

    The signal is scanned in 3-item windows. For the window at index ``i``
    with items ``(left, mid, right)``:

    * a strict local minimum at ``mid`` closes an open peak at ``i + 1`` and
      moves the next start to ``i + 1``;
    * a sign change between ``mid`` and ``right`` closes an open peak at
      ``i + 2``, or, when no peak is open, moves the start to ``i + 2``;
    * a strict local maximum at ``mid`` with ``mid > 0`` opens a peak whose
      top is ``i + 1`` and whose height is ``mid``.

    A peak that is still open when the signal ends is not reported.
    """
    peaks = []
    open_peak = False
    begin = 0
    for index, (left, mid, right) in enumerate(sliding_window(signal, 3)):
        # Local minimum: finish any open peak, then restart from here.
        if left > mid < right:
            if open_peak:
                peaks.append((begin, apex, index + 1, apex_value))
                open_peak = False
            begin = index + 1

        # Zero crossing between the middle and right items.
        if mid * right < 0:
            if open_peak:
                peaks.append((begin, apex, index + 2, apex_value))
                open_peak = False
            else:
                begin = index + 2

        # Positive local maximum: remember it as the current peak's top.
        if left < mid > right and mid > 0:
            apex = index + 1
            apex_value = mid
            open_peak = True
    return peaks
def get_at_profile(regions, genome):
    """Average the windowed ``get_at_content`` profiles of several regions.

    Every region is extended by 10 items on each side, upper-cased and
    scanned with 20-item windows. The per-region profiles are stacked into
    a NumPy array and averaged along axis 0.

    Returns:
        The result of ``mean(axis=0)`` over the stacked profiles.
    """
    profiles = []
    for region in regions:
        padded = genome[region.chrom][region.start - 10:region.stop + 10]
        letters = str(padded.seq.upper())
        profiles.append(
            [get_at_content(w) for w in sliding_window(letters, 20)])
    stacked = np.array(profiles)
    return stacked.mean(axis=0)
def get_peak_at_content_max(regions, genome, at_length):
    """Return, per region, the highest ``get_at_content`` over its windows.

    Each region's upper-cased sequence
    ``genome[region.chrom][region.start:region.stop]`` is scanned with
    windows of ``at_length`` items.

    Returns:
        A list with one maximum per region, in input order.
    """

    def region_peak(region):
        # Upper-case sequence of the region as a plain string.
        letters = str(
            genome[region.chrom][region.start:region.stop].seq.upper())
        return max(
            [get_at_content(w) for w in sliding_window(letters, at_length)])

    return [region_peak(region) for region in regions]
def transcript_content(interval, genome, window):
    """Return a fixed-length ``get_at_content`` profile along a transcript.

    The sequence ``genome[interval.chrom][interval.start:interval.stop]`` is
    upper-cased, and reverse-complemented first when the interval is on the
    '-' strand. It is then scanned with windows of ``window`` items and the
    profile is passed through ``array2fixed_length(profile, 100)``.

    Args:
        interval: object exposing ``chrom``, ``start``, ``stop``, ``strand``.
        genome: mapping from chromosome name to a sliceable record with a
            ``.seq`` attribute.
        window: window size handed to ``sliding_window``.

    Returns:
        Whatever ``array2fixed_length(profile, 100)`` returns.

    Raises:
        ValueError: if ``interval.strand`` is neither '+' nor '-'. Previously
            this case failed with an ``UnboundLocalError`` because no branch
            assigned the sequence.
    """
    strand = interval.strand
    # Validate before touching the genome so the failure is explicit.
    if strand not in ('+', '-'):
        raise ValueError(
            "unsupported strand %r; expected '+' or '-'" % (strand,))

    region = genome[interval.chrom][interval.start:interval.stop].seq
    if strand == '-':
        region = region.reverse_complement()
    seq = str(region.upper())

    profile = [get_at_content(x) for x in sliding_window(seq, window)]
    return array2fixed_length(profile, 100)
def get_at_dict(genome, window, mask):
    """Build a zero-padded array of windowed ``get_at_content`` per chromosome.

    For every ``(chrom, seq)`` pair in ``genome``, ``mask - window`` items
    are trimmed from each end of ``seq`` and the remainder is scanned with
    windows of ``2 * window + 1`` items. ``mask`` zeros are then added on
    both sides of the resulting values.

    Args:
        genome: mapping from chromosome name to a sliceable sequence.
        window: half-width of the scanning window.
        mask: number of zero entries placed at each end of the output.

    Returns:
        dict mapping chromosome name to a NumPy array of the padded values.
    """
    at_dict = {}
    masked = mask - window
    frame = window * 2 + 1
    # When mask == window, ``-masked`` is 0 and ``seq[0:0]`` would be empty,
    # silently dropping the whole chromosome. Leave the slice open-ended in
    # that case; every other value keeps its original meaning.
    tail = -masked or None
    for chrom, seq in genome.items():
        at = [
            get_at_content(x)
            for x in sliding_window(seq[masked:tail], frame)
        ]
        at = [0] * mask + at + [0] * mask
        at_dict[chrom] = np.array(at)
    return at_dict
def transcript2upstream(interval, genome, window, lookup):
    """Return the windowed ``get_at_content`` profile upstream of a transcript.

    For a '+' interval the ``lookup + window`` items before
    ``interval.start`` are taken and reverse-complemented; for a '-'
    interval the ``lookup + window`` items after ``interval.stop`` are taken
    as they are. Both are upper-cased before scanning with windows of
    ``window`` items.

    Returns:
        A list of per-window values, or ``None`` when the strand is neither
        '+' nor '-', when the stretch would start before position 0 or
        reach the end of the chromosome, or when the sequence is empty.
    """
    chromosome = genome[interval.chrom]
    span = lookup + window
    upstream = None

    if interval.strand == '+':
        begin = interval.start - span
        if begin >= 0:
            piece = chromosome[begin:interval.start].seq
            upstream = str(piece.reverse_complement().upper())
    elif interval.strand == '-':
        finish = interval.stop + span
        if finish < len(chromosome):
            upstream = str(chromosome[interval.stop:finish].seq.upper())

    if not upstream:
        return None
    return [get_at_content(w) for w in sliding_window(upstream, window)]
def max_at_mindistance_length(sequences, max_distance, length_range):
    """Tabulate the mean best window value over offsets and window lengths.

    For every offset ``d`` in ``range(0, max_distance - max(length_range))``
    the first ``d`` items of each sequence are dropped. For every window
    ``length`` in ``length_range`` each trimmed sequence is summarised by
    its largest ``get_at_content`` window value, and the summaries are
    averaged with ``np.mean``.

    Returns:
        A 2D NumPy array indexed as ``[offset index, length index]``.
    """
    offsets = range(0, max_distance - max(length_range))
    table = np.zeros((len(offsets), len(list(length_range))))
    for row, offset in enumerate(offsets):
        trimmed = [s[offset:] for s in sequences]
        for col, width in enumerate(length_range):
            peaks = [
                max([get_at_content(w) for w in sliding_window(s, width)])
                for s in trimmed
            ]
            table[row, col] = np.mean(peaks)
    return table
def get_at_rich_stretches(seq, anchor_length, minlength, maxgc_num,
                          minat_fraction):
    """Yield stretches grown from anchors that contain neither 'G' nor 'C'.

    ``seq`` is scanned with windows of ``anchor_length`` items. A window
    with no 'G' and no 'C' that starts at or after the end of the previous
    reported stretch is passed to ``get_extensions``, which works on the
    part of ``seq`` after that previous end. Stretches shorter than
    ``minlength`` are dropped.

    Yields:
        ``(start, end, stretch, get_at_fraction(stretch), gc_count)`` with
        coordinates relative to ``seq`` and ``gc_count`` the number of 'G'
        plus 'C' in the stretch.
    """
    consumed = 0  # end of the last reported stretch, in seq coordinates
    for index, anchor in enumerate(sliding_window(seq, anchor_length)):
        if index < consumed or 'G' in anchor or 'C' in anchor:
            continue

        # get_extensions sees only the unconsumed tail, so its coordinates
        # are relative to `consumed`.
        left, right = get_extensions(seq[consumed:], index - consumed,
                                     anchor_length, maxgc_num, minat_fraction)
        left = max(left, 0)
        if right - left < minlength:
            continue

        abs_start = left + consumed
        abs_end = right + consumed
        consumed = abs_end
        stretch = seq[abs_start:abs_end]
        gc_count = stretch.count('G') + stretch.count('C')
        yield (abs_start, abs_end, stretch, get_at_fraction(stretch), gc_count)
def local_generator(extended, wsize, scaled):
    """Yield ``(window, scaled)`` for each window of ``extended``.

    Windows come from ``sliding_window(extended, wsize)``; the same
    ``scaled`` object is paired with every one of them.
    """
    yield from ((chunk, scaled) for chunk in sliding_window(extended, wsize))
def count_nmers(sequences, length):
    """Count in how many sequences each n-mer occurs.

    An n-mer is any item produced by ``sliding_window(seq, length)``. It is
    counted at most once per sequence, however often it repeats inside it.

    Returns:
        A list of ``(nmer, count)`` pairs sorted by count, highest first.
    """
    occurrences = defaultdict(int)
    for sequence in sequences:
        # A set collapses repeats within a single sequence.
        distinct = set(sliding_window(sequence, length))
        for nmer in distinct:
            occurrences[nmer] += 1

    ranked = list(occurrences.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
def get_gene_at_profile(interval, genome, window):
    """Return the windowed ``get_at_content`` profile across an interval.

    The interval is extended by ``window // 2`` items on both sides,
    upper-cased and scanned with windows of ``window + 1`` items.

    Returns:
        A list with one value per window.
    """
    half = window // 2
    chromosome = genome[interval.chrom]
    extended = chromosome[interval.start - half:interval.stop + half].seq
    letters = str(extended.upper())
    return [get_at_content(w) for w in sliding_window(letters, window + 1)]
def get_flanks(window):
    """Yield growing prefixes of ``window`` with lengths 1, 3, 5, ...

    ``len(window) // 2`` prefixes are produced.
    """
    half = int(len(window) / 2)
    for step in range(half):
        yield window[:2 * step + 1]


def get_flanks_backward(window):
    """Yield shrinking suffixes of ``window`` starting at offsets 2, 4, 6, ...

    ``len(window) // 2`` suffixes are produced.
    """
    half = int(len(window) / 2)
    for step in range(half):
        yield window[2 * step + 2:]


# Write one "chrom<TAB>position<TAB>value" line per position of every FASTA
# record: growing prefixes of the first window, then every full window, then
# shrinking suffixes of the last window.
for seqrec in SeqIO.parse(args.path, 'fasta'):
    chrom = seqrec.id
    seq = seqrec.seq.upper()
    position = 0
    for count, window in enumerate(sliding_window(seq, wlen)):
        if not count:
            # Leading edge: only the first window contributes prefixes.
            for flank in get_flanks(window):
                sys.stdout.write(
                    "%s\t%d\t%1.5f\n" % (chrom, position,
                                         get_at_content(flank)))
                position += 1
        sys.stdout.write(
            "%s\t%d\t%1.5f\n" % (chrom, position, get_at_content(window)))
        position += 1
    else:
        # Trailing edge: runs once after the loop, on the last window seen.
        for flank in get_flanks_backward(window):
            sys.stdout.write(
                "%s\t%d\t%1.5f\n" % (chrom, position, get_at_content(flank)))
            position += 1
required=True, type=str, help="Path to the output directory") args = parser.parse_args() kmer_range = range(args.kmer[0], args.kmer[1] + 1) for size in kmer_range: kmers_count = defaultdict(int) norma = 0 with open(os.path.join(args.outdir, 'kmer_%d.tsv' % size), 'w') as f: for seqrecord in SeqIO.parse(args.path, 'fasta'): seqlength = len(seqrecord) norma += 1 curset = set() for kmer in sliding_window(seqrecord.seq, size): curset.add(kmer) if (args.reverse): for kmer in sliding_window(seqrecord.seq.reverse_complement(), size): curset.add(kmer) for el in curset: kmers_count[el] += 1 meanfr = (seqlength - size + 1) / (4**size) if (args.reverse): meanfr *= 2 kmers_count = [(x[0], x[1] / norma) for x in kmers_count.items()] kmers_count.sort(key=lambda x: x[1], reverse=True) for kmer, fraction in kmers_count[:args.top]: f.write("%s\t%f\t%1.2f\n" %
def smooth_coverage(coverage, flen):
    """Smooth ``coverage`` with a mean over windows of ``2 * flen + 1`` items.

    The first and last window means are each repeated ``flen`` times at the
    ends, so the edges are padded with the nearest computed value.

    Returns:
        A NumPy array of the padded window means.
    """
    width = 2 * flen + 1
    means = [np.mean(chunk) for chunk in sliding_window(coverage, width)]
    left_pad = [means[0]] * flen
    right_pad = [means[-1]] * flen
    return np.array(left_pad + means + right_pad)