def ProcessSeqs(SEQ_HANDLE, PWMS, THRESHOLD, WANT_REV=False, bg=None):
    """Generate one record per PWM hit found in the intervals of SEQ_HANDLE.

    Every interval produced by ``ReadInterval(SEQ_HANDLE)`` is scanned with
    ``MOODS.search`` against all matrices at once; each ``(position, score)``
    pair whose score is strictly greater than the matrix threshold is yielded
    as a dictionary.

    Args:
        SEQ_HANDLE: Passed unchanged to ``ReadInterval``.
        PWMS: Collection of pairs; element ``[0]`` is used as the PWM name
            and element ``[1]`` as the matrix.
        THRESHOLD: Third argument of ``MOODS.threshold_from_p`` for every
            matrix (presumably a p-value -- confirm against the MOODS docs).
        WANT_REV: Forwarded to ``MOODS.search`` as ``both_strands``.
        bg: Background forwarded to ``MOODS.threshold_from_p`` and
            ``MOODS.search``.

    Yields:
        dict with keys NAME, START, END, STRAND, PWM, SCORE, CHROM and SEQ.
        START/END are the interval START (as int) shifted by the hit
        position; SEQ is the upper-cased slice of the interval sequence.
    """
    names = [entry[0] for entry in PWMS]
    matrices = [entry[1] for entry in PWMS]
    cutoffs = [MOODS.threshold_from_p(matrix, bg, THRESHOLD)
               for matrix in matrices]

    for interval in ReadInterval(SEQ_HANDLE):
        print(interval['NAME'])
        hits_per_pwm = MOODS.search(
            interval['SEQ'].upper(),
            matrices,
            cutoffs,
            both_strands=WANT_REV,
            algorithm='lf',
            absolute_threshold=True,
            bg=bg,
        )
        for hits, name, matrix, cutoff in zip(hits_per_pwm, names,
                                              matrices, cutoffs):
            # Motif width = number of columns in the matrix.
            width = len(matrix[0])
            for position, score in hits:
                if score > cutoff:
                    # NOTE(review): position is used as reported; if MOODS
                    # flags reverse-strand hits with negative positions when
                    # WANT_REV is true, START/END/SEQ would need adjusting
                    # -- confirm.
                    start = int(interval['START']) + position
                    yield {
                        'NAME': interval['NAME'],
                        'START': start,
                        'END': start + width,
                        'STRAND': interval['STRAND'],
                        'PWM': name,
                        'SCORE': score,
                        'CHROM': interval['CHROM'],
                        'SEQ': interval['SEQ'][position:position + width].upper(),
                    }
                else:
                    print('got bad result')
def _parse_cli_options(args):
    """Return the run settings after applying any -f/-p/-o/-t flags in *args*."""
    settings = {
        'fasta': 'FASTA_All_Merged_Encode.fasta',
        'pwm_dir': '/N/u/jubudka/Mason/PWMsmall/',
        'out_dir': '/N/u/jubudka/Mason/BindingFiles/',
        'p_val': 0.0001,
    }
    for index, flag in enumerate(args):
        if flag not in ('-f', '-p', '-o', '-t'):
            continue
        if index + 1 >= len(args):
            # Previously this surfaced as a bare IndexError from args[i+1].
            raise ValueError("option %s requires a value" % flag)
        value = args[index + 1]
        if flag == '-f':
            settings['fasta'] = value
            print("Fasta file is:  %s" % (value,))
        elif flag == '-p':
            settings['pwm_dir'] = value
            print("PWM file is:  %s" % (value,))
        elif flag == '-o':
            settings['out_dir'] = value
            print("Output file is:  %s" % (value,))
        else:
            settings['p_val'] = float(value)
    return settings


def _iter_fasta_records(handle):
    """Yield (header, sequence) for every FASTA record that has sequence data."""
    header = None
    chunks = []
    for raw_line in handle:
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('>'):
            if header is not None and chunks:
                yield header, ''.join(chunks)
            header = line[1:]
            chunks = []
        elif header is None:
            raise ValueError("sequence data found before the first FASTA header")
        else:
            chunks.append(line)
    if header is not None and chunks:
        yield header, ''.join(chunks)


def _load_sequences(fasta_path):
    """Read *fasta_path*; return (both-strand sequences, base counts, total length)."""
    sequences = {}
    base_counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    total_length = 0
    with open(fasta_path) as handle:
        for seq_id, raw_sequence in _iter_fasta_records(handle):
            sequence = raw_sequence.upper()
            # 'p' = sequence as read, 'm' = its reverse complement.
            sequences[seq_id + ' p'] = sequence
            sequences[seq_id + ' m'] = reverse_complement(sequence)
            for base in base_counts:
                base_counts[base] += sequence.count(base)
            total_length += len(sequence)
    return sequences, base_counts, total_length


def ProcessCLI(args):
    """Scan every FASTA sequence with every ``.pfm`` matrix in a directory.

    Recognised flags in *args* (each followed by its value):
        -f  FASTA file to read      (default 'FASTA_All_Merged_Encode.fasta')
        -p  directory of .pfm files (default '/N/u/jubudka/Mason/PWMsmall/')
        -o  output directory, created if missing
            (default '/N/u/jubudka/Mason/BindingFiles/')
        -t  value passed to ``MOODS.threshold_from_p`` (default 0.0001)

    The FASTA file is loaded into a dictionary keyed ``'<id> p'`` (upper-cased
    sequence) and ``'<id> m'`` (its ``reverse_complement``).  A/C/G/T
    frequencies over the total sequence length form the background used to
    convert each matrix to log-odds and to derive its threshold.  Matrices
    with 18 or more columns go to ``matrix_mapper_long``; all others go to
    ``matrix_mapper``.

    Headers that are not followed by any sequence lines are skipped, so a
    sequence is always stored under its own header.

    Args:
        args: List of command-line tokens.

    Raises:
        ValueError: A flag is the last token and has no value, sequence data
            appears before the first FASTA header, or the FASTA file holds
            no sequence data at all.
    """
    print(args)
    settings = _parse_cli_options(args)
    outputDirectory = settings['out_dir']
    weightMatrixDirectory = settings['pwm_dir']
    p_val = settings['p_val']

    if not os.path.exists(outputDirectory):
        os.makedirs(outputDirectory)

    sequences, base_counts, total_length = _load_sequences(settings['fasta'])
    if total_length == 0:
        raise ValueError("no sequence data found in %s" % settings['fasta'])

    # order is A C G T
    bg = [base_counts[base] / float(total_length) for base in 'ACGT']
    print(bg)

    matrix_names = [filename for filename in os.listdir(weightMatrixDirectory)
                    if filename.endswith('.pfm')]
    pseudocount = 1
    # os.path.join keeps this working when -p is given without a trailing slash.
    matrices = [MOODS.load_matrix(os.path.join(weightMatrixDirectory, filename))
                for filename in matrix_names]
    matrices = [MOODS.count_log_odds(matrix, bg, pseudocount)
                for matrix in matrices]
    thresholds = [MOODS.threshold_from_p(matrix, bg, p_val)
                  for matrix in matrices]

    for matrix, matrix_name, threshold in zip(matrices, matrix_names, thresholds):
        motifLength = len(matrix[0])
        if motifLength >= 18:
            matrix_mapper_long(matrix, matrix_name, threshold,
                               outputDirectory, sequences)
        else:
            matrix_mapper(matrix, matrix_name, threshold,
                          outputDirectory, sequences)
    print("Finished")
def _write_motif_matches(seq, hits, width, out_path):
    """Append the *width*-long window of *seq* at each hit position to *out_path*."""
    with open(out_path, 'a') as out:
        for hit in hits:
            pos = hit[0]
            # A negative position is shifted by len(seq) and the window is
            # passed through complement().
            negative = pos < 0
            if negative:
                pos += len(seq)
            window = seq[pos:pos + width]
            if negative:
                window = complement(window)
            out.write('%s\n' % window)


def count(enhlist, c, p_val):
    """Find GA/CA/GC/TA dinucleotide-repeat matches in the leading sequence.

    The sequence is built from the items of *enhlist* up to (not including)
    the first item that starts with '>', with newline characters removed.

    Args:
        enhlist: Iterable of text lines.
        c: Search mode.
            "default" -- ``MOODS.search`` on the raw matrices with *p_val*.
            "human" / "equal" -- matrices are converted with
            ``MOODS.count_log_odds`` against a fixed background (the
            hard-coded ``BG_hum`` values or uniform 0.25) and searched with
            thresholds from ``MOODS.threshold_from_p``.
            anything else -- case-insensitive regular expressions that match
            three or more consecutive repeats of each dinucleotide on either
            strand (e.g. GAGAGA / TCTCTC).
        p_val: Passed to MOODS as described above; unused in regex mode.

    Returns:
        list: four per-motif result collections in the order GA, CA, GC, TA
        (MOODS results, or lists of matched strings in regex mode), followed
        by the length of the scanned sequence.
    """
    # Manual debugging switch: set to True to append the matched windows of
    # each motif to dated *_matches_* files (MOODS modes only).
    write_match = False

    leading = []
    for line in enhlist:
        if line.startswith('>'):
            break
        leading.append(line)
    # Join once instead of concatenating and re-scanning on every line.
    seq = ''.join(leading).replace('\n', '')

    pseudocount = 0.001
    # Rows are presumably ordered A, C, G, T (consistent with the
    # ga/ca/gc/ta output names) -- confirm.
    # starrmot15
    # caca = [[49,0,100,0,91,0,100,0],[1,83,0,88,0,49,0,94],[49,1,0,12,0,9,0,6],[0,16,0,0,9,41,0,0,]]
    caca = [[0, 1, 0, 1, 0, 1], [1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
    me137 = [[0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0]]
    gcgc = [[0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 1], [1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0]]
    tata = [[0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 0]]
    matrices = [me137, caca, gcgc, tata]

    if c == "default":
        results = MOODS.search(seq, matrices, p_val, both_strands=True)
    elif c == "human" or c == "equal":
        if c == "human":
            background = [0.29508855202553025, 0.20466109233964447,
                          0.20478482916547036, 0.2954655264693549]
        else:
            background = [0.25, 0.25, 0.25, 0.25]
        scored = [MOODS.count_log_odds(matrix, background, pseudocount)
                  for matrix in matrices]
        thresholds = [MOODS.threshold_from_p(matrix, background, p_val)
                      for matrix in scored]
        results = MOODS.search(seq, scored, thresholds,
                               convert_log_odds=False,
                               threshold_from_p=False,
                               both_strands=True)
    else:
        # Regex results are plain strings, not (position, score) hits, so
        # the match-writing step below cannot be applied to them.
        write_match = False
        repeat_pairs = (
            (r'GAGA(?:GA)+', r'TCTC(?:TC)+'),
            (r'CACA(?:CA)+', r'TGTG(?:TG)+'),
            (r'GCGC(?:GC)+', r'CGCG(?:CG)+'),
            (r'TATA(?:TA)+', r'ATAT(?:AT)+'),
        )
        results = [
            re.findall(forward, seq, re.IGNORECASE)
            + re.findall(reverse, seq, re.IGNORECASE)
            for forward, reverse in repeat_pairs
        ]

    # write files containing the matches for each motif as found by MOODS
    # only if boolean set at top of count() is True
    if write_match:
        stamp = time.strftime("%Y-%m-%d")
        motifs = (('ga', me137), ('ca', caca), ('gc', gcgc), ('ta', tata))
        for (tag, raw_matrix), hits in zip(motifs, results):
            out_path = '%s%s_matches_%d.txt' % (stamp, tag, 1 / p_val)
            _write_motif_matches(seq, hits, len(raw_matrix[1]), out_path)

    results.append(len(seq))
    return results