Example #1
0
def ProcessSeqs(SEQ_HANDLE, PWMS, THRESHOLD, WANT_REV=False, bg=None):
    """Yield PWM matches for every interval read from SEQ_HANDLE.

    Arguments:
    SEQ_HANDLE -- passed unchanged to ReadInterval(); each interval it
                  yields must provide the keys 'NAME', 'SEQ', 'START',
                  'STRAND' and 'CHROM'.
    PWMS       -- iterable of (name, matrix) pairs; the motif width is
                  taken from len(matrix[0]).
    THRESHOLD  -- p-value handed to MOODS.threshold_from_p() to derive one
                  absolute score threshold per matrix.
    WANT_REV   -- forwarded to MOODS.search() as both_strands.
    bg         -- background distribution forwarded to MOODS.

    Yields one dict per hit whose score is strictly above the matrix
    threshold, with keys 'NAME', 'START', 'END', 'STRAND', 'PWM', 'SCORE',
    'CHROM' and 'SEQ' (the upper-cased matched site).  'START'/'END' are
    the interval start plus the hit offset; 'STRAND' is copied from the
    interval as-is.

    Side effects: prints each interval name, and 'got bad result' for any
    hit that does not exceed its threshold.
    """
    # Materialise once so PWMS may be a one-shot iterable and so the derived
    # lists can be reused for every interval (map() is lazy on Python 3).
    pwm_pairs = list(PWMS)
    pwm_names = [pair[0] for pair in pwm_pairs]
    pwm_mats = [pair[1] for pair in pwm_pairs]
    thresh = [MOODS.threshold_from_p(mat, bg, THRESHOLD) for mat in pwm_mats]

    for interval in ReadInterval(SEQ_HANDLE):
        print(interval['NAME'])

        sequence = interval['SEQ'].upper()
        seq_len = len(sequence)
        interval_start = int(interval['START'])

        results = MOODS.search(sequence,
                               pwm_mats,
                               thresh,
                               both_strands=WANT_REV,
                               algorithm='lf',
                               absolute_threshold=True,
                               bg=bg)

        for res, pwm_name, pwm_mat, th in zip(results, pwm_names, pwm_mats,
                                              thresh):
            width = len(pwm_mat[0])
            for position, score in res:
                if score > th:
                    # With both_strands=True MOODS reports reverse-strand
                    # hits as (position - len(sequence)), i.e. negative.
                    # Fold those back to a forward offset so START/END stay
                    # inside the interval and the slice below can never
                    # collapse to '' when position + width == 0.
                    # NOTE(review): STRAND is still the interval's strand,
                    # not the strand the hit was found on -- confirm that is
                    # what downstream consumers expect.
                    if position < 0:
                        position += seq_len
                    yield {
                        'NAME': interval['NAME'],
                        'START': interval_start + position,
                        'END': interval_start + width + position,
                        'STRAND': interval['STRAND'],
                        'PWM': pwm_name,
                        'SCORE': score,
                        'CHROM': interval['CHROM'],
                        'SEQ': sequence[position:(position + width)]
                    }
                else:
                    print('got bad result')
Example #2
0
def ProcessCLI(args):
    """Scan FASTA sequences with every '.pfm' matrix found in a directory.

    Recognised options in *args*, each followed by its value:
      -f  FASTA file to read
      -p  directory containing the '.pfm' matrix files
      -o  output directory (created when it does not exist)
      -t  p-value used to derive the per-matrix score threshold (float)
    Any other entry is ignored; options that are not given keep the
    hard-coded defaults below.

    Every FASTA record is stored twice: under '<id> p' (upper-cased
    sequence) and '<id> m' (its reverse_complement()).  The background
    distribution is the A/C/G/T frequency of the forward sequences.  Each
    matrix is converted to log-odds and handed, together with its threshold,
    the output directory and the sequences, to matrix_mapper_long() when the
    motif is at least 18 columns wide, otherwise to matrix_mapper().

    Raises ValueError when an option is the last entry and therefore has no
    value, or when the FASTA file contains no sequence data.
    """
    output_dir = '/N/u/jubudka/Mason/BindingFiles/'
    pwm_dir = '/N/u/jubudka/Mason/PWMsmall/'
    fasta_path = 'FASTA_All_Merged_Encode.fasta'
    p_val = 0.0001

    print(args)
    for i, arg in enumerate(args):
        if arg not in ('-f', '-p', '-o', '-t'):
            continue
        if i + 1 >= len(args):
            raise ValueError('option %s requires a value' % arg)
        value = args[i + 1]
        # The doubled space reproduces the historical output of
        # `print "label: ", value`.
        if arg == '-f':
            fasta_path = value
            print("Fasta file is:  %s" % fasta_path)
        elif arg == '-p':
            pwm_dir = value
            print("PWM file is:  %s" % pwm_dir)
        elif arg == '-o':
            output_dir = value
            print("Output file is:  %s" % output_dir)
        else:
            p_val = float(value)

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    sequences = {}
    base_counts = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    total_length = 0

    with open(fasta_path) as fasta_file:
        for seq_id, sequence in _iter_fasta_records(fasta_file):
            sequences[seq_id + ' ' + 'p'] = sequence
            sequences[seq_id + ' ' + 'm'] = reverse_complement(sequence)
            for base in base_counts:
                base_counts[base] += sequence.count(base)
            total_length += len(sequence)

    if total_length == 0:
        raise ValueError('no sequence data found in %s' % fasta_path)

    # Order is A C G T.
    # NOTE: frequencies are relative to the full sequence length, so they sum
    # to less than 1 when the input contains other symbols (e.g. N).
    bg = [base_counts[base] / float(total_length) for base in 'ACGT']
    print(bg)

    matrix_names = [name for name in os.listdir(pwm_dir)
                    if name.endswith('.pfm')]
    pseudocount = 1

    # os.path.join keeps this working whether or not the directory was given
    # with a trailing separator.
    matrices = [MOODS.load_matrix(os.path.join(pwm_dir, name))
                for name in matrix_names]
    matrices = [MOODS.count_log_odds(matrix, bg, pseudocount)
                for matrix in matrices]
    thresholds = [MOODS.threshold_from_p(matrix, bg, p_val)
                  for matrix in matrices]

    for matrix, matrix_name, threshold in zip(matrices, matrix_names,
                                              thresholds):
        motif_length = len(matrix[0])
        if motif_length >= 18:
            matrix_mapper_long(matrix, matrix_name, threshold, output_dir,
                               sequences)
        else:
            matrix_mapper(matrix, matrix_name, threshold, output_dir,
                          sequences)

    print("Finished")


def _iter_fasta_records(handle):
    """Yield (record id, upper-cased sequence) for each FASTA record.

    The id is the header line without its leading '>'.  Blank lines are
    skipped.  Records with no sequence lines, and sequence lines that appear
    before the first header, are dropped so that an id can never be paired
    with another record's sequence.
    """
    current_id = None
    chunks = []
    for raw_line in handle:
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('>'):
            if current_id is not None and chunks:
                yield current_id, ''.join(chunks).upper()
            current_id = line[1:]
            chunks = []
        elif current_id is not None:
            chunks.append(line)
    # Flush the final record, which has no following header to trigger it.
    if current_id is not None and chunks:
        yield current_id, ''.join(chunks).upper()
Example #3
0
def count(enhlist, c, p_val, write_match=False):
    """Find dinucleotide-repeat motif hits in the leading sequence of enhlist.

    The sequence is the concatenation of the entries of *enhlist* up to (not
    including) the first entry that starts with '>', with newlines removed.

    Arguments:
    enhlist     -- list of sequence lines.
    c           -- search mode:
                   "default": MOODS.search() on the raw count matrices with
                              p_val passed as the threshold argument;
                   "human":   log-odds matrices and thresholds built from a
                              fixed human background distribution;
                   "equal":   same, with a uniform 0.25 background;
                   anything else: case-insensitive regular expressions that
                              require at least three repeats of the
                              dinucleotide.
    p_val       -- p-value for the MOODS modes; also used in the names of the
                   match files.
    write_match -- when True (MOODS modes only) append the matched sites to
                   '<date>{ga,ca,gc,ta}_matches_<1/p_val>.txt' in the current
                   directory.  Always disabled in regex mode.

    Returns a list with one entry per motif, in the order GA, CA, GC, TA,
    followed by the sequence length.  In the MOODS modes each entry is
    whatever MOODS.search() returned for that matrix; in regex mode it is the
    list of matched substrings, where each motif also includes its
    complementary repeat (GA+TC, CA+TG, GC+CG, TA+AT).
    """
    # Collect the lines first and join once instead of rebuilding the string
    # on every iteration.
    leading_lines = []
    for line in enhlist:
        if line.startswith('>'):
            break
        leading_lines.append(line)
    seq = ''.join(leading_lines).replace('\n', '')

    pseudocount = 0.001
    # Count matrices, six columns wide.
    # NOTE(review): rows are presumably in A, C, G, T order -- confirm.
    caca = [[0, 1, 0, 1, 0, 1], [1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
    me137 = [[0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0]]
    gcgc = [[0, 0, 0, 0, 0, 0], [0, 1, 0, 1, 0, 1], [1, 0, 1, 0, 1, 0], [0, 0, 0, 0, 0, 0]]
    tata = [[0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 0, 1, 0, 1, 0]]
    motifs = [me137, caca, gcgc, tata]

    if c == "default":
        results = MOODS.search(seq, motifs, p_val, both_strands=True)
    elif c in ("human", "equal"):
        if c == "human":
            background = [0.29508855202553025, 0.20466109233964447,
                          0.20478482916547036, 0.2954655264693549]
        else:
            background = [0.25, 0.25, 0.25, 0.25]
        log_odds = [MOODS.count_log_odds(matrix, background, pseudocount)
                    for matrix in motifs]
        thresholds = [MOODS.threshold_from_p(matrix, background, p_val)
                      for matrix in log_odds]
        results = MOODS.search(seq, log_odds, thresholds,
                               convert_log_odds=False,
                               threshold_from_p=False, both_strands=True)
    else:
        # Regex hits are plain strings, not (position, score) pairs, so the
        # match-file writer below cannot handle them.
        write_match = False
        pattern_pairs = [
            (r'GAGA(?:GA)+', r'TCTC(?:TC)+'),
            (r'CACA(?:CA)+', r'TGTG(?:TG)+'),
            (r'GCGC(?:GC)+', r'CGCG(?:CG)+'),
            (r'TATA(?:TA)+', r'ATAT(?:AT)+'),
        ]
        results = [
            re.findall(forward, seq, re.IGNORECASE)
            + re.findall(reverse, seq, re.IGNORECASE)
            for forward, reverse in pattern_pairs
        ]

    # Write files containing the matches for each motif as found by MOODS.
    if write_match:
        stamp = time.strftime("%Y-%m-%d")
        for tag, motif, hits in zip(('ga', 'ca', 'gc', 'ta'), motifs, results):
            width = len(motif[1])
            file_name = '%s%s_matches_%d.txt' % (stamp, tag, 1 / p_val)
            with open(file_name, 'a') as out:
                for match in hits:
                    pos = match[0]
                    # Negative positions mark hits on the other strand; map
                    # them back to a forward offset and complement the site.
                    on_other_strand = pos < 0
                    if on_other_strand:
                        pos += len(seq)
                    site = seq[pos:pos + width]
                    if on_other_strand:
                        site = complement(site)
                    out.write('%s\n' % site)

    results.append(len(seq))
    return results