예제 #1
0
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    for fastaf in fastafile:
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        handle = open(fastaf, "r")
        records = list(Bio.SeqIO.parse(handle, "fasta"))
        handle.close()
        thisseq = records[0].seq
        print 'Doing sequence ', thisseqname, 'length=', len(thisseq)
        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            print ' Doing pwm ', thispwmname
            thispwm = MOODS.load_matrix(pwmf)
            thispwmcomplement = MOODS.reverse_complement(thispwm)
            
            print '  strand 1'
            onestrandindexvector=getMOODSscore(thisseq, thispwm)
            print '  strand 2'
            otherstrandindexvecor=getMOODSscore(thisseq, thispwmcomplement)
            print '  finding best score per bp'
            bothstrandsindexvector = np.append( onestrandindexvector, otherstrandindexvecor, axis=0)
            bestscorevector = getMaxPWMScore( bothstrandsindexvector, len(thispwm[0]))
            
            for strandnbr in range(len(bothstrandsindexvector)):
                print '  writing wiggle for strand', str(strandnbr)
                vegardswritewiggle(bothstrandsindexvector[strandnbr,:], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/'+ thispwmname+ '/strand_'+str(strandnbr))
            
            print '  writing wiggle for bestscore'
            vegardswritewiggle(bestscorevector, name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/'+thispwmname)
def getMOODSscore(seqfile, pwmfiles, both_strands=False):
    handle = open(seqfile, "r")
    records = list(Bio.SeqIO.parse(handle, "fasta"))
    handle.close()
    seq = records[0].seq
    print 'len(seq)=', len(seq)
    matrixlist = list()
    for f in pwmfiles:
        matrix = MOODS.load_matrix(f)
        print 'pwm ', f, 'windowlength=', len(matrix[0])
        matrixlist.append(matrix)
        if both_strands:
            matrixlist.append(
                MOODS.reverse_complement(matrix)
            )  # both_strand option in MOODS returned a akward result.
    print 'starting MOODS.search', datetime.now()
    results = MOODS.search(seq,
                           matrixlist,
                           thresholds=1,
                           absolute_threshold=False)
    print 'done MOODS.search', datetime.now()
    reslist = []
    for n in range(len(pwmfiles)):
        thisind = n * (1 + both_strands)

        reslist.append(vegardparseMOODSres(results[thisind], len(seq)))
        if both_strands:
            reslist[n] = np.append(reslist[n],
                                   vegardparseMOODSres(results[thisind + 1],
                                                       len(seq)),
                                   axis=0)
    return (reslist)
예제 #3
0
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    for fastaf in fastafile:
        ### seqence only needed for length here. MOODS does this parsing again later but without reporting length.
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        handle = open(fastaf, "r")
        records = list(Bio.SeqIO.parse(handle, "fasta"))
        handle.close()
        thisseq = records[0].seq
        print 'Doing sequence ', thisseqname, 'length=', len(thisseq)

        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            thispwm = MOODS.load_matrix(pwmf)
            print ' Doing MOODS both strands for pwm ', thispwmname, ', length=', len(
                thispwm[0]), datetime.now()
            onestrandsindexvector = getMOODSscore(fastaf, pwmf, len(thisseq))
            print '  bp with no score (given ', NO_SCORE_VALUE, ') is ', (
                onestrandsindexvector == NO_SCORE_VALUE).sum(), ' expected ', (
                    len(thispwm[0]) - 1)
            print '  finding best score per bp, ', datetime.now()
            bestscorevector = getMaxPWMScore(onestrandsindexvector,
                                             len(thispwm[0])),

            print '  writing wiggle for score per start index.', datetime.now()
            vegardswritewiggle(onestrandsindexvector[0, ],
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' + 'start_index_score/' +
                               thispwmname)

            print '  writing wiggle for bestscore. ', datetime.now()
            vegardswritewiggle(bestscorevector[0],
                               name=thispwmname,
                               chr=thisseqname,
                               destpath=destdir + '/' +
                               'best_score_in_window/' + thispwmname)
예제 #4
0
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True):
    for fastaf in fastafile:
        ### seqence only needed for length here. MOODS does this parsing again later but without reporting length.
        thisseqname = fastaf.split('/')[-1].split('.')[0]
        handle = open(fastaf, "r")
        records = list(Bio.SeqIO.parse(handle, "fasta"))
        handle.close()
        thisseq = records[0].seq
        print 'Doing sequence ', thisseqname, 'length=', len(thisseq)
        
        for pwmf in pwmfiles:
            thispwmname = pwmf.split('/')[-1]
            thispwm = MOODS.load_matrix(pwmf)
            print ' Doing MOODS both strands for pwm ', thispwmname, ', length=', len(thispwm[0]), datetime.now()
            onestrandsindexvector = getMOODSscore(fastaf, pwmf, len(thisseq))
            print '  bp with no score (given ', NO_SCORE_VALUE,  ') is ', (onestrandsindexvector == NO_SCORE_VALUE).sum(), ' expected ', (len(thispwm[0])-1)
            print '  finding best score per bp, ', datetime.now()
            bestscorevector = getMaxPWMScore( onestrandsindexvector, len(thispwm[0])), 
            
            print '  writing wiggle for score per start index.', datetime.now()
            vegardswritewiggle(onestrandsindexvector[0,], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/'+ thispwmname)
            
            print '  writing wiggle for bestscore. ', datetime.now()
            vegardswritewiggle(bestscorevector[0], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/'+thispwmname)
예제 #5
0
def getMOODSscore(seqfile, pwmfiles, both_strands=False):
    handle = open(seqfile, "r")
    records = list(Bio.SeqIO.parse(handle, "fasta"))
    handle.close()
    seq = records[0].seq
    print 'len(seq)=',len(seq)
    matrixlist=list()
    for f in pwmfiles:
        matrix = MOODS.load_matrix(f)
        print 'pwm ', f , 'windowlength=', len(matrix[0])
        matrixlist.append(matrix)
        if both_strands:
            matrixlist.append(MOODS.reverse_complement(matrix)) # both_strand option in MOODS returned a akward result.
    print 'starting MOODS.search', datetime.now()
    results = MOODS.search(seq, matrixlist, thresholds=1, absolute_threshold=False)
    print 'done MOODS.search', datetime.now()
    reslist=[]
    for n in range(len(pwmfiles)):
        thisind = n * (1 + both_strands)
        
        reslist.append(vegardparseMOODSres( results[thisind] , len(seq)))
        if both_strands:
            reslist[n] = np.append( reslist[n] , vegardparseMOODSres( results[thisind+1] , len(seq)), axis=0)
    return(reslist)
# Driver script: score the sequence in `fastafile` against all `pwmfiles`
# and write the results with vegardswritewiggle. `fastafile`, `pwmfiles`,
# `calculate_both_strands`, `seqname` and `outputdir` are expected to be
# defined earlier in the module.
print 'running getMOODSscore', datetime.now()
indexscorematrix = getMOODSscore(fastafile,
                                 pwmfiles,
                                 both_strands=calculate_both_strands)
print 'finished getMOODSscore', datetime.now()

# For every PWM:
#   build the max-score array,
#   write out 3 files.
for n in range(len(pwmfiles)):
    thisname = pwmfiles[n].split('/')[-1]
    print 'making maxscpre for ', thisname, datetime.now()
    # Entry n of the getMOODSscore result belongs to pwmfiles[n].
    thisscorematrix = indexscorematrix[n]
    # The matrix is loaded again only to obtain len(matrix[0]).
    thispwmlength = len(MOODS.load_matrix(pwmfiles[n])[0])
    thismaxvector = getMaxPWMScore(thisscorematrix, thispwmlength)

    print 'writing wiggle for ', thisname, datetime.now()
    # Best-score file.
    # NOTE(review): the keyword here is `path=`, whereas makePWMscorefiles
    # passes `destpath=` to the same function -- confirm which is correct.
    vegardswritewiggle(thismaxvector,
                       name=thisname,
                       chr=seqname,
                       path=outputdir + '/' + 'best_score_in_window/' +
                       thisname)
    # One start-index-score file per row of the score matrix.
    for strandnbr in range(len(thisscorematrix)):
        vegardswritewiggle(thisscorematrix[strandnbr, :],
                           name=thisname,
                           chr=seqname,
                           path=outputdir + '/' + 'start_index_score/' +
                           thisname + '/strand_' + str(strandnbr))
예제 #7
0
def ProcessCLI(args):
    
    outputDirectory = '/N/u/jubudka/Mason/BindingFiles/'
    weightMatrixDirectory = '/N/u/jubudka/Mason/PWMsmall/'
    sequencesFileName = 'FASTA_All_Merged_Encode.fasta'
    p_val = 0.0001

    print args
    for i in xrange(len(args)):
        if args[i] == "-f":
	    sequencesFileName = args[i+1]
	    print "Fasta file is: ", sequencesFileName
	elif args[i] == "-p":
	    weightMatrixDirectory = args[i+1]
	    print "PWM file is: ", weightMatrixDirectory
	elif args[i] == "-o":
	    outputDirectory = args[i+1]
	    print "Output file is: ", outputDirectory
	elif args[i] == "-t":
	    p_val = float(args[i+1])

    if not os.path.exists(outputDirectory):
	os.makedirs(outputDirectory)
	

    # file for saving average score stuff
    # load position weight matrices
    # order is A C G T
    sequences = {}
    seqIDs = []
    current_sequence = ''
    sequencesFile = open(sequencesFileName)

    aCount = 0
    cCount = 0
    gCount = 0
    tCount = 0
    totalLength = 0

    for lines in sequencesFile:
	line = lines.strip()
	if line == '':
	    continue
	if (line[0].startswith('>')):
	    seqIDs.append(line[1:])
	    #add previous sequence to dictionary
	    #create the reverse complement and add to dictionary
	    #perform nucleotide counting
	    #reset sequence to '' for next fasta sequence
	    if (len(current_sequence) > 0):
		upper_current_sequence = current_sequence.upper()
		seqID = seqIDs.pop(0)
		sequences[seqID + ' ' + 'p'] = upper_current_sequence
		reverseSequence = reverse_complement(upper_current_sequence)
		sequences[seqID + ' ' + 'm'] = reverseSequence
		aCount = aCount + upper_current_sequence.count('A')
                cCount = cCount + upper_current_sequence.count('C')
                gCount = gCount + upper_current_sequence.count('G')
                tCount = tCount + upper_current_sequence.count('T')
                totalLength = totalLength + len(current_sequence)

	    current_sequence = ''
	else:
	    current_sequence += line

    upper_current_sequence = current_sequence.upper()
    seqID = seqIDs.pop(0)
    sequences[seqID + ' ' + 'p'] = upper_current_sequence
    reverseSequence = reverse_complement(upper_current_sequence)
    sequences[seqID + ' ' + 'm'] = reverseSequence

    aCount = aCount + upper_current_sequence.count('A')
    cCount = cCount + upper_current_sequence.count('C')
    gCount = gCount + upper_current_sequence.count('G')
    tCount = tCount + upper_current_sequence.count('T')
    totalLength = totalLength + len(current_sequence)

    aContent = aCount/float(totalLength)
    cContent = cCount/float(totalLength)
    gContent = gCount/float(totalLength)
    tContent = tCount/float(totalLength)
  
    backgroundScores = {'A':aContent, 'C':cContent, 'G':gContent, 'T':tContent}
    bg = [backgroundScores['A'], backgroundScores['C'], backgroundScores['G'], backgroundScores['T']]
    print bg

    matrix_names = [filename for filename in os.listdir(weightMatrixDirectory) if filename[-4:] == '.pfm']
    pseudocount = 1

    matrices = [MOODS.load_matrix(weightMatrixDirectory + filename) for filename in matrix_names]

    matrices = [MOODS.count_log_odds(matrix, bg, pseudocount) for matrix in matrices]

    thresholds = [MOODS.threshold_from_p(matrix, bg, p_val) for matrix in matrices]


    for (matrix, matrix_name, threshold) in zip(matrices, matrix_names, thresholds):
	    motifLength = len(matrix[0])
	    if motifLength >= 18:
		matrix_mapper_long(matrix, matrix_name, threshold, outputDirectory, sequences)
		continue
	    else:
		matrix_mapper(matrix, matrix_name, threshold, outputDirectory, sequences)

    print "Finished"		
예제 #8
0
# Driver script: run the MOODS algorithm on the sequence with all PWM files
# and write the results with vegardswritewiggle. `fastafile`, `pwmfiles`,
# `calculate_both_strands`, `seqname` and `outputdir` are expected to be
# defined earlier in the module.
print 'running getMOODSscore', datetime.now()
indexscorematrix = getMOODSscore(fastafile, pwmfiles, both_strands=calculate_both_strands)
print 'finished getMOODSscore', datetime.now()

# For every PWM:
#   build the max-score array,
#   write out 3 files.
for n in range(len(pwmfiles)):
    thisname = pwmfiles[n].split('/')[-1]
    print 'making maxscpre for ',thisname, datetime.now()
    # Entry n of the getMOODSscore result belongs to pwmfiles[n].
    thisscorematrix = indexscorematrix[n]
    # The matrix is loaded again only to obtain len(matrix[0]).
    thispwmlength = len(MOODS.load_matrix(pwmfiles[n])[0])
    thismaxvector = getMaxPWMScore(thisscorematrix, thispwmlength)
    
    print 'writing wiggle for ',thisname, datetime.now()
    # Best-score file.
    # NOTE(review): the keyword here is `path=`, whereas makePWMscorefiles
    # passes `destpath=` to the same function -- confirm which is correct.
    vegardswritewiggle(thismaxvector, name=thisname, chr=seqname, path=outputdir + '/' + 'best_score_in_window/'+thisname)
    # One start-index-score file per row of the score matrix.
    for strandnbr in range(len(thisscorematrix)):
        vegardswritewiggle(thisscorematrix[strandnbr,:], name=thisname, chr=seqname, path=outputdir + '/' + 'start_index_score/'+ thisname+ '/strand_'+str(strandnbr))
    



# NOTE(review): `temp1` is read here but not assigned anywhere in the visible
# code, and `thispwmlength` is whatever the last loop iteration left behind --
# this looks like leftover scratch code; confirm before relying on it.
temp1 = getMaxPWMScore(temp1, thispwmlength)