예제 #1
0
def MOODS_search(seq, motif, thresholds=0):
    """Scan ``seq`` with ``motif`` via MOODS; an equivalent to Motif.search_pwm().

    :param seq: Sequence handed unchanged to ``MOODS.search``.
    :param motif: Object whose ``log_odds()`` yields one mapping per motif
        position (presumably letter -> log-odds score; verify against caller).
    :param thresholds: Threshold forwarded to ``MOODS.search``; it is
        interpreted as an absolute score because ``absolute_threshold=True``.
    :returns: The hits of the single searched matrix, ordered by the
        absolute value of their first element (the position).
    :raises RuntimeError: If the module-level ``USE_MOODS`` flag is false.
    """
    if not USE_MOODS:
        raise RuntimeError("MOODS could not be imported")

    # One row per motif position, with the scores ordered by their sorted
    # keys.  A list comprehension (rather than ``map``) guarantees real lists
    # on every Python version.
    rows = [
        [score for _, score in sorted(column.items())]
        for column in motif.log_odds()
    ]
    matrix_ = MOODS.transpose(rows)

    # Note: algorithm = 'lf' fails due to segmentation fault
    results_per_matrix = MOODS.search(
        seq,
        [matrix_],
        thresholds,
        bg=None,
        algorithm="pla",
        q=7,
        absolute_threshold=True,
        both_strands=True,
        combine=True,
    )

    # Format as Motif.search_pwm results: only one matrix was searched.
    search_results = results_per_matrix[0]

    # TODO (kept from the original): figure out the direction of the reverse
    # strand results -- do we need to reverse them?
    # ``key=`` replaces the former ``cmp=`` argument, which no longer exists
    # in Python 3; the resulting (stable) order is the same.
    return sorted(search_results, key=lambda hit: abs(hit[0]))
예제 #2
0
def getMOODSscore_old(seq, mat):
    """Return a (1, len(seq)) float32 array of MOODS scores for one matrix.

    Every position starts at the sentinel value -100 and is overwritten with
    the score of each hit that ``MOODS.search`` reports for ``mat``.

    :param seq: Sequence to scan (its ``len`` fixes the array width).
    :param mat: Matrix handed to ``MOODS.search``.
    :returns: ``numpy.ndarray`` of shape ``(1, len(seq))`` and dtype float32.
    """
    results = MOODS.search(seq, [mat], thresholds=1, absolute_threshold=False)
    # ``np.float32`` replaces the capitalised 'Float32' string alias, which
    # recent NumPy releases reject.
    resarray = np.full((1, len(seq)), -100, dtype=np.float32)
    for position, score in results[0]:
        resarray[0, position] = score
    return resarray
def getMOODSscore(seqfile, pwmfiles, both_strands=False):
    """Score the first FASTA record of ``seqfile`` against several PWM files.

    :param seqfile: Path of a FASTA file; only its first record is scanned.
    :param pwmfiles: Iterable of paths loaded with ``MOODS.load_matrix``.
    :param both_strands: When true, the reverse complement of every matrix is
        searched as an extra matrix and its parsed result is appended (along
        axis 0) to the forward result of the same PWM.
    :returns: List with one entry per PWM file, each produced by
        ``vegardparseMOODSres`` (presumably a score array; verify against
        that helper).
    """
    # The context manager closes the file even if parsing raises.
    with open(seqfile, "r") as handle:
        records = list(Bio.SeqIO.parse(handle, "fasta"))
    seq = records[0].seq
    # Single pre-formatted strings print identically on Python 2 and 3.
    print('len(seq)= %s' % (len(seq),))

    matrixlist = []
    for f in pwmfiles:
        matrix = MOODS.load_matrix(f)
        print('pwm  %s windowlength= %s' % (f, len(matrix[0])))
        matrixlist.append(matrix)
        if both_strands:
            # both_strand option in MOODS returned an awkward result, so the
            # reverse complement is searched as a matrix of its own.
            matrixlist.append(MOODS.reverse_complement(matrix))

    print('starting MOODS.search %s' % (datetime.now(),))
    results = MOODS.search(seq,
                           matrixlist,
                           thresholds=1,
                           absolute_threshold=False)
    print('done MOODS.search %s' % (datetime.now(),))

    # Each PWM occupies one slot in ``results``, or two when the reverse
    # complement was added right behind it.
    stride = 2 if both_strands else 1
    reslist = []
    for n in range(len(pwmfiles)):
        first = n * stride
        scores = vegardparseMOODSres(results[first], len(seq))
        if both_strands:
            scores = np.append(scores,
                               vegardparseMOODSres(results[first + 1],
                                                   len(seq)),
                               axis=0)
        reslist.append(scores)
    return reslist
예제 #4
0
def getPWMscores(regions, PWM, fasta, regionsFile):
    """
    Score every basepair of a set of regions with a match to a PWM.

    :param regions: `bedtools.BedTool` with regions to look at.
    :type regions: bedtools.BedTool
    :param PWM: Matrix handed to ``MOODS.search`` as the only matrix.
    :type PWM: presumably a list of rows accepted by MOODS -- TODO confirm
    :param fasta: Fasta file with genome sequence.
    :type fasta: str
    :param regionsFile: Path of a tab-separated file whose 4th column holds
        the region names used as the index of the result.
    :type regionsFile: str
    :returns: ``pandas.DataFrame`` with one row per region and the fixed
        columns ``range(-990, 990)``.
    """
    import MOODS
    import numpy as np
    import pandas as pd

    # Get nucleotides: every second line of the extracted FASTA is a sequence.
    seq = regions.sequence(s=True, fi=fasta)
    with open(seq.seqfn) as handle:
        seqs = handle.read().split("\n")[1::2]  # get fasta sequences

    # Match strings with PWM, keeping only the score of each reported hit.
    scores = [
        np.array([score for _, score in MOODS.search(sequence, [PWM], 30)[0]])
        for sequence in seqs
    ]

    # Skip empty lines instead of blindly dropping the last one, so a file
    # without a trailing newline no longer loses its final region; the
    # context manager also closes the file.
    with open(regionsFile) as handle:
        names = [
            line.split("\t")[3]
            for line in handle.read().split("\n")
            if line
        ]

    # NOTE(review): the hard-coded columns imply 1980 scores per region --
    # confirm against the region width used by callers.
    return pd.DataFrame(scores, index=names, columns=range(-990, 990))
예제 #5
0
def getMOODSscore_old(seq, mat):
    """Build a per-position score row for ``mat`` over ``seq`` using MOODS.

    Positions without a reported hit keep the sentinel value -100.

    :param seq: Sequence to scan; ``len(seq)`` is the number of columns.
    :param mat: Matrix passed to ``MOODS.search`` as the only matrix.
    :returns: float32 ``numpy.ndarray`` of shape ``(1, len(seq))``.
    """
    hits_per_matrix = MOODS.search(seq, [mat], thresholds=1,
                                   absolute_threshold=False)
    # The capitalised 'Float32' dtype string is rejected by recent NumPy
    # releases; ``np.float32`` names the same type on every version.
    resarray = np.full((1, len(seq)), -100, dtype=np.float32)
    for position, score in hits_per_matrix[0]:
        resarray[0, position] = score
    return resarray
예제 #6
0
def getPWMscores(regions, PWM, fasta, regionsFile):
    """
    Score every basepair of a set of regions with a match to a PWM.

    :param regions: `bedtools.BedTool` with regions to look at.
    :type regions: bedtools.BedTool
    :param PWM: Matrix searched with ``MOODS.search`` (threshold 30).
    :type PWM: presumably a MOODS-compatible list of rows -- TODO confirm
    :param fasta: Fasta file with genome sequence.
    :type fasta: str
    :param regionsFile: Tab-separated file; column 4 provides the row names.
    :type regionsFile: str
    :returns: ``pandas.DataFrame`` indexed by region name with columns
        ``range(-990, 990)``.
    """
    import MOODS
    import numpy as np
    import pandas as pd

    # Get nucleotides
    seq = regions.sequence(s=True, fi=fasta)
    with open(seq.seqfn) as handle:
        # Odd-numbered entries are the sequence lines following each header.
        seqs = handle.read().split("\n")[1::2]

    # Match strings with PWM
    scores = []
    for sequence in seqs:
        hits = MOODS.search(sequence, [PWM], 30)[0]
        scores.append(np.array([score for _, score in hits]))

    # Read the names through a context manager (the original never closed
    # the file) and ignore blank lines rather than popping the last entry,
    # which dropped a real region whenever the trailing newline was missing.
    with open(regionsFile) as handle:
        records = [line for line in handle.read().split("\n") if line]
    names = [record.split("\t")[3] for record in records]

    # NOTE(review): fixed column labels assume 1980 scores per region.
    return pd.DataFrame(scores, index=names, columns=range(-990, 990))
예제 #7
0
def search(consensus_list, TF, master, search_region, p):
    """Scan FASTA-style lines with MOODS and print each new motif hit.

    Lines containing ``'strand'`` are skipped, lines containing ``'>'`` become
    the current header, and every other line is searched against all matrices
    in ``master``.

    :param consensus_list: Consensus strings, parallel to ``master``; their
        lengths give the width of the printed match.
    :param TF: Transcription-factor names, parallel to ``master``.
    :param master: List of matrices handed to ``MOODS.search``.
    :param search_region: Iterable of header and sequence lines.
    :param p: Threshold passed to ``MOODS.search``.

    If ``sys.argv[3]`` equals ``"True"`` only one line per (TF, header) pair is
    printed; otherwise every distinct (TF, header, position) is printed and
    handed to ``align``.
    """
    # Optional third command-line argument selects "interaction only" output.
    interaction_only = len(sys.argv) > 3 and sys.argv[3] == "True"

    # Progress counter on stderr, one line per matrix (as in the original).
    for count, _ in enumerate(master, 1):
        sys.stderr.write('%s\n' % (count,))

    n_matrices = len(master)
    seen = set()  # hits already reported; a set keeps the lookup O(1)
    header = ''

    for region in search_region:
        if 'strand' in region:
            continue
        if '>' in region:
            header = region
            continue
        if n_matrices == 0:
            continue

        # The original repeated this identical search once per matrix; the
        # repeats could only produce hits that were already reported.
        result = MOODS.search(region, master, p)
        for check, hits in enumerate(result):
            name = TF[check]
            consensus = consensus_list[check]
            for hit in hits:
                position = hit[0]
                if interaction_only:
                    key = (name, header)
                    if key in seen:
                        continue
                    seen.add(key)
                    # Report the TF that actually matched (the original
                    # printed TF[i] of the outer loop, i.e. always TF[0]).
                    print('%s %s' % (name.strip(), header.strip()[1:]))
                    print('')
                else:
                    key = (name, header, position)
                    if key in seen:
                        continue
                    seen.add(key)
                    print('%s %s' % (name.strip(), header.strip()[1:]))
                    print('position: %s' % (position,))
                    align(consensus,
                          region[position:position + len(consensus)])
                    print('')
예제 #8
0
def ProcessSeqs(SEQ_HANDLE, PWMS, THRESHOLD, WANT_REV=False, bg=None):
    """Yields matches on sequences in an 'interval' formatted dictionary

    :param SEQ_HANDLE: Handle passed to ``ReadInterval``, which yields
        dictionaries with NAME, SEQ, START, STRAND and CHROM entries.
    :param PWMS: Sequence of ``(name, matrix)`` pairs.
    :param THRESHOLD: p-value converted to a per-matrix score threshold with
        ``MOODS.threshold_from_p``.
    :param WANT_REV: Forwarded to MOODS as ``both_strands``.
    :param bg: Background distribution forwarded to MOODS.
    :returns: Generator of dictionaries describing each hit whose score is
        strictly above the threshold of its matrix.
    """
    # Real lists (not ``map`` objects) so they can be reused for every
    # interval; under Python 3 a ``map`` iterator would be exhausted after
    # the first one.
    pwm_names = [pwm[0] for pwm in PWMS]
    pwm_mats = [pwm[1] for pwm in PWMS]
    thresh = [MOODS.threshold_from_p(mat, bg, THRESHOLD) for mat in pwm_mats]

    for interval in ReadInterval(SEQ_HANDLE):
        print(interval['NAME'])

        results = MOODS.search(interval['SEQ'].upper(),
                               pwm_mats,
                               thresh,
                               both_strands=WANT_REV,
                               algorithm='lf',
                               absolute_threshold=True,
                               bg=bg)

        for res, pwm_name, pwm_mat, th in zip(results, pwm_names, pwm_mats,
                                              thresh):
            width = len(pwm_mat[0])
            for position, score in res:
                if not score > th:
                    # MOODS reported a hit that does not beat the threshold.
                    print('got bad result')
                    continue
                # NOTE(review): with WANT_REV=True the position may be
                # negative for reverse-strand hits; START/END/SEQ do not
                # account for that -- confirm against callers.
                start = int(interval['START']) + position
                yield {
                    'NAME': interval['NAME'],
                    'START': start,
                    'END': start + width,
                    'STRAND': interval['STRAND'],
                    'PWM': pwm_name,
                    'SCORE': score,
                    'CHROM': interval['CHROM'],
                    'SEQ':
                    interval['SEQ'][position:(position + width)].upper()
                }
예제 #9
0
def getMOODSscore(seqfile, pwmfiles, both_strands=False):
    """Scan the first record of a FASTA file with every PWM in ``pwmfiles``.

    :param seqfile: Path of the FASTA file; only ``records[0]`` is used.
    :param pwmfiles: Paths of matrices loadable by ``MOODS.load_matrix``.
    :param both_strands: If true, each matrix is followed by its reverse
        complement in the search, and both parsed results are stacked along
        axis 0 in the entry of that PWM.
    :returns: List with one ``vegardparseMOODSres`` result per PWM file
        (presumably a score array; verify against that helper).
    """
    # ``with`` guarantees the handle is closed even when parsing fails.
    with open(seqfile, "r") as handle:
        records = list(Bio.SeqIO.parse(handle, "fasta"))
    seq = records[0].seq
    seq_length = len(seq)
    # Pre-formatted strings keep the output identical on Python 2 and 3.
    print('len(seq)= %s' % (seq_length,))

    matrixlist = []
    for f in pwmfiles:
        matrix = MOODS.load_matrix(f)
        print('pwm  %s windowlength= %s' % (f, len(matrix[0])))
        matrixlist.append(matrix)
        if both_strands:
            # both_strand option in MOODS returned an awkward result.
            matrixlist.append(MOODS.reverse_complement(matrix))

    print('starting MOODS.search %s' % (datetime.now(),))
    results = MOODS.search(seq, matrixlist, thresholds=1,
                           absolute_threshold=False)
    print('done MOODS.search %s' % (datetime.now(),))

    # Results come in the order of ``matrixlist``: one slot per PWM, or two
    # (forward, reverse complement) when both strands are searched.
    slots_per_pwm = 2 if both_strands else 1
    reslist = []
    for n in range(len(pwmfiles)):
        thisind = n * slots_per_pwm
        parsed = vegardparseMOODSres(results[thisind], seq_length)
        if both_strands:
            reverse = vegardparseMOODSres(results[thisind + 1], seq_length)
            parsed = np.append(parsed, reverse, axis=0)
        reslist.append(parsed)
    return reslist
예제 #10
0
파일: example3.py 프로젝트: Wyss/MOODS
# Load the example sequence: the second field of the first FASTA record.
records = fasta.parseFasta(fasta_filepath)
seq = records[0][1]

# Two example matrices, one row per nucleotide.
matrix1 = [
    [0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
]
# NOTE(review): the first row has 6 entries while the others have 7 --
# confirm whether a value is missing.
matrix2 = [
    [10, 0, 10, 3, 5, 5],
    [0, 5, 0, 3, 5, 0, 5],
    [0, 1, 0, 3, 0, 5, 0],
    [0, 4, 0, 1, 0, 0, 5],
]

# One-shot search through the module-level convenience function.
results = MOODS.search(seq, [matrix1, matrix2], 0.011)

print("Matrix 1 results: %d" % len(results[0]))
print("Matrix 2 results: %d" % len(results[1]))

# Same search through a reusable MOODSSearch object.
matrices = [matrix1, matrix2]
thresholds = [0.011, 0.011]
bg = MOODS.bg_from_sequence(seq, 0.1)
q = 7
absolute_threshold = False
both_strands = False
ms = MOODS.MOODSSearch(
    matrices, thresholds, bg, q, absolute_threshold, both_strands
)
results = ms.search(seq)

print("New Matrix 1 results: %d" % len(results[0]))
예제 #11
0
파일: find_motifs.py 프로젝트: xguse/gfunc
def main():
    """
    The main loop.  Lets ROCK!

    Parses the command line, then either
    * re-normalizes a previous output table (``--to-norm``) and exits, or
    * scans every sequence of ``--seqs`` with all motifs of ``--motifs`` and
      writes one line of per-motif hit counts per sequence to ``--out``
      (plus a normalized table when ``--norm`` is given).
    """

    desc = """... ask me later! I'm on a deadline! ..."""

    parser = argparse.ArgumentParser(description=desc)

    parser.add_argument('--seqs', type=str,
                        help="""Path to a fasta file containing the 'promoter'
                        regions of a single species you wish to scan with motifs.""")

    parser.add_argument('--species', type=str,
                        help="""A quoted string of the species name: 'Anophles gambiae'.""")

    parser.add_argument('--motifs', type=str,
                        help="""Path to a file containing the motifs you wish to use.  
                        The file must be in JASPAR's 'matrix_only.txt' format.""")

    parser.add_argument('--thresh', type=float, required=False, default=0.001,
                        help="""A p-val cut-off above which hits will be ignored. (default = %(default)s)""")

    parser.add_argument('--out', type=str, required=False, default='compare_motifs.out',
                        help="""Path to outfile. (default = %(default)s)""")

    parser.add_argument('--norm', type=str, required=False, default=False,
                        help="""Optional path to outfile for data normalized by upper quartiles w.r.t. each motif. (default = %(default)s)""")

    parser.add_argument('--to-norm', type=str, required=False, default=False,
                            help="""Optional path to outfile of previous run that needs to be normalized. (default = %(default)s)""")

    args = parser.parse_args()

    # Normalizing a previous run needs neither motifs nor sequences, so it is
    # handled first and ends the program.
    if args.to_norm:
        norm_dict, headers = load_to_normalize(args.to_norm)
        write_normalized_table(headers, args.norm, norm_dict)
        # ``raise SystemExit`` does not depend on the ``site`` module the way
        # the interactive ``exit()`` helper does.
        raise SystemExit(0)

    # create parsers
    motifs = ParseJasparMatrixOnly(args.motifs)
    seqs = ParseFastA(args.seqs)

    # Load all motifs at once
    # We will be loading one seq at a time.
    motifs = motifs.to_dict()

    # set up output and headers
    headers = 'seq_name\tspecies\t%s\n' % ('\t'.join(motifs.keys()))
    norm_dict = OrderedDict() if args.norm else None
    # Same order as the header columns; a real list for the MOODS call.
    matrices = list(motifs.values())

    # lets start the major looping; ``with`` closes the output file even if
    # a search raises part-way through.
    with open(args.out, 'w') as out_file:
        out_file.write(headers)
        for name, seq in seqs:
            hits = MOODS.search(seq, matrices, args.thresh)
            counts = [len(x) for x in hits]
            out_file.write('%s\t%s\t%s\n' % (name, args.species,
                                             '\t'.join([str(x) for x in counts])))
            if args.norm:
                norm_dict['%s\t%s' % (name, args.species)] = np.array(counts)

    if args.norm:
        write_normalized_table(headers, args.norm, norm_dict)
예제 #12
0
           [0,0,0,1]]

teststring = 'acgtacgt'
'''

# Read the debug sequence; the context manager closes the file even when
# parsing fails.
with open('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.fasta', "r") as handle:
    records = list(Bio.SeqIO.parse(handle, "fasta"))
seq = records[0]
teststring = seq.seq


def _print_hits(results):
    """Print position and score of every hit of every searched matrix."""
    for hits in results:
        for (position, score) in hits:
            print("Position: " + str(position) + " Score: " + str(score))


# Compare the hits reported with and without the reverse strand.
print('both strands')
results = MOODS.search(teststring, [matrix], thresholds=1, absolute_threshold=False, both_strands=True)
_print_hits(results)

print('one way')
results = MOODS.search(teststring, [matrix], thresholds=1, absolute_threshold=False, both_strands=False)
_print_hits(results)
        
        
예제 #13
0
import MOODS

# Scoring matrix with one row per nucleotide and one column per motif
# position.
matrix = [
    [10, 0, 0],
    [0, 10, 0],
    [0, 0, 10],
    [10, 10, 10],
]

# NOTE(review): ``absolute_threshold`` is given the value 30 rather than a
# boolean -- presumably any truthy value enables it; confirm against MOODS.
results = MOODS.search(
    'actgtggcgtcaacgtaggccaacgtggacccgtacgtaaacgaagaggggtagtc',
    [matrix],
    30,
    absolute_threshold=30,
)

# One list of (position, score) hits per searched matrix.
for hits in results:
    for position, score in hits:
        print("Position: %s Score: %s" % (position, score))
예제 #14
0
def find_motif_disruptions(
    position,
    ref,
    alt,
    genome_fasta,
    matrices,
):
    """
    Determine whether there is a difference between the ref and alt
    alleles for TF binding. Requires samtools in your path.

    Parameters
    ----------
    position : str
        Zero based genomic coordinates of the reference allele of the form
        chrom:start-end (chr5:100-101 for a SNV for instance). The value end -
        start should equal the length of the ref allele.
        NOTE(review): the padded region is handed to ``samtools faidx``
        unchanged -- confirm the coordinate convention expected there.

    ref : str
        Reference allele. This should match the reference sequence at "position"
        in genome_fasta.

    alt : str
        Alternate allele.

    genome_fasta : str
        Path to genome fasta file. This file should be indexed.

    matrices : dict
        Dict whose keys are motif names and whose values are pandas data frames
        or numpy arrays containing PWMs with columns ACGT.

    Returns
    -------
    out : pandas.DataFrame
        Pandas data frame with motifs whose best matches that overlapped the
        variant differed between the reference and alternate sequences. A score
        of zero and a strand of '' indicates that there was not a match for the
        motif on the given allele.

    """
    import os
    import subprocess
    import MOODS
    import pandas as pd

    max_motif_length = max(x.shape[0] for x in matrices.values())
    chrom, coords = position.split(':')
    start, end = [int(x) for x in coords.split('-')]
    # Pad both sides so that every motif placement overlapping the variant
    # fits inside the fetched window.
    region = '{}:{}-{}'.format(chrom, start - max_motif_length + 1,
                               end + max_motif_length - 1)

    # Argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through safely.  ``~`` and environment
    # variables are expanded here because no shell does it any more.
    fasta_path = os.path.expanduser(os.path.expandvars(genome_fasta))
    faidx_output = subprocess.check_output(
        ['samtools', 'faidx', fasta_path, region],
        universal_newlines=True)  # text, not bytes, on Python 3
    seq_lines = faidx_output.strip().split()
    # Everything after the header belongs to the sequence; taking only the
    # first line truncated windows that samtools wraps over several lines.
    ref_seq = ''.join(seq_lines[1:])

    flank = max_motif_length - 1
    alt_seq = ref_seq[0:flank] + alt + ref_seq[flank + len(ref):]

    ref_variant_start = flank
    ref_variant_end = flank + len(ref)
    alt_variant_start = flank
    alt_variant_end = flank + len(alt)

    names = list(matrices.keys())
    # MOODS expects one row per nucleotide.  ``.values`` only exists on
    # pandas objects, so fall back to the transposed array itself for the
    # numpy input allowed by the docstring.
    ms = []
    for name in names:
        transposed = matrices[name].T
        ms.append(getattr(transposed, 'values', transposed).tolist())

    ref_res = MOODS.search(ref_seq,
                           ms,
                           0.001,
                           both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    ref_res = dict(zip(names, ref_res))
    alt_res = MOODS.search(alt_seq,
                           ms,
                           0.001,
                           both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    alt_res = dict(zip(names, alt_res))

    # First we'll remove any motif matches that don't overlap the variant of
    # interest (and thus can't be affected by the variant and will be the same
    # for ref and alt). Then we'll get the best match for each motif for ref
    # and alt.
    rows = []
    for motif in ref_res.keys():
        motif_length = matrices[motif].shape[0]
        ref_res[motif] = _filter_variant_motif_res(ref_res[motif],
                                                   ref_variant_start,
                                                   ref_variant_end,
                                                   motif_length,
                                                   ref_seq)
        alt_res[motif] = _filter_variant_motif_res(alt_res[motif],
                                                   alt_variant_start,
                                                   alt_variant_end,
                                                   motif_length,
                                                   alt_seq)

        ref_score, ref_strand = _best_motif_hit(ref_res[motif])
        alt_score, alt_strand = _best_motif_hit(alt_res[motif])
        if ref_score > 0 or alt_score > 0:
            diff = ref_score - alt_score
            rows.append(
                [motif, ref_score, ref_strand, alt_score, alt_strand, diff])

    out = pd.DataFrame(rows,
                       columns=[
                           'motif', 'ref_score', 'ref_strand', 'alt_score',
                           'alt_strand', 'score_diff'
                       ])
    out.index = out.motif
    out = out.drop('motif', axis=1)
    out = out[out.score_diff != 0]
    return out


def _best_motif_hit(hits):
    """Return ``(score, strand)`` of the best hit, or ``(0, '')`` if none."""
    if len(hits) == 0:
        return 0, ''
    # ``max`` keeps the first of equally scored hits, like the former
    # ``sorted(..., reverse=True)[0]``.
    hit_position, hit_score = max(hits, key=lambda hit: hit[1])
    # Negative positions mark reverse-strand hits, so position 0 is a
    # forward-strand hit (the original ``> 0`` test labelled it '-').
    return hit_score, ('+' if hit_position >= 0 else '-')
예제 #15
0
def count(enhlist, c, p_val):
    """Find dinucleotide-repeat motifs in the leading sequence of ``enhlist``.

    The sequence is the concatenation of all lines before the first line that
    starts with ``'>'`` (newlines removed).

    :param enhlist: List of text lines.
    :param c: Search mode. ``"default"`` lets MOODS handle ``p_val`` itself,
        ``"human"`` and ``"equal"`` convert the count matrices to log-odds
        with a fixed background first, and any other value uses regular
        expressions instead of MOODS.
    :param p_val: p-value threshold for the MOODS modes (unused by the
        regular-expression mode).
    :returns: List with one entry per motif (GA, CA, GC, TA) -- MOODS hits or
        regex match strings -- followed by the sequence length.
    """
    # Debug switch: set to True to dump the matched sequences of the MOODS
    # modes to per-motif text files.
    write_match = False

    # Collect the lines in front of the first header and join them once.
    parts = []
    for line in enhlist:
        if line.startswith('>'):
            break
        parts.append(line)
    seq = ''.join(parts).replace('\n', '')

    pseudocount = 0.001
    # Count matrices, one row per nucleotide.  NOTE(review): row order is
    # presumably A, C, G, T -- confirm against MOODS.
    #	starrmot15
    #	caca = [[49,0,100,0,91,0,100,0],[1,83,0,88,0,49,0,94],[49,1,0,12,0,9,0,6],[0,16,0,0,9,41,0,0,]]
    caca = [[0,1,0,1,0,1],[1,0,1,0,1,0],[0,0,0,0,0,0],[0,0,0,0,0,0]]
    me137 = [[0,1,0,1,0,1],[0,0,0,0,0,0],[1,0,1,0,1,0],[0,0,0,0,0,0]]
    gcgc = [[0,0,0,0,0,0],[0,1,0,1,0,1],[1,0,1,0,1,0],[0,0,0,0,0,0]]
    tata = [[0,1,0,1,0,1],[0,0,0,0,0,0],[0,0,0,0,0,0],[1,0,1,0,1,0]]
    count_matrices = [me137, caca, gcgc, tata]

    backgrounds = {
        "human": [0.29508855202553025, 0.20466109233964447,
                  0.20478482916547036, 0.2954655264693549],
        "equal": [0.25, 0.25, 0.25, 0.25],
    }

    if c == "default":
        results = MOODS.search(seq, count_matrices, p_val, both_strands=True)
    elif c in backgrounds:
        bg = backgrounds[c]
        matrices = [MOODS.count_log_odds(matrix, bg, pseudocount)
                    for matrix in count_matrices]
        thresholds = [MOODS.threshold_from_p(matrix, bg, p_val)
                      for matrix in matrices]
        results = MOODS.search(seq, matrices, thresholds,
                               convert_log_odds=False,
                               threshold_from_p=False, both_strands=True)
    else:
        # Regex mode returns plain strings, which the dump below cannot
        # handle, so it is never enabled here.
        write_match = False
        repeat_patterns = [
            (r'GAGA(?:GA)+', r'TCTC(?:TC)+'),
            (r'CACA(?:CA)+', r'TGTG(?:TG)+'),
            (r'GCGC(?:GC)+', r'CGCG(?:CG)+'),
            (r'TATA(?:TA)+', r'ATAT(?:AT)+'),
        ]
        results = []
        for forward, reverse in repeat_patterns:
            forward_re = re.compile(forward, re.IGNORECASE)
            reverse_re = re.compile(reverse, re.IGNORECASE)
            results.append(re.findall(forward_re, seq) +
                           re.findall(reverse_re, seq))

    # Write files containing the matches for each motif as found by MOODS,
    # only if the boolean set at the top of count() is True.
    if write_match:
        stamp = time.strftime("%Y-%m-%d")
        dumps = zip(('ga', 'ca', 'gc', 'ta'), count_matrices, results)
        for label, matrix, hits in dumps:
            _write_motif_matches(
                '%s%s_matches_%d.txt' % (stamp, label, 1/p_val),
                hits, seq, len(matrix[1]))

    results.append(len(seq))
    return results


def _write_motif_matches(path, hits, seq, width):
    """Append the ``width``-long sequence of every hit to the file ``path``."""
    with open(path, 'a') as out:
        for match in hits:
            pos = match[0]
            # Negative positions mark hits on the reverse strand.
            reverse = pos < 0
            if reverse:
                pos += len(seq)
            st = ''.join(seq[pos + i] for i in range(width))
            if reverse:
                st = complement(st)
            out.write('%s\n' % st)
예제 #16
0
def match_single(motif,
                 sequence,
                 genomic_region,
                 unique_threshold=None,
                 normalize_bitscore=True,
                 sort=False):
    """
    Performs motif matching given sequence and the motif.pssm passed as parameter.
    The genomic_region is needed to evaluate the correct binding position.

    Keyword arguments:
    motif -- Motif object; this function reads its pssm_list, threshold, max,
             len, name and is_palindrome attributes.
    sequence -- A DNA sequence (string).
    genomic_region -- A GenomicRegion (its chrom and initial are used).
    unique_threshold -- If this argument is provided, the motif search will be made using a threshold of 0 and
                        then accepting only the motif matches with bitscore/motif_length >= unique_threshold.
    normalize_bitscore -- If True, scores are rescaled to an integer between 0 and 1000;
                          otherwise the original bitscore is kept.
    sort -- If True, the resulting set is sorted before being returned.

    Return:
    A GenomicRegionSet named "mpbs" with one GenomicRegion per accepted match.
    """

    # Establishing threshold
    if unique_threshold:
        current_threshold = 0.0
        eval_threshold = unique_threshold
        motif_max = motif.max / motif.len
    else:
        current_threshold = motif.threshold
        eval_threshold = motif.threshold
        motif_max = motif.max

    # Performing motif matching
    try:
        # old MOODS version
        results = MOODS.search(sequence, [motif.pssm_list],
                               current_threshold,
                               absolute_threshold=True,
                               both_strands=True)
    except AttributeError:
        # Newer MOODS releases have no ``MOODS.search``.  Only that case
        # triggers the fallback; a bare ``except`` would also have hidden
        # genuine errors (and KeyboardInterrupt).
        # TODO: we can expand this to use bg from sequence, for example,
        # or from organism.
        bg = MOODS.tools.flat_bg(4)
        results = MOODS.scan.scan_dna(sequence, [motif.pssm_list], bg,
                                      [current_threshold], 7)

    grs = GenomicRegionSet("mpbs")

    for search_result in results:
        for r in search_result:
            try:
                # new API: match objects
                position, score = r.pos, r.score
            except AttributeError:
                # old API: (position, score) tuples
                (position, score) = r

            # Verifying unique threshold acceptance
            if unique_threshold and score / motif.len < unique_threshold:
                continue

            # If match forward strand
            if position >= 0:
                p1 = genomic_region.initial + position
                strand = "+"
            # If match reverse strand
            elif not motif.is_palindrome:
                p1 = genomic_region.initial - position
                strand = "-"
            else:
                # Reverse-strand hit of a palindrome duplicates the forward one.
                continue

            # Evaluating p2
            p2 = p1 + motif.len

            # Evaluating score (integer between 0 and 1000 -- needed for bigbed transformation)
            if normalize_bitscore:
                # Normalized bitscore = standardize to integer between 0 and 1000 (needed for bigbed transformation)
                # NOTE(review): with unique_threshold set, motif_max and
                # eval_threshold are per-position values while score is
                # not divided by motif.len -- confirm this is intended.
                if motif_max > eval_threshold:
                    norm_score = int(((score - eval_threshold) * 1000.0) /
                                     (motif_max - eval_threshold))
                else:
                    norm_score = 1000
            else:
                # Keep the original bitscore
                if unique_threshold:
                    norm_score = score / motif.len
                else:
                    norm_score = score

            grs.add(
                GenomicRegion(genomic_region.chrom,
                              int(p1),
                              int(p2),
                              name=motif.name,
                              orientation=strand,
                              data=str(norm_score)))

    if sort:
        grs.sort()

    return grs
예제 #17
0
import MOODS

# Scoring matrix: one row per nucleotide, one column per motif position.
matrix = [
    [10, 0, 0],
    [0, 10, 0],
    [0, 0, 10],
    [10, 10, 10],
]

# NOTE(review): ``absolute_threshold`` receives 30 instead of a boolean --
# presumably any truthy value enables it; confirm against MOODS.
results = MOODS.search(
    'actgtggcgtcaacgtaggccaacgtggacccgtacgtaaacgaagaggggtagtc',
    [matrix],
    30,
    absolute_threshold=30,
)

# ``results`` holds one list of (position, score) pairs per matrix.
for hits in results:
    for position, score in hits:
        print("Position: %s Score: %s" % (position, score))
예제 #18
0
def main():
    """Scan every FASTA record with the selected PWMs and write BED files.

    Positional arguments: the PWM file and the FASTA file.  One
    ``<name><pwm>.bed`` file is written per selected matrix.  Matrices of
    width <= 20 are searched with the 'lf' algorithm, wider ones with
    'supera'.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option('-t',
                 '--thresh',
                 action='store',
                 dest='threshold',
                 default=0.0,
                 help='determines threshold')
    p.add_option('-a',
                 '--append',
                 action='store',
                 dest='name',
                 default='resultsfor',
                 help='appends pwm name to this when\
                 creating files')
    p.add_option('-A',
                 '--absolute',
                 action='store_true',
                 dest='A',
                 default=False,
                 help='absolute threshold')
    p.add_option('-s',
                 '--standard_background',
                 action='store_true',
                 dest='stdbg')
    p.add_option('-M', '--specific_Matrix', action='store', dest='specific')
    options, args = p.parse_args()

    bgt = False
    if options.stdbg:
        bgt = [0.25, 0.25, 0.25, 0.25]

    underorequal20 = []
    over20 = []
    under20names = []
    over20names = []
    fileout = {}

    # NOTE(review): mode 'rU' is rejected by Python 3.11+; it is kept so the
    # universal-newline behaviour on older interpreters stays unchanged.
    # Both inputs stay open for the whole run (as before) and are now closed
    # reliably at the end.
    with open(args[0], 'rU') as pwm, open(args[1], 'rU') as fa:
        pfa = list(Bio.SeqIO.parse(fa, 'fasta'))
        index, matricies, sizes = pySeq.parsing.PWMparser.parse(pwm)

        try:
            # Construct Matrices to search and files to write to.
            for k in index.keys():
                # With -M only the named matrix is used; without it every
                # matrix is.  (The original attached the ``else`` to the
                # wrong ``if``: nothing was selected without -M and -M did
                # not filter at all.)
                if options.specific and k != options.specific:
                    continue
                fileout[k] = open(options.name + k + '.bed', 'w')
                if sizes[k] <= 20:
                    underorequal20.append(matricies[k])
                    under20names.append(k)
                else:
                    over20.append(matricies[k])
                    over20names.append(k)

            batches = (
                (underorequal20, under20names, 'lf'),  # Run under 20s
                (over20, over20names, 'supera'),       # Run over 20s
            )
            for chrom in pfa:
                print(chrom.name)
                # Should we sort the results as all downstream applications
                # require a sort first
                for batch_matrices, batch_names, algorithm in batches:
                    if not batch_matrices:
                        continue  # nothing of this size class was selected
                    res = MOODS.search(chrom.seq,
                                       batch_matrices,
                                       float(options.threshold),
                                       absolute_threshold=options.A,
                                       both_strands=True,
                                       bg=bgt,
                                       algorithm=algorithm)
                    _write_bed_hits(res, batch_names, sizes, fileout,
                                    chrom.name)
        finally:
            for handle in fileout.values():
                handle.close()


def _write_bed_hits(res, names, sizes, fileout, chrom_name):
    """Write the hits of each matrix as BED lines to the file of that matrix."""
    for name, hits in zip(names, res):
        for position, score in hits:
            start, end, strand = strand_adjust(position, sizes[name])
            # Add option to round the score values.  Defaulting to int atm
            # since bedToBigBed only accepts integer values....
            fileout[name].write('\t'.join([
                chrom_name,
                str(start),
                str(end), name,
                str(int(score * 100)), strand, '\n'
            ]))
예제 #19
0
import fasta

# Locate the bundled example sequence two directory levels above LOCAL_DIR.
DIST_DIR = abspath(dirname(dirname(LOCAL_DIR)))
print(DIST_DIR)
fasta_filepath = join(DIST_DIR, "examples/data/sequence/dnaACGT.txt")

# The sequence is the second field of the first FASTA record.
records = fasta.parseFasta(fasta_filepath)
seq = records[0][1]

# Two example matrices, one row per nucleotide.
matrix1 = [
    [0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 1, 1, 1, 1, 1, 0, 0, 1],
]
# NOTE(review): the first row has 6 entries while the others have 7 --
# confirm whether a value is missing.
matrix2 = [
    [10, 0, 10, 3, 5, 5],
    [0, 5, 0, 3, 5, 0, 5],
    [0, 1, 0, 3, 0, 5, 0],
    [0, 4, 0, 1, 0, 0, 5],
]

# One-shot search through the module-level convenience function.
results = MOODS.search(seq, [matrix1, matrix2], 0.011)

print("Matrix 1 results: %d" % len(results[0]))
print("Matrix 2 results: %d" % len(results[1]))

# Same search through a reusable MOODSSearch object.
matrices = [matrix1, matrix2]
thresholds = [0.011, 0.011]
bg = MOODS.bg_from_sequence(seq, 0.1)
q = 7
absolute_threshold = False
both_strands = False
ms = MOODS.MOODSSearch(
    matrices, thresholds, bg, q, absolute_threshold, both_strands
)
results = ms.search(seq)

print("New Matrix 1 results: %d" % len(results[0]))
예제 #20
0
def main():
    """Scan every FASTA record with a set of PWMs and write the hits as BED.

    Command line: ``[options] <pwm file> <fasta file>``.

    The PWM file is parsed with ``pySeq.parsing.PWMparser.parse`` and the
    FASTA file with ``Bio.SeqIO``.  One output file named
    ``<--append prefix><matrix name>.bed`` is created per searched matrix.
    Matrices whose size is <= 20 are searched with the MOODS ``'lf'``
    algorithm, larger ones with ``'supera'``.  Each hit becomes one
    tab-separated line: record name, start, end, matrix name,
    ``int(score * 100)`` and strand (start/end/strand come from
    ``strand_adjust``, defined elsewhere in this file).

    Options
    -------
    -t / --thresh : threshold handed to MOODS (converted with ``float``).
    -a / --append : prefix of the output file names (default ``resultsfor``).
    -A / --absolute : treat the threshold as an absolute score.
    -s / --standard_background : use a uniform 0.25 background.
    -M / --specific_Matrix : search only the matrix with this name.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option('-t', '--thresh', action='store', dest='threshold',
                 default=0.0, help='determines threshold')
    # The original help text used a backslash continuation inside the string
    # literal, which embedded the next line's indentation in the message.
    p.add_option('-a', '--append', action='store', dest='name',
                 default='resultsfor',
                 help='appends pwm name to this when creating files')
    p.add_option('-A', '--absolute', action='store_true', dest='A',
                 default=False, help='absolute threshold')
    p.add_option('-s', '--standard_background', action='store_true',
                 dest='stdbg',
                 help='use a uniform background [0.25, 0.25, 0.25, 0.25]')
    p.add_option('-M', '--specific_Matrix', action='store', dest='specific',
                 help='only search the matrix with this name')
    options, args = p.parse_args()
    if len(args) < 2:
        p.error('expected two arguments: <pwm file> <fasta file>')

    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11 and
    # text mode already applies universal newlines on Python 3.
    with open(args[0], 'r') as pwm:
        index, matricies, sizes = pySeq.parsing.PWMparser.parse(pwm)
    with open(args[1], 'r') as fa:
        # Materialise the records so the handle can be closed right away.
        pfa = list(Bio.SeqIO.parse(fa, 'fasta'))

    if options.specific and options.specific not in index.keys():
        p.error('no matrix named %r in %s' % (options.specific, args[0]))

    underorequal20 = []
    over20 = []
    under20names = []
    over20names = []
    fileout = {}
    # bg=False is what the original passed when -s is absent; kept unchanged.
    bgt = [0.25, 0.25, 0.25, 0.25] if options.stdbg else False
    threshold = float(options.threshold)

    try:
        # Construct matrices to search and files to write to.
        for k in index.keys():
            # With -M only the named matrix is searched; without it, all are.
            # (Previously the `else` was attached to the inner `if`, so no -M
            # meant "search nothing" and -M meant "search everything".)
            if options.specific and k != options.specific:
                continue
            fileout[k] = open(options.name + k + '.bed', 'w')
            if sizes[k] <= 20:
                underorequal20.append(matricies[k])
                under20names.append(k)
            else:
                over20.append(matricies[k])
                over20names.append(k)

        # (matrices, their names, MOODS algorithm) for the two size classes.
        searches = (
            (underorequal20, under20names, 'lf'),
            (over20, over20names, 'supera'),
        )

        for chrom in pfa:
            print(chrom.name)
            # Should we sort the results as all downstream applications
            # require a sort first
            for matrices, names, algorithm in searches:
                if not matrices:
                    # Nothing in this size class, so there is nothing to write.
                    continue
                res = MOODS.search(chrom.seq, matrices, threshold,
                                   absolute_threshold=options.A,
                                   both_strands=True, bg=bgt,
                                   algorithm=algorithm)
                for name, hits in zip(names, res):
                    out = fileout[name]
                    for position, score in hits:
                        start, end, strand = strand_adjust(position,
                                                           sizes[name])
                        # Add option to round the score values.  Defaulting
                        # to int atm since bedToBigBed only accepts integer
                        # values....
                        # NOTE(review): joining '\n' as a field leaves a
                        # trailing tab before the newline; kept as-is so the
                        # output stays byte-identical for downstream tools.
                        out.write('\t'.join([chrom.name, str(start),
                                             str(end), name,
                                             str(int(score * 100)), strand,
                                             '\n']))
    finally:
        # Always flush and release the BED outputs, even if a search fails.
        for handle in fileout.values():
            handle.close()
예제 #21
0
def find_motif_disruptions(
    position,
    ref,
    alt,
    genome_fasta,
    matrices,
):
    """
    Determine whether there is a difference between the ref and alt
    alleles for TF binding. Requires samtools in your path.

    The reference sequence around the variant is fetched with
    ``samtools faidx``, the alt sequence is built by substituting ``alt``
    for ``ref``, both are scanned on both strands with MOODS, and for each
    motif the best-scoring match overlapping the variant is compared.

    Parameters
    ----------
    position : str
        Zero based genomic coordinates of the reference allele of the form
        chrom:start-end (chr5:100-101 for a SNV for instance). The value end -
        start should equal the length of the ref allele.

    ref : str
        Reference allele. This should match the reference sequence at "position"
        in genome_fasta.

    alt : str
        Alternate allele.

    genome_fasta : str
        Path to genome fasta file. This file should be indexed.

    matrices : dict
        Dict whose keys are motif names and whose values are pandas data frames
        or numpy arrays containing PWMs with columns ACGT (one row per motif
        position, so ``shape[0]`` is the motif length).

    Returns
    -------
    out : pandas.DataFrame
        Pandas data frame, indexed by motif name, with motifs whose best
        matches that overlapped the variant differed between the reference and
        alternate sequences. Columns are ref_score, ref_strand, alt_score,
        alt_strand and score_diff (ref_score - alt_score). A score of zero and
        a strand of '' indicates that there was not a match for the motif on
        the given allele.

    Raises
    ------
    subprocess.CalledProcessError
        If samtools exits with a non-zero status.
    ValueError
        If samtools returns no sequence for the requested region.

    """
    import subprocess
    import MOODS

    max_motif_length = max(x.shape[0] for x in matrices.values())
    # Number of flanking bases needed on each side so that the longest motif
    # can overlap the variant by at least one base.
    flank = max_motif_length - 1
    chrom, coords = position.split(':')
    start, end = [int(x) for x in coords.split('-')]
    # NOTE(review): samtools faidx regions are documented as 1-based while
    # `position` is documented as zero-based; the original arithmetic is kept
    # unchanged here -- confirm there is no off-by-one in the fetched window.
    region = '{}:{}-{}'.format(chrom, start - flank, end + flank)

    # Argument list + no shell: a path or region containing shell
    # metacharacters can no longer be interpreted as a command.
    # universal_newlines=True yields text (not bytes) on Python 3 so the
    # sequence can be concatenated with `alt` below.
    fasta_text = subprocess.check_output(
        ['samtools', 'faidx', genome_fasta, region],
        universal_newlines=True)
    fasta_lines = fasta_text.strip().splitlines()
    # The first line is the FASTA header; the sequence may be wrapped over
    # several lines, so join all of them rather than taking only the first.
    ref_seq = ''.join(line.strip() for line in fasta_lines[1:])
    if not ref_seq:
        raise ValueError(
            'samtools faidx returned no sequence for {}'.format(region))
    alt_seq = ref_seq[:flank] + alt + ref_seq[flank + len(ref):]

    # Variant span (half-open) inside the fetched ref and alt windows.
    ref_variant_start = flank
    ref_variant_end = flank + len(ref)
    alt_variant_start = flank
    alt_variant_end = flank + len(alt)

    names = list(matrices.keys())
    # MOODS wants nested lists with one row per nucleotide, hence the
    # transpose. `.values` unwraps a DataFrame; plain numpy arrays (also
    # allowed by the documented contract) have no such attribute.
    ms = [getattr(matrices[n], 'values', matrices[n]).T.tolist()
          for n in names]
    ref_res = MOODS.search(ref_seq, ms, 0.001, both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    ref_res = dict(zip(names, ref_res))
    alt_res = MOODS.search(alt_seq, ms, 0.001, both_strands=True,
                           bg=[0.25, 0.25, 0.25, 0.25])
    alt_res = dict(zip(names, alt_res))

    def _best_hit(hits):
        # Return (score, strand) of the top-scoring hit, or (0, '') if none.
        if len(hits) > 0:
            # max() returns the first of equally-scored hits, exactly like
            # the stable descending sort it replaces.
            pos, score = max(hits, key=lambda hit: hit[1])
            # NOTE(review): a hit at position 0 is reported as '-'; presumably
            # reverse-strand hits carry negative positions -- confirm whether
            # this should be `>= 0`.
            return score, ('+' if pos > 0 else '-')
        return 0, ''

    # First we'll remove any motif matches that don't overlap the variant of
    # interest (and thus can't be affected by the variant and will be the same
    # for ref and alt). Then we'll get the best match for each motif for ref
    # and alt.
    rows = []
    for motif in names:
        motif_length = matrices[motif].shape[0]
        ref_hits = _filter_variant_motif_res(
            ref_res[motif], ref_variant_start, ref_variant_end,
            motif_length, ref_seq)
        alt_hits = _filter_variant_motif_res(
            alt_res[motif], alt_variant_start, alt_variant_end,
            motif_length, alt_seq)

        ref_score, ref_strand = _best_hit(ref_hits)
        alt_score, alt_strand = _best_hit(alt_hits)
        if ref_score > 0 or alt_score > 0:
            diff = ref_score - alt_score
            rows.append([motif, ref_score, ref_strand, alt_score, alt_strand,
                         diff])

    out = pd.DataFrame(rows, columns=['motif', 'ref_score', 'ref_strand',
                                      'alt_score', 'alt_strand', 'score_diff'])
    out.index = out.motif
    out = out.drop('motif', axis=1)
    # Only motifs whose best score actually changed are of interest.
    out = out[out.score_diff != 0]
    return out