Esempio n. 1
0
def findEnrichedAnnotationFreqs_motifLs(fg_ls, bg_ls, annotations, motifs, fasta_file, permutations):
    """  Permutation test for motif enrichment in the foreground
         relative to the background.

         The foreground is scored once.  Then `permutations`
         background scores are computed (each call is given
         len(fg_ls)), and for every motif we count how many of
         those background scores are >= the foreground score.

    @param fg_ls: {} of foreground genes
    @param bg_ls: {} of background genes
    @param annotations: protein 2 annotation
    @param motifs: {} of motifs you are checking for enrichment
    @param fasta_file: path handed to utils_fasta.loadFASTA
    @param permutations: number of background permutations to run
    @return: the foreground score {}, with each motif's entry replaced by
             [foreground score, |fg|, # of times bg is as good as fg]
    """
    sequences = utils_fasta.loadFASTA(fasta_file)
    fg_scores = scoreProteinLsFreqs_motifLs(fg_ls, annotations, motifs, sequences)
    fg_size = len(fg_ls)

    # Per motif: number of permutations whose background score matched
    # or beat the foreground score.
    as_good = dict((motif, 0) for motif in motifs.keys())
    for _ in xrange(permutations):
        bg_scores = scoreBckgndFreqs_motifLs(fg_size, bg_ls,
                                             annotations, motifs, sequences)
        for motif in motifs.keys():
            if bg_scores[motif] >= fg_scores[motif]:
                as_good[motif] += 1

    # Widen each foreground entry in place into the documented triple.
    for motif in motifs.keys():
        fg_scores[motif] = [fg_scores[motif], fg_size, as_good[motif]]
    return fg_scores
Esempio n. 2
0
        else:
            tempSeq = ''
            for s in seq:
                tempSeq = tempSeq + s
            offset = 0
            while match:
                printResult(protein, elm, match, tempSeq, offset)
                tempSeq = tempSeq[int(match.start())+1:]
                offset += int( match.start() ) + 1
                match = p.search(tempSeq)

# Command line: <pattern file> <fasta file>.  The argument names and
# example values are handed to utils_scripting.checkStart together with
# sys.argv (presumably for validation / usage output -- confirm in
# utils_scripting).
req_args = ['pattern file',
            'fasta file']
examples = ['../../Data/ELM/elm2pattern',
            '../../Data/FASTA/Human/hprd.intr.fasta']
utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True)

# Build elm -> [compiled regex, raw pattern text].  Each non-blank line of
# the pattern file must hold exactly two whitespace-separated tokens.
input_pattern_file = sys.argv[1]
elm2pattern = {}
with open(input_pattern_file) as f:
    # 'with' closes the file even if a line fails to parse or compile.
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # split() with no argument already discards surrounding whitespace,
        # so no extra per-token strip is needed.
        elm, pattern = line.split()
        elm2pattern[elm] = [re.compile(pattern), pattern]

# Run every pattern against every protein sequence in the FASTA file.
fasta = utils_fasta.loadFASTA(sys.argv[2])
for protein in fasta:
    matchSeq(protein, fasta[protein], elm2pattern)


Esempio n. 3
0
import markov_chain, utils_fasta

# Feed every entry of the FASTA file into a fresh MarkovChain.
chain = markov_chain.MarkovChain()
f = utils_fasta.loadFASTA('/home/perry/bioperry/Projects/Thesis/Data/FASTA/Human/hprd_new.intr.fasta')
for name in f:
    sequence = f[name]
    chain.add(sequence)

# Emit ten random outputs, each joined into a single string.
for _ in range(10):
    sample = chain.random_output()
    print("".join(sample))
Esempio n. 4
0
# Positional command-line arguments.  sys.argv[1] (the input FASTA) is
# read further down, where the file is loaded.
# vp_suffix is appended to any protein name that does not already end in it.
vp_suffix = sys.argv[2]
# NOTE(review): the three names below look like output paths; their use is
# not visible in this excerpt -- confirm against the rest of the script.
protein_ls_output_file = sys.argv[3]
new2old_file = sys.argv[4]
old2new_file = sys.argv[5]

def clean(seq):
    """ Drop every '-' and '*' from seq, then trim surrounding whitespace. """

    kept = "".join(residue for residue in seq if residue not in "-*")
    return kept.strip()

# Load the input FASTA named by the first command-line argument.
old_fasta = utils_fasta.loadFASTA(sys.argv[1])

# Bookkeeping tables filled in by the per-protein loop that follows; each
# is keyed by the (possibly suffixed) output protein name.
protein_ls, new2old, old2new = {}, {}, {}
for protein in old_fasta.keys():
    new_seq = clean(old_fasta[protein])
    name = protein
    if name.split('.')[-1] != vp_suffix:
        name = name + '.' + vp_suffix
    utils_fasta.prettyPrint(name, new_seq)
    protein_ls[name] = True
    new2old[name] = {}
    old2new[name] = {}
    new_index = 0
    for old_index in xrange(len(old_fasta[protein])):
        old_residue = old_fasta[protein][old_index]
        if old_residue  == '-' or old_residue == '*':
            pass
        else:
Esempio n. 5
0
import utils_fasta, sys, utils, global_settings
from collections import defaultdict

# aa accumulates the combined length of every entry in every genome's
# FASTA file.
aa = 0
for genome in global_settings.GENOMES:
    sequences = utils_fasta.loadFASTA('data/' + genome + '.fa')
    aa += sum(len(sequences[name]) for name in sequences)

# Sum the per-genome counts for each ELM across all result files.
elm2count = defaultdict(utils.init_zero)
for genome in global_settings.GENOMES:
    with open('results/elmdict_' + genome + '.txt') as handle:
        for row in handle:
            # Exactly four tab-separated fields per row; only the ELM name
            # and its count are used here.
            elm, _seq, count_st, _freq = row.strip().split('\t')
            elm2count[elm] += int(count_st)

# Write one "<elm>\t<count / aa>" line per ELM.
with open('results/all_elm_aa_freq', 'w') as out:
    for elm, count in elm2count.items():
        v = float(count) / float(aa)
        out.write('\t'.join([elm, str(v)]) + '\n')