def findEnrichedAnnotationFreqs_motifLs(fg_ls, bg_ls, annotations, motifs, fasta_file, permutations):
    """ Permutation test for motif enrichment in the foreground
    relative to the background.

    The foreground is scored once with scoreProteinLsFreqs_motifLs.
    The background is then scored `permutations` times with
    scoreBckgndFreqs_motifLs (each call is given len(fg_ls)); for
    every motif we count the rounds in which the background score
    is >= the foreground score.

    @param fg_ls: {} of foreground genes
    @param bg_ls: {} of background genes
    @param annotations: protein 2 annotation
    @param motifs: {} of motifs you are checking for enrichment
    @param fasta_file: file handed to utils_fasta.loadFASTA
    @param permutations: number of background rounds to run
    @return: the foreground score {}, where each motif's entry has
             been replaced by
             [foreground score, |fg|, # of rounds with bg >= fg]
    """
    fasta = utils_fasta.loadFASTA(fasta_file)
    scores = scoreProteinLsFreqs_motifLs(fg_ls, annotations, motifs, fasta)
    fg_size = len(fg_ls)

    # One counter per motif, all starting at zero.
    better_counts = dict.fromkeys(motifs, 0)

    for _ in xrange(permutations):
        bck_scores = scoreBckgndFreqs_motifLs(fg_size, bg_ls, annotations, motifs, fasta)
        for motif in motifs:
            if bck_scores[motif] >= scores[motif]:
                better_counts[motif] += 1

    # Rewrite the score dict in place so every entry becomes the triple.
    for motif in motifs:
        scores[motif] = [scores[motif], fg_size, better_counts[motif]]
    return scores
# NOTE(review): this line opens inside a definition whose header lies
# outside the visible source (the script below calls `matchSeq`, which is
# presumably that definition -- confirm). `p`, `elm` and the initial value
# of `match` are bound in the part that is not visible here.
#
# Visible tail of that definition:
#   * the `else:` branch concatenates the items of `seq` into the string
#     `tempSeq`;
#   * while `match` is truthy, printResult(protein, elm, match, tempSeq,
#     offset) is called, `tempSeq` is cut so that it starts one character
#     past the start of the match, `offset` grows by the number of
#     characters dropped, and `p.search` runs again on the shortened
#     string. Restarting one character past the match *start* (not past
#     its end) means overlapping matches are visited as well.
#     `offset` presumably lets printResult report positions relative to
#     the full sequence -- confirm against printResult.
#
# Script section:
#   * utils_scripting.checkStart receives sys.argv together with the names
#     and example values of the two required arguments (pattern file and
#     FASTA file); what it does with them is defined outside this view.
#   * Every line of the pattern file (sys.argv[1]) has to split on
#     whitespace into exactly two fields, the ELM name and its regular
#     expression; any other field count makes the unpacking raise
#     ValueError. elm2pattern maps the name to
#     [compiled regex, pattern text].
#   * `f.xreadlines()` and `string.strip` are Python 2 only.
#   * Each entry of the FASTA data loaded from sys.argv[2] is passed to
#     matchSeq together with elm2pattern.
else: tempSeq = '' for s in seq: tempSeq = tempSeq + s offset = 0 while match: printResult(protein, elm, match, tempSeq, offset) tempSeq = tempSeq[int(match.start())+1:] offset += int( match.start() ) + 1 match = p.search(tempSeq) req_args = ['pattern file', 'fasta file'] examples = ['../../Data/ELM/elm2pattern', '../../Data/FASTA/Human/hprd.intr.fasta'] utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True) input_pattern_file = sys.argv[1] elm2pattern = {} f = open(input_pattern_file) for line in f.xreadlines(): [elm, pattern] = map(string.strip, line.split()) elm2pattern[elm] = [re.compile(pattern), pattern] f.close() fasta = utils_fasta.loadFASTA(sys.argv[2]) for protein in fasta.keys(): matchSeq(protein, fasta[protein], elm2pattern)
import markov_chain, utils_fasta

# Feed every sequence of the FASTA file into a MarkovChain, then print
# ten outputs produced by the chain.
model = markov_chain.MarkovChain()
sequences = utils_fasta.loadFASTA('/home/perry/bioperry/Projects/Thesis/Data/FASTA/Human/hprd_new.intr.fasta')
for protein_id in sequences:
    model.add(sequences[protein_id])

for _ in range(10):
    # random_output() yields pieces that are glued into a single string.
    generated = "".join(model.random_output())
    print(generated)
# NOTE(review): this line is a collapsed script whose final `else:` body
# lies outside the visible source; statement nesting described below is
# inferred from the statement order and should be confirmed.
#
# Command-line arguments read here:
#   sys.argv[1]  FASTA file handed to utils_fasta.loadFASTA
#   sys.argv[2]  vp_suffix, the suffix that sequence names should end with
#   sys.argv[3]  protein_ls_output_file  (not used in the visible part)
#   sys.argv[4]  new2old_file            (not used in the visible part)
#   sys.argv[5]  old2new_file            (not used in the visible part)
#
# clean(seq) deletes every '-' and '*' from seq and strips surrounding
# whitespace. NOTE(review): the `.lstrip()` before `.strip()` is
# redundant, since strip() already removes leading whitespace.
#
# For each entry of the loaded FASTA data:
#   * new_seq is the cleaned sequence;
#   * name is the original key, with '.' + vp_suffix appended unless its
#     last dot-separated component already equals vp_suffix;
#   * utils_fasta.prettyPrint(name, new_seq) is called (presumably emits
#     the cleaned record -- confirm against utils_fasta);
#   * protein_ls[name] is set to True, and new2old[name] / old2new[name]
#     start as empty dicts;
#   * new_index starts at 0 and the original (uncleaned) sequence is
#     walked by index; positions holding '-' or '*' are passed over, and
#     the handling of every other position is in the unseen `else:` body.
vp_suffix = sys.argv[2] protein_ls_output_file = sys.argv[3] new2old_file = sys.argv[4] old2new_file = sys.argv[5] def clean(seq): """ Remove dashes and stars. """ return seq.replace('-', '').replace('*', '').lstrip().strip() protein_ls = {} new2old = {} old2new = {} old_fasta = utils_fasta.loadFASTA(sys.argv[1]) for protein in old_fasta.keys(): new_seq = clean(old_fasta[protein]) name = protein if name.split('.')[-1] != vp_suffix: name = name + '.' + vp_suffix utils_fasta.prettyPrint(name, new_seq) protein_ls[name] = True new2old[name] = {} old2new[name] = {} new_index = 0 for old_index in xrange(len(old_fasta[protein])): old_residue = old_fasta[protein][old_index] if old_residue == '-' or old_residue == '*': pass else:
import utils_fasta, sys, utils, global_settings
from collections import defaultdict

# Summed length of every sequence in every genome's FASTA file.
aa = 0
for genome in global_settings.GENOMES:
    proteome = utils_fasta.loadFASTA('data/' + genome + '.fa')
    aa += sum(len(proteome[protein]) for protein in proteome)

# Per-ELM total of the count column (third tab-separated field) over all
# genomes' result tables.
elm2count = defaultdict(utils.init_zero)
for genome in global_settings.GENOMES:
    with open('results/elmdict_' + genome + '.txt') as elm_table:
        for row in elm_table:
            # Each row must hold exactly four tab-separated fields.
            elm, _seq, count_st, _freq = row.strip().split('\t')
            elm2count[elm] += int(count_st)

# Write one "<elm>\t<count / total length>" line per ELM.
total_length = float(aa)
with open('results/all_elm_aa_freq', 'w') as out:
    for elm, count in elm2count.items():
        ratio = float(count) / total_length
        out.write('\t'.join((elm, str(ratio))) + '\n')