# for elmseq in seq_percents_bird:
#     if elmseq in uniq:
#         print elmseq + '\t' \
#             + str(sum(seq_percents_bird[elmseq])/float(total_bird)) + '\t' \
#             + str(sum(seq_percents_human[elmseq])/float(total_human))

# Load the simplified host ELM frequency tables for human and chicken.
human_host_file = 'working/Jun29/elmdict_H_sapiens.simple'
chicken_host_file = 'working/Jun29/elmdict_Gallus_gallus.simple'
human_host_freqs = get_host_freqs(human_host_file)
chicken_host_freqs = get_host_freqs(chicken_host_file)

# For every ELM sequence in `uniq`, write its chicken host frequency to
# 'working/hi' when the bird-flu value reaches the threshold, and to
# 'working/low' otherwise.  Only the chicken frequency is written; the human
# values are computed but not emitted in this section.
with open('working/hi', 'w') as hi, open('working/low', 'w') as low:
    for elmseq in uniq:
        # Skip entries containing the residue codes X, J or B.
        if 'X' in elmseq or 'J' in elmseq or 'B' in elmseq:
            continue

        # Not used further in this section; the lookup is retained so that
        # behavior (including failure on a missing key) is unchanged.
        per = sp[elmseq]

        # The host tables are keyed as '<field 1>:<simplified field 2>' of
        # the colon-separated elmseq string.
        dels = elmseq.split(':')
        new_seq = utils.mk_sub(dels[2])
        key = dels[1] + ':' + new_seq

        # Host frequencies default to 0 when the key is absent.
        chicken_host = chicken_host_freqs.get(key, 0)
        human_host = human_host_freqs.get(key, 0)

        bird_flu_frac = sum(seq_percents_bird[elmseq]) / float(total_bird)
        human_flu_frac = sum(seq_percents_human[elmseq]) / float(total_human)

        # NOTE(review): a cutoff of 80 suggests these values are percentages
        # rather than 0-1 fractions -- confirm against how seq_percents_bird
        # is populated.
        target = hi if bird_flu_frac >= 80.0 else low
        target.write(str(chicken_host) + '\n')
"""Convert raw sequences to simplified versions for use in scanning
with simple patterns (those made with utils.mk_sub).

Command-line arguments:
    1. path of the input FASTA file
    2. path of the output FASTA file to write
"""
import os
import sys

import global_settings
import utils

in_file, out_file = sys.argv[1], sys.argv[2]

with open(out_file, 'w') as f:
    # Each record keeps its ID line; the sequence is replaced by the
    # result of utils.mk_sub.
    for record_id, raw_seq in utils.fasta_iter(in_file):
        f.write(''.join(('>', record_id, '\n')))
        f.write(''.join((utils.mk_sub(raw_seq), '\n')))
"""There is little overlap between host and flu sequences.

To solve this problem, some residues are substituted with symbols for their
properties (utils.mk_sub), and the per-ELM counts and frequencies are
recomputed over the substituted sequences.

Command-line arguments:
    1. path of a tab-separated file with columns: ELM, sequence, count,
       frequency

Prints tab-separated lines: ELM, substituted sequence, count, frequency.
"""
import sys
from collections import defaultdict

import global_settings
import utils

elmdict_file = sys.argv[1]

# Counts keyed by 'ELM:substituted_sequence', and totals keyed by ELM.
elm2seq2count = defaultdict(utils.init_zero)
elm2count = defaultdict(utils.init_zero)

with open(elmdict_file) as f:
    for line in f:
        elm, seq, count, frq = line.strip().split('\t')
        # Sequences containing 'X' are left out of the counts.
        if 'X' in seq:
            continue
        new_seq = utils.mk_sub(seq)
        elm2seq2count[elm + ':' + new_seq] += int(count)
        # NOTE(review): the per-ELM total is taken to be accumulated only
        # for the sequences that pass the 'X' filter -- confirm against the
        # original indentation.
        elm2count[elm] += int(count)

# Frequency of a substituted sequence = its count / total count of its ELM.
for elm_seq, count in elm2seq2count.items():
    elm, seq = elm_seq.split(':')
    freq = float(count) / float(elm2count[elm])
    print('%s\t%s\t%d\t%.10f' % (elm, seq, count, freq))
"""Substitute residues for properties in flu ELMs file.

Command-line arguments:
    1. path of a tab-separated file with six columns per line; the fifth
       column is the sequence

Prints each kept line with its sequence replaced by utils.mk_sub(sequence).
"""
import sys
from collections import defaultdict

import global_settings
import utils

with open(sys.argv[1]) as f:
    for line in f:
        protein, st, stp, elm, seq, ELM = line.strip().split('\t')
        # Drop lines whose sequence contains any of the codes X, B, Z or J.
        if any(code in seq for code in 'XBZJ'):
            continue
        fields = (protein, st, stp, elm, utils.mk_sub(seq), ELM)
        print('\t'.join(fields))