Ejemplo n.º 1
0
def get_cons_elms(dir, hosts, years, strains, per, d, out_file, suffix):
    """Write the ELMs shared by every qualifying file of each protein.

    For every host/strain/year combination, ``count_cons`` is called with
    the path ``<dir>/<host>.<strain>.<year>.elms`` and the companion path
    ``<dir>/<host>.<strain>.<year><suffix>.<per>``.  ``count_cons`` is
    expected to register the files it accepts in ``use_files`` and, per
    protein, in ``protein_counts_pass`` (NOTE(review): the "at least 50
    sequences" requirement from the original docstring is presumably
    enforced inside ``count_cons``; it is not checked here).

    Each registered file is then loaded with
    ``utils_motif.protein2annotation(path, d)``.  An ELM is written for a
    protein only when it occurs in every file registered for that protein.

    Args:
        dir: directory holding the input files.
        hosts, years, strains: iterables whose combinations name the files.
        per: string appended (after a dot) to the companion file name.
        d: tool-selection dict forwarded to ``protein2annotation``.
        out_file: path of the tab-separated ``protein<TAB>elm`` output.
        suffix: string inserted between the year and ``.<per>``.
    """
    d1 = {'ELM': True}
    d2 = d
    use_files = {}
    protein_counts_pass = defaultdict(dict)
    for host in hosts:
        for year in years:
            for strain in strains:
                base = '.'.join((host, strain, str(year)))
                f = os.path.join(dir, base + '.elms')
                new_f = os.path.join(dir, base + suffix + '.' + per)
                try:
                    count_cons(use_files, protein_counts_pass, f, d1, new_f)
                except Exception:
                    # Best effort: combinations whose files are missing or
                    # unusable are skipped.  Unlike a bare ``except``, this
                    # lets KeyboardInterrupt / SystemExit propagate.
                    continue

    # Replace each placeholder value with the parsed annotations.  Only
    # existing keys are reassigned, so iterating the dict here is safe.
    for f in use_files:
        use_files[f] = utils_motif.protein2annotation(f, d2)

    with open(out_file, 'w') as afile:
        for protein in protein_counts_pass:
            passing_files = protein_counts_pass[protein]
            # Number of registered files in which each ELM appears.
            elm_counts_local = defaultdict(init_zero)
            for f in passing_files:
                for elm in use_files[f][protein]:
                    elm_counts_local[elm] += 1
            for elm in elm_counts_local:
                # Keep the ELM only if it was seen in every file.
                if len(passing_files) == elm_counts_local[elm]:
                    afile.write(protein + '\t' + elm + '\n')
Ejemplo n.º 2
0
def predict_die(useELMs, domain_tools, netFile,
                annotation_file='/home/perry/Projects/Human_Virus/Data/human.annotations'):
    """Collect, per ELM, the network neighbours linked through that ELM.

    ``proteins`` is filled by ``getProteinsForELMs`` and
    ``expandProteinsForELMs`` and is used as ``proteins[g1][elm]``.  For
    every such ``g1`` present in the network and every ELM listed for it,
    each other protein ``g2`` that is both a neighbour of ``g1`` in the
    network and annotated with that ELM is recorded.

    Args:
        useELMs: forwarded to the two protein-collecting helpers.
        domain_tools: forwarded to ``expandProteinsForELMs``.
        netFile: path handed to ``utils_graph.getEdges``.
        annotation_file: annotation file loaded with the ``ELM`` tool;
            defaults to the path that was previously hard-coded.

    Returns:
        ``[h1, h2]`` where ``h1[elm][g1]`` and ``h2[elm][g2]`` are ``True``
        for every matching ``(g1, g2)`` pair.
    """
    h1 = {}
    h2 = {}
    net = utils_graph.getEdges(netFile)
    proteins = {}
    tool_d = {'ELM': True}
    protein2elm = utils_motif.protein2annotation(annotation_file, tool_d)
    getProteinsForELMs(useELMs, proteins)
    expandProteinsForELMs(useELMs, proteins, domain_tools)
    for g1 in proteins:
        if g1 not in net:
            continue
        for elm in proteins[g1]:
            for g2 in protein2elm:
                # ``in`` replaces dict.has_key, which Python 3 removed.
                if g2 in net[g1] and elm in protein2elm[g2] and g1 != g2:
                    h1.setdefault(elm, {})[g1] = True
                    h2.setdefault(elm, {})[g2] = True
    return [h1, h2]
Ejemplo n.º 3
0
def main():
    """Report motifs found in at least a given percentage of a virus protein's sequences.

    Command line: ``<virus annotation file> <annotation tool> <% cutoff>``.
    The annotation file is loaded for the chosen tool and ``getCounts``
    turns it into per-protein motif counts and per-protein totals.  For
    every protein/motif pair the percentage ``100 * count / total`` is
    written to stderr; pairs at or above the cutoff are also written to
    stdout as ``vp<TAB>0<TAB>0<TAB>motif<TAB>seq<TAB>tool``.
    """
    req_args = ['virus annotation file',
                'annotation tool',
                '% MSA cutoff']
    examples = ['../../Data/ProfileScan/hiv.prosite',
                'ProfileScan',
                '90']
    utils_scripting.checkStart(sys.argv, req_args, examples, len(req_args), True)

    annotation_file = sys.argv[1]
    tool = sys.argv[2]
    conserved_cutoff = float(sys.argv[3])

    protein2annotation = utils_motif.protein2annotation(annotation_file,
                                                        {tool: True})
    virus2annotation, virus2proteinCount = getCounts(protein2annotation)

    for vp in virus2annotation:
        for motif in virus2annotation[vp]:
            percent = (float(100) * float(virus2annotation[vp][motif]) /
                       float(virus2proteinCount[vp]))
            if percent >= conserved_cutoff:
                # sys.stdout.write emits the same bytes as the former
                # Python 2 print statement and also parses on Python 3.
                sys.stdout.write(vp + '\t0\t0\t' + motif + '\tseq\t' + tool + '\n')
            sys.stderr.write(vp + '\t' + motif + '\t' + str(percent) + '\n')
Ejemplo n.º 4
0
# del freq_elms['LIG_PDZ_3']
# del freq_elms['MOD_CK1_1']
# del freq_elms['MOD_CK2_1']
# del freq_elms['MOD_GSK3_1']
# elm2freq[elm][s] holds the value stored for `elm` in aa_freqs[s][0],
# or 0.0 when that table has no entry for the ELM.
elm2freq = {}
for elm in freq_elms:
    elm2freq[elm] = {}
    for s in aa_freqs:
        elm2freq[elm][s] = (aa_freqs[s][0][elm] if elm in aa_freqs[s][0]
                            else float(0))

# Second command-line argument: the suffix of the results files to load.
cut = sys.argv[2]
d = {'ELM': True}

# Swine annotations.
swine_H1N1_elms = utils_motif.protein2annotation(
    'results/swine.H1N1.elms.' + cut, d)
swine_H3N2_elms = utils_motif.protein2annotation(
    'results/swine.H3N2.elms.' + cut, d)
swine = [swine_H1N1_elms, swine_H3N2_elms]

# Human annotations.
human_H1N1_elms = utils_motif.protein2annotation(
    'results/human.H1N1.elms.' + cut, d)
human_H3N2_elms = utils_motif.protein2annotation(
    'results/human.H3N2.elms.' + cut, d)
human_H5N1_elms = utils_motif.protein2annotation(
    'results/human.H5N1.elms.' + cut, d)
human = [human_H1N1_elms, human_H3N2_elms, human_H5N1_elms]

# Chicken annotations.
chicken_H5N1_elms = utils_motif.protein2annotation(
    'results/chicken.H5N1.elms.' + cut, d)
chicken_H9N2_elms = utils_motif.protein2annotation(
    'results/chicken.H9N2.elms.' + cut, d)
chicken = [chicken_H5N1_elms, chicken_H9N2_elms]

# Duck annotations.
duck_H5N1_elms = utils_motif.protein2annotation(
    'results/duck.H5N1.elms.' + cut, d)
duck_H9N2_elms = utils_motif.protein2annotation(
    'results/duck.H9N2.elms.' + cut, d)
duck = [duck_H5N1_elms, duck_H9N2_elms]
    for seq in seq_counts:
        if float(seq_counts[seq]) / protein_count > float(0.9):
            cons.append(seq)
        else:
            nonCons.append(seq)
    return [cons, nonCons]


# Positional command-line arguments.
host_file = sys.argv[1]      # parsed with get_freq below
elm_file = sys.argv[2]       # ELM annotations of the primary data set
cons_file = sys.argv[3]      # ELM annotations passed as the second argument to getProtein2elm2seq
ofile = sys.argv[4]          # not used in the visible part of the script
cmp_file = sys.argv[5]       # ELM annotations of the comparison data set
cmp_host_file = sys.argv[6]  # parsed with get_freq below

# Load the three annotation files, restricted to the 'ELM' tool.
elms = utils_motif.protein2annotation(elm_file, {"ELM": True})
cmp_elms = utils_motif.protein2annotation(cmp_file, {"ELM": True})
cons_elms = utils_motif.protein2annotation(cons_file, {"ELM": True})
host_freqs = get_freq(host_file)
cmp_freqs = get_freq(cmp_host_file)
# Both data sets are processed against the same cons_elms annotations.
(elm_counts, protein_counts) = getProtein2elm2seq(elms, cons_elms)
(cmp_counts, cmp_protein_counts) = getProtein2elm2seq(cmp_elms, cons_elms)

# Accumulators; how they are filled is not visible in this excerpt.
lines = ""
pos = 0
neg = 0
z = 0
# Only proteins present in both data sets are considered.
for protein in elm_counts:
    if protein in cmp_counts:
        # Convert to float so later divisions are not integer divisions
        # under Python 2.
        protein_count = float(protein_counts[protein])
        cmp_protein_count = float(cmp_protein_counts[protein])
Ejemplo n.º 6
0
    for elm in e:
        freq_elms[elm] = True

# elm2freq[elm][s] holds the value stored for `elm` in aa_freqs[s][0];
# a tiny non-zero placeholder (1e-22) is used when the ELM is absent.
elm2freq = {}
for elm in freq_elms:
    elm2freq[elm] = {}
    for s in aa_freqs:
        elm2freq[elm][s] = (aa_freqs[s][0][elm] if elm in aa_freqs[s][0]
                            else float(0.0000000000000000000001))

# First command-line argument: the suffix of the results files to load.
cut = sys.argv[1]

d = {'ELM': True}

# Swine annotations.
swine_H1N1_elms = utils_motif.protein2annotation(
    'results/swine.H1N1.elms.' + cut, d)
swine_H3N2_elms = utils_motif.protein2annotation(
    'results/swine.H3N2.elms.' + cut, d)
swine = [swine_H1N1_elms, swine_H3N2_elms]

# Human annotations.
human_H1N1_elms = utils_motif.protein2annotation(
    'results/human.H1N1.elms.' + cut, d)
human_H3N2_elms = utils_motif.protein2annotation(
    'results/human.H3N2.elms.' + cut, d)
human_H5N1_elms = utils_motif.protein2annotation(
    'results/human.H5N1.elms.' + cut, d)
human = [human_H1N1_elms, human_H3N2_elms, human_H5N1_elms]

# Horse annotations (the results file is named "equine").
horse_H3N8_elms = utils_motif.protein2annotation(
    'results/equine.H3N8.elms.' + cut, d)
horse = [horse_H3N8_elms]

# Chicken annotations.
chicken_H5N1_elms = utils_motif.protein2annotation(
    'results/chicken.H5N1.elms.' + cut, d)
chicken_H9N2_elms = utils_motif.protein2annotation(
    'results/chicken.H9N2.elms.' + cut, d)
chicken = [chicken_H5N1_elms, chicken_H9N2_elms]
Ejemplo n.º 7
0
""" I need to add HIV and HCV to the project,
    but I must first convert the ELM hits to
    the frequencies used for this project.
"""
import utils_motif, sys

def printELMusage(elm, seq2count):
    """Print one tab-separated usage line per sequence of an ELM.

    Each line holds the ELM name, the sequence, its count, and the count
    divided by the sum of all counts (formatted with ten decimals).

    Args:
        elm: ELM name printed in the first column.
        seq2count: dict mapping a sequence to its number of occurrences.
    """
    total = float(sum(seq2count.values()))
    for seq, count in seq2count.items():
        print('%s\t%s\t%d\t%.10f' % (elm, seq, count, count / total))

# Load the ELM annotations named by the first command-line argument.
protein2elm = utils_motif.protein2annotation(sys.argv[1], {'ELM': True})

# elm2seq2count[elm][seq] = number of hits of `elm` whose matched
# sequence is `seq`, summed over all proteins.
elm2seq2count = {}
for p in protein2elm:
    for elm in protein2elm[p]:
        seq2count = elm2seq2count.setdefault(elm, {})
        # Each hit is a (start, stop, sequence) triple; only the sequence
        # is tallied.
        for st, stp, seq in protein2elm[p][elm]:
            seq2count[seq] = seq2count.get(seq, 0) + 1

for elm in elm2seq2count:
    printELMusage(elm, elm2seq2count[elm])
Ejemplo n.º 8
0
""" Find ELMs that are not conserved by chance.
    Enter the subtype & the cutoff for the number of trials in which
    an ELM for a protein can be found by chance.
"""
import sys, utils_motif, utils_graph
import utils
from collections import defaultdict
# Positional command-line arguments: the ELM results file name and an
# integer cutoff (the cutoff is not used in the visible part of the script).
elm_file = sys.argv[1]
cutoff = int(sys.argv[2])

# Annotations of the real data.
real = utils_motif.protein2annotation('results/' + elm_file,
                                      {'ELM': True})

# protein2elms[protein][elm] = number of the ten random_seq/<x>/ runs in
# which `elm` was annotated on `protein`.
protein2elms = {}
for x in range(10):  # range replaces the Python-2-only xrange
    protein2annotation = utils_motif.protein2annotation('random_seq/'
                                                        + str(x) + '/'
                                                        + elm_file,
                                                        {'ELM': True})
    for protein in protein2annotation:
        for elm in protein2annotation[protein]:
            # The protein entry is created only once it has an ELM.
            protein2elms.setdefault(protein, {})
            protein2elms[protein][elm] = protein2elms[protein].get(elm, 0) + 1

for protein in real:
    if protein in protein2elms:
        for elm in real[protein]:
            if elm in protein2elms[protein]:
Ejemplo n.º 9
0
                    if not protein in protein2elm2seq:
                        protein2elm2seq[protein] = {}
                    if not elm in protein2elm2seq[protein]:
                        protein2elm2seq[protein][elm] = {}
                    for [st, stp, seq] in elms[protein_id][elm]:
                        if not seq in protein2elm2seq[protein][elm]:
                            protein2elm2seq[protein][elm][seq] = 0
                        protein2elm2seq[protein][elm][seq] += 1
    return (protein2elm2seq, proteinCounts)

# Positional command-line arguments: host frequency file, ELM annotation
# file, second ELM annotation file (cons_file), and an output path.
host_file, elm_file, cons_file, ofile = (sys.argv[1], sys.argv[2],
                                         sys.argv[3], sys.argv[4])

# Load both annotation files, restricted to the 'ELM' tool.
elms = utils_motif.protein2annotation(elm_file, {'ELM': True})
cons_elms = utils_motif.protein2annotation(cons_file, {'ELM': True})
host_freqs = get_freq(host_file)
elm_counts, protein_counts = getProtein2elm2seq(elms, cons_elms)

lines = ''
for protein in elm_counts:
    protein_count = float(protein_counts[protein])
    for elm in elm_counts[protein]:
        if elm in host_freqs:
            virus_freqs = []
            non_virus_freqs = []
            found_seqs = {}
            for seq in elm_counts[protein][elm]:
                if float(elm_counts[protein][elm][seq])/protein_count > float(.9):
Ejemplo n.º 10
0
import utils_motif, utils_graph

# Node set read from the file named 'use_elms'.
use_elms = utils_graph.getNodes('use_elms')

# H1N1 ELM annotations for human and swine; the '.90' files are loaded
# into the *_conserved variables.
human = utils_motif.protein2annotation(
    'human.H1N1.elms', {'ELM': True})
human_conserved = utils_motif.protein2annotation(
    'human.H1N1.elms.90', {'ELM': True})
swine = utils_motif.protein2annotation(
    'swine.H1N1.elms', {'ELM': True})
swine_conserved = utils_motif.protein2annotation(
    'swine.H1N1.elms.90', {'ELM': True})

def get_entropy(afile):
    """Load per-ELM entropy values from a two-column, tab-separated file.

    Args:
        afile: path of a text file whose non-blank lines consist of an ELM
            name and an entropy value separated by a single tab.

    Returns:
        dict mapping each ELM name to its entropy as a float.  If an ELM
        appears on several lines, the last value wins.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields, or the second field is not a number.
    """
    entropy = {}
    with open(afile) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            elm, entropy_st = stripped.split('\t')
            entropy[elm] = float(entropy_st)
    return entropy

def get_best_seq(seqs):
    ls = []
    for seq in seqs:
        ls.append([seqs[seq],seq])
    ls.sort()
    #if len(ls) > 1:
    #    print ls[0], ls[1]
    
Ejemplo n.º 11
0
import utils_motif, sys

# Positional command-line arguments: the conserved-ELM annotation file and
# the full ELM annotation file.
conserved_file = sys.argv[1]
elm_file = sys.argv[2]
conserved = utils_motif.protein2annotation(conserved_file, {'ELM': True})
elms_pre = utils_motif.protein2annotation(elm_file, {'ELM': True})

# Regroup the hits under the last dot-separated field of each protein
# identifier: elms[vp][elm] is the concatenated list of hits.
elms = {}
for protein in elms_pre:
    vp = protein.split('.')[-1]
    elms.setdefault(vp, {})
    for elm in elms_pre[protein]:
        elms[vp].setdefault(elm, []).extend(elms_pre[protein][elm])

# elm2seq2count[elm][seq] = number of hits with sequence `seq`, counted
# only for the protein/ELM pairs listed in `conserved`.
elm2seq2count = {}
for vp in conserved:
    for elm in conserved[vp]:
        for st, stp, seq in elms[vp][elm]:
            # The ELM entry is created only once a hit is seen.
            elm2seq2count.setdefault(elm, {})
            elm2seq2count[elm][seq] = elm2seq2count[elm].get(seq, 0) + 1
for elm in elm2seq2count:
    total = 0
    for seq in elm2seq2count[elm]:
Ejemplo n.º 12
0
import utils_motif, sys

# Positional command-line arguments.  `species` is read but not used in
# the visible part of the script.
flu, species, strain = sys.argv[1], sys.argv[2], sys.argv[3]

# Human annotations for the requested strain.
human = utils_motif.protein2annotation(
    'human.' + strain + '.elms', {'ELM': True})
human_conserved = utils_motif.protein2annotation(
    'human.' + strain + '.elms.90', {'ELM': True})

# Despite the variable names, these files are selected by the `flu`
# argument rather than being fixed to swine.
swine = utils_motif.protein2annotation(
    flu + '.' + strain + '.elms', {'ELM': True})
swine_conserved = utils_motif.protein2annotation(
    flu + '.' + strain + '.elms.90', {'ELM': True})

def get_freq(afile):
    """Read a four-column, tab-separated frequency table.

    Every line must hold an ELM name, a sequence, a count and a frequency.
    The count column is read but ignored.

    Args:
        afile: path of the table to read.

    Returns:
        Nested dict such that ``result[elm][seq]`` is the frequency as a
        float.
    """
    freq = {}
    with open(afile) as handle:
        for row in handle:
            elm, seq, _num, freq_st = row.strip().split('\t')
            freq.setdefault(elm, {})[seq] = float(freq_st)
    return freq

def get_best_seq(seqs):
    ls = []
    for seq in seqs:
        ls.append([seqs[seq],seq])