''' Generate a list of all pairwise comparisons of the exact matches ''' import sys import re from phage import Phage phage = Phage() try: f = sys.argv[1] except: sys.exit("Exact match file, probably phage.kmers.bacteria.rc.txt") bg = phage.completeBacteriaIDs() pg = phage.phageIDs() matches={} for p in pg: matches[p]={} for b in bg: matches[p][b] = 0 with open(f, 'r') as fin: for l in fin: p=l.strip().split("\t") m=re.findall('NC_\d+', l) if len(m) != 2:
'''
Calculate the coverage of each genome pair from the blastx results

Starting with the blastx converted to NC/NC ids, we want to calculate the
coverage at each position in every genome.

We will consider all the genomes with the most number of bases in the
phage as the top genomes
'''

import sys
from phage import Phage

phage = Phage()

# The only command line argument is the converted blast output file.
# Catch just the missing-argument case so that Ctrl-C / SystemExit are
# not swallowed by the handler.
try:
    f = sys.argv[1]
except IndexError:
    sys.exit(sys.argv[0] + " <blast output file converted to NC/NC format. Probably phage.genomes.blastx>")

# count[phage id] starts out as an empty dict for every known phage
lens = phage.phageSequenceLengths()
bctG = set(phage.completeBacteriaIDs())
phgG = set(phage.phageIDs())
count = {p: {} for p in phgG}

sys.stderr.write("Reading " + f + "\n")
'''
Calculate the coverage of each genome pair from the blastx results

Starting with the blastx converted to NC/NC ids, we want to calculate the
coverage at each position in every genome.

We will consider all the genomes with the most number of bases in the
phage as the top genomes
'''

import sys
from phage import Phage

phage = Phage()

# The only command line argument is the converted blast output file.
# Catch just the missing-argument case so that Ctrl-C / SystemExit are
# not swallowed by the handler.
try:
    f = sys.argv[1]
except IndexError:
    sys.exit(sys.argv[0] + " <blast output file converted to NC/NC format. Probably phage.genomes.blastx>")

# count[phage id] starts out as an empty dict for every known phage
lens = phage.phageSequenceLengths()
bctG = set(phage.completeBacteriaIDs())
phgG = set(phage.phageIDs())
count = {p: {} for p in phgG}
'''Figure out which phages we do not have blastn hits for'''

import os
import sys
from phage import Phage
import re

# get a list of all phages
phage = Phage()
phages = phage.phageTaxonomyString()

try:
    blastf = sys.argv[1]
except IndexError:
    sys.exit(sys.argv[0] + " <blast file>")

# IDs of every phage that shows up in the first column of the blast file
found = set()
with open(blastf, 'r') as fin:
    for l in fin:
        p = l.split("\t")
        m = re.findall(r'(NC_\d+)', p[0])
        if not m:
            # a query without an NC id used to abort the run with IndexError
            sys.stderr.write("WARNING: No phage found in " + p[0].strip() + "\n")
            continue
        found.add(m[0])

# report every phage that never appeared as a query
for p in phages:
    if p not in found:
        # parenthesised single argument prints the same under Python 2 and 3
        print("MISSED " + p)
return np.exp(-0.5 * (np.square(x - mu) / sigma_r)) # gaussian r: ~(39.7, 10) new_host = Host( c0=10**5, g_max=0.036, #lit: 0.012 yield_coeff=0.000000000001, half_sat=0.00000125, death_rate=0.001, t_dep=temperature_dependency_new_host, ) original_phage = Phage( c0=1000, adsorption_rate=0.000000000001, burst_size=100, death_rate=0.00272, ) new_phage = Phage( c0=1, adsorption_rate=0.0000001, burst_size=100, death_rate=0.00272, ) s0 = 0.0000025 #stock concentration of nutrient (g/mL)#0.0000025 R_pnn = 1 / 1000 # fraction of new phage in library # define system of differential equations """ def dXa_dt(X, t):
'''
For blastn searches we are going to calculate the percent coverage of
the phage genome and score the longest coverage as the best hit.

It doesn't matter where the hits are on the bacterial genome.

We are going to use a cutoff of 0.001 E value
'''

import sys
import os
import re
from phage import Phage

phage = Phage()

try:
    blastf = sys.argv[1]
except IndexError:
    # the separating space belongs before the placeholder, not inside it
    sys.exit(sys.argv[0] + " <blast file>")

# read the fasta file of phages to get the lengths
lens = phage.phageSequenceLengths()
sys.stderr.write("Found " + str(len(lens)) + " sequences\n")

# one (initially empty) hit table per phage
hits = {i: {} for i in lens}

with open(blastf, 'r') as fin:
    for l in fin:
        p = l.strip().split("\t")
        # column 11 of the tab-separated line is read as the E value
        e = float(p[10])
        if e > 0.001:
            continue
        m = re.findall(r'(NC_\d+)', p[0])
        if not m:
            sys.stderr.write("WARNING: No phage found in " + p[0] + "\n")
            continue
'''
List complete bacteria or phage IDs. Set the bool to true to get bacteria
'''

import sys
from phage import Phage

phage = Phage()

# The first argument is a boolean flag. It arrives as a string, and every
# non-empty string (including "false" and "0") is truthy, so it has to be
# interpreted explicitly. Anything that is not an obvious "false" value
# still selects bacteria, as it did before.
try:
    flag = sys.argv[1]
except IndexError:
    flag = ''
bacteria = flag.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n')

if bacteria:
    d = phage.completeBacteriaIDs()
else:
    d = phage.phageIDs()

for p in d:
    print(p)
def lyse(self):
    """Drop this object from the model and add its burst of phages.

    ``model.things`` is rebuilt as a set without ``self``; afterwards
    ``self._burst_size`` new ``Phage`` objects are added to the model,
    each constructed from the first two components of
    ``self.get_location()``.
    """
    survivors = {thing for thing in model.things if thing != self}
    model.things = survivors

    for _ in range(self._burst_size):
        x = self.get_location()[0]
        y = self.get_location()[1]
        model.add(Phage(x, y))
#!/usr/bin/python
'''Read the kmer counts from a series of files and plot a PCA'''

## start with reading the files and adding the kmer counts to the organism names

import sys
sys.path.append('/home3/redwards/bioinformatics/phage_host')
from phage import Phage
import re
import os

phage = Phage()

## we only choose those hosts with 5 phages that infect them.
#host = phage.phageHost()
host = phage.phageWithNHosts(5)

# NOTE: dir, file and id shadow builtins; the names are kept unchanged
# because code further down the script may refer to them.
try:
    dir = sys.argv[1]
    outf = sys.argv[2]
except IndexError:
    sys.exit(sys.argv[0] + " <directory of kmer counts> <file to write output table to>")

count = {}
allkmers = {}
organismId = {}

for file in os.listdir(dir):
    match = re.findall(r'NC_\d+', file)
    if not match:
        # any directory entry without an NC id used to crash on match[0]
        sys.stderr.write("Skipping " + file + ": no NC id in the file name\n")
        continue
    id = match[0]
    if id not in host:
        sys.stderr.write("Found a sequence with id " + id + " but we don't have enough genomes for it\n")
        continue
'''
List complete bacteria or phage IDs. Set the bool to true to get bacteria
'''

import sys
from phage import Phage

phage = Phage()

# The first argument is a boolean flag. It arrives as a string, and every
# non-empty string (including "false" and "0") is truthy, so it has to be
# interpreted explicitly. Anything that is not an obvious "false" value
# still selects bacteria, as it did before.
try:
    flag = sys.argv[1]
except IndexError:
    flag = ''
bacteria = flag.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n')

if bacteria:
    d = phage.completeBacteriaIDs()
else:
    d = phage.phageIDs()

for p in d:
    print(p)
'''Figure out which phages we do not have blastn hits for'''

import os
import sys
from phage import Phage
import re

# get a list of all phages
phage = Phage()
phages = phage.phageTaxonomyString()

try:
    blastf = sys.argv[1]
except IndexError:
    sys.exit(sys.argv[0] + " <blast file>")

# IDs of every phage that shows up in the first column of the blast file
found = set()
with open(blastf, 'r') as fin:
    for l in fin:
        p = l.split("\t")
        m = re.findall(r'(NC_\d+)', p[0])
        if not m:
            # a query without an NC id used to abort the run with IndexError
            sys.stderr.write("WARNING: No phage found in " + p[0].strip() + "\n")
            continue
        found.add(m[0])

# report every phage that never appeared as a query
for p in phages:
    if p not in found:
        # parenthesised single argument prints the same under Python 2 and 3
        print("MISSED " + p)
'''
Count the number of proteins in common between the phages and the
bacteria based on blastx
'''

import sys
from phage import Phage

phage = Phage()

try:
    f = sys.argv[1]
except IndexError:
    sys.exit(sys.argv[0] + " <blast output file converted to NC/NC format. Probably phage.genomes.blastx>")

bctG = phage.completeBacteriaIDs()
phgG = phage.phageIDs()

# count[phage id][bacteria id] starts at zero for every known pair
count = {p: {b: 0 for b in bctG} for p in phgG}

# the handle is called fin rather than bin so the builtin is not shadowed
with open(f, 'r') as fin:
    for l in fin:
        p = l.strip().split("\t")
        if len(p) < 2:
            # blank or truncated line: there is no pair to count
            continue
        # only pairs that were initialised above are counted
        if p[0] in count and p[1] in count[p[0]]:
            count[p[0]][p[1]] += 1
'''
Implement the Frac_Q and Frac_D methods from
http://cge.cbs.dtu.dk/services/HostPhinder/.

We start with a directory of text files, where each genome has a list of
kmers that are found in that genome. Then we need to know the number
shared. That phage then predicts the host.
'''

import sys
import os
from phage import Phage

phage = Phage()
phages = phage.phageTaxonomy()

try:
    kmerD = sys.argv[1]
    outf = sys.argv[2]
except IndexError:
    sys.exit(sys.argv[0] + " <15-mer host directory> <output file>")

# read in all the 15mers
kmer = {}
allk = {}
for p in phages:
    kmer[p] = {}
    kmerfile = os.path.join(kmerD, p + ".tsv")
    if not os.path.exists(kmerfile):
        sys.stderr.write("No kmer file for " + p + "\n")
        # without this the open() below raised on the file just reported
        # as missing; the phage keeps its empty kmer table instead
        continue
    with open(kmerfile, 'r') as fin:
        for l in fin:
            part = l.split("\t")
            # the first tab-separated column is stored as the kmer
            kmer[p][part[0]] = 1
class reaB():
    """Two-reactor (A and B) host/phage model integrated with ``ddeint``.

    Constructing an instance sets up the host, the two phages and the
    reactor parameters, integrates the delay differential system defined
    by :meth:`model` over ``self.xs`` and immediately shows a 3x3 grid of
    matplotlib plots of the eight state variables.
    """

    def __init__(self):
        # simulation time and resolution of samples
        self.dt = 0.1
        # np.linspace requires an integer sample count; 120 / dt is a float
        self.xs = np.linspace(0, 120, int(round(120 / self.dt)))
        self.fluxA = 0.0
        self.fluxB = 0.0
        self.new_host = Host(
            c0=10**5,
            g_max=0.036,  # lit: 0.012
            yield_coeff=0.000000000001,
            half_sat=0.00000125,
            death_rate=0.001,
            t_dep=self.temperature_dependency_new_host,
        )
        self.original_phage = Phage(
            c0=10**9,
            adsorption_rate=0.000000000001,
            burst_size=100,
            death_rate=0.00272,
        )
        self.new_phage = Phage(
            c0=10**6,
            adsorption_rate=0.0000000001,
            burst_size=100,
            death_rate=0.00272,
        )
        self.s0 = 0.0000025  # stock concentration of nutrient (g/mL) #0.0000025
        # float literal so the fraction is not truncated to 0 by Python 2
        # integer division
        self.R_pnn = 1.0 / 1000  # fraction of new phage in library
        self.myinterpolator = scipy.interpolate.interp1d(
            np.array([0 - 1, 0]),  # X
            np.array([0, 1]).T,  # Y
            kind="linear",
            bounds_error=False,
            fill_value=0)
        self.d = 13.3  # lysis time delay
        self.dd = 13
        # dt *=10
        # two queues pre-filled with 101 zeros each
        self.q_h_inf_poo = deque([0] * 101)
        self.q_h_inf_pnn = deque([0] * 101)

        data = ddeint(self.model, self.initial_conditions, self.xs,
                      fargs=(self.d, ))
        self._plot_results(data)

    def _plot_results(self, data):
        """Draw the 3x3 overview figure for the solution ``data`` and show it.

        :param data: solution indexable as ``data[sample][variable]``
        """
        bacteria = 'concentration [#bacteria/mL]'
        nutrient = 'concentration [g/mL]'
        phages = 'concentration [#phage/mL]'
        # (subplot position, [(state column, legend label or None)],
        #  y label, title) in the drawing order of the original code
        panels = [
            (1, [(0, "reactor A")], bacteria, 'Host Concentration A over Time'),
            (2, [(1, "reactor A")], nutrient, 'Nutrient A over Time'),
            (3, [(2, None)], bacteria, 'Host Concentration B over Time'),
            (4, [(3, None)], nutrient, 'Nutrient B over Time'),
            (5, [(4, None)], bacteria, 'Host infected OO over Time'),
            (6, [(5, None)], bacteria, 'Host infected NN over Time'),
            (8, [(6, None)], phages, 'Phage OO over Time'),
            (9, [(7, None)], phages, 'Phage NN over Time'),
            (7, [(6, "original"), (7, "new")], phages, 'Phage over Time'),
        ]

        plt.figure(figsize=(16, 16))
        for position, curves, ylabel, title in panels:
            plt.subplot(3, 3, position)
            labelled = False
            for column, label in curves:
                series = [data[t][column] for t in range(len(self.xs))]
                if label is None:
                    plt.plot(self.xs, series)
                else:
                    plt.plot(self.xs, series, label=label)
                    labelled = True
            plt.xlabel('time [min]')
            plt.ylabel(ylabel)
            plt.title(title)
            # only panels that carry labelled curves get a legend
            if labelled:
                plt.legend()
        plt.subplots_adjust(wspace=0.4, hspace=0.4)
        plt.show()

    # Reactor A

    # lb influx profile of reactor A
    def in_a_lb(self, t):
        """
        :param t: time t
        :return: lb influx to reactor a at time t
        """
        return self.fluxA

    # biomass outflux profile of reactor A
    def out_a(self, t):
        """
        :param t: time t
        :return: biomass outflux of reactor a at time t
        """
        return self.fluxA

    # temperature profile of reactor A
    def temperature_a(self, t):
        """
        :param t: time t
        :return: temperature at time t
        """
        return 39.7

    # Reactor B

    # lb influx profile of reactor B
    def in_b_lb(self, t):
        """
        :param t: time t
        :return: lb influx to reactor b at time t
        """
        return self.fluxB / 3

    # new host influx profile of reactor B
    def in_nh(self, t):
        """
        :param t: time t
        :return: new_host influx from reactor a to reactor b at time t
        """
        return self.fluxB / 3

    # phage library influx in reactor B
    def in_lib(self, t):
        """
        :param t: time t
        :return: library influx in reactor b at time t
        """
        return self.fluxB / 3

    # biomass outflux profile of reactor B
    def out_b(self, t):
        """
        :param t: time t
        :return: biomass outflux of reactor b at time t
        """
        return self.fluxB

    # temperature profile of reactor B
    def temperature_b(self, t):
        """
        :param t: time t
        :return: temperature at time t
        """
        return 39.7

    def temperature_dependency_new_host(self, x):
        """
        :param x: temperature
        :return: growth rate factor
        """
        mu = 39.7
        sigma_l = 120.0
        sigma_r = 10.0
        if x < mu:
            return np.exp(
                -0.5 * (np.square(x - mu) / sigma_l))  # gaussian l: ~(39.7, 120)
        else:
            return np.exp(
                -0.5 * (np.square(x - mu) / sigma_r))  # gaussian r: ~(39.7, 10)

    def update(self, myinterpolator, t, Y):
        """
        Add one new (ti,yi) to the interpolator

        :param myinterpolator: interpolator whose ``x`` / ``y`` samples are extended
        :param t: new sample position
        :param Y: new sample value; also used as the fill value outside the samples
        :return: a new linear ``interp1d`` built from the extended samples
        """
        Y2 = Y if (Y.size == 1) else np.array([Y]).T
        return scipy.interpolate.interp1d(
            np.hstack([myinterpolator.x, [t]]),  # X
            np.hstack([myinterpolator.y, Y2]),  # Y
            kind="linear",
            bounds_error=False,
            fill_value=Y)

    # define system of differential equations
    def model(self, Y, t, d):
        """
        Right-hand side of the system handed to ``ddeint``.

        :param Y: callable returning the eight state values at a given time
        :param t: current time
        :param d: delay, used to look up the state at ``t - d``
        :return: array with the eight derivatives, in the order host A,
            nutrient A, host B, nutrient B, infected OO, infected NN,
            phage OO, phage NN
        """
        c_host_a, c_nutr_a, c_host_b, c_nutr_b, c_inf_poo, c_inf_pnn, c_poo, c_pnn = Y(
            t)
        # the delayed state is looked up but none of its values is used below
        c_host_a_d, c_nutr_a_d, c_host_b_d, c_nutr_b_d, c_inf_poo_d, c_inf_pnn_d, c_poo_d, c_pnn_d = Y(
            t - d)

        # value of the running interpolator at t, taken before the new
        # sample for this step is appended
        y = self.myinterpolator(t)
        self.myinterpolator = self.update(
            self.myinterpolator, t,
            self.original_phage.infection_rate(c_host_b, c_poo) - y -
            self.out_b(t) * c_inf_poo)

        # NOTE(review): the same y enters both the OO and the NN equations
        # although the interpolator is only fed from the original phage
        # terms - confirm that this is intended.
        return np.array([
            0 if c_host_a < 0 else self.new_host.per_cell_growth_rate(
                c_nutr_a, self.temperature_a(t)) * c_host_a -
            self.out_a(t) * c_host_a - self.new_host.death_rate * c_host_a,
            self.s0 * self.in_a_lb(t)
            if c_nutr_a < 0 else -self.new_host.yield_coeff *
            self.new_host.per_cell_growth_rate(
                c_nutr_a, self.temperature_a(t)) * c_host_a +
            self.s0 * self.in_a_lb(t) - c_nutr_a * self.out_a(t),
            0 if c_host_b < 0 else self.new_host.per_cell_growth_rate(
                c_nutr_b, self.temperature_b(t)) * c_host_b +
            self.in_nh(t) * c_host_a -
            self.new_phage.infection_rate(c_host_b, c_pnn) -
            self.original_phage.infection_rate(c_host_b, c_poo) -
            self.new_host.death_rate * c_host_b - self.out_b(t) * c_host_b,
            self.s0 * self.in_b_lb(t) + c_nutr_a * self.in_nh(t)
            if c_nutr_b < 0 else -self.new_host.yield_coeff *
            self.new_host.per_cell_growth_rate(
                c_nutr_b, self.temperature_b(t)) *
            (c_host_b + c_inf_poo + c_inf_pnn) + self.s0 * self.in_b_lb(t) +
            c_nutr_a * self.in_nh(t) - c_nutr_b * self.out_b(t),
            self.original_phage.infection_rate(c_host_b, c_poo)
            if c_inf_poo < 0 else
            self.original_phage.infection_rate(c_host_b, c_poo) - y -
            self.out_b(t) * c_inf_poo,
            self.new_phage.infection_rate(c_host_b, c_pnn)
            if c_inf_pnn < 0 else
            self.new_phage.infection_rate(c_host_b, c_pnn) - y -
            self.out_b(t) * c_inf_pnn,
            0 if c_poo < 0 else self.original_phage.burst_size * y -
            self.original_phage.infection_rate(c_host_b, c_poo) -
            self.out_b(t) * c_poo - self.original_phage.death_rate * c_poo +
            self.in_lib(t) * (1 - self.R_pnn),
            0 if c_pnn < 0 else self.new_phage.burst_size * y -
            self.new_phage.infection_rate(c_host_b, c_pnn) -
            self.out_b(t) * c_pnn - self.new_phage.death_rate * c_pnn +
            self.in_lib(t) * self.R_pnn
        ])

    def initial_conditions(self, t):
        """
        :param t: time t (not used)
        :return: array with the eight start values, in the state order of ``model``
        """
        # np.array, like the rest of the class, instead of a bare array()
        return np.array([
            self.new_host.c0, self.s0, 10**9, self.s0 * 5 * 10**3, 0.0, 0.0,
            self.original_phage.c0, self.new_phage.c0
        ])
def test_phage_finder(phage_finder, input, refseq, name):
    """Check that a Phage built from ``input`` exposes the expected values.

    The ``refseq`` and ``name`` attributes of the constructed object must
    equal the ``refseq`` and ``name`` arguments.
    """
    built = Phage(input, phage_finder)

    assert refseq == built.refseq
    assert name == built.name
def __init__(self):
    """Configure the model, integrate it with ``ddeint`` and plot the result.

    Sets up the host, the two phages and the reactor parameters, solves
    ``self.model`` over ``self.xs`` and shows a 3x3 grid of matplotlib
    plots of the eight state variables.
    """
    # simulation time and resolution of samples
    self.dt = 0.1
    # np.linspace requires an integer sample count; 120 / dt is a float
    self.xs = np.linspace(0, 120, int(round(120 / self.dt)))
    self.fluxA = 0.0
    self.fluxB = 0.0
    self.new_host = Host(
        c0=10**5,
        g_max=0.036,  # lit: 0.012
        yield_coeff=0.000000000001,
        half_sat=0.00000125,
        death_rate=0.001,
        t_dep=self.temperature_dependency_new_host,
    )
    self.original_phage = Phage(
        c0=10**9,
        adsorption_rate=0.000000000001,
        burst_size=100,
        death_rate=0.00272,
    )
    self.new_phage = Phage(
        c0=10**6,
        adsorption_rate=0.0000000001,
        burst_size=100,
        death_rate=0.00272,
    )
    self.s0 = 0.0000025  # stock concentration of nutrient (g/mL) #0.0000025
    # float literal so the fraction is not truncated to 0 by Python 2
    # integer division
    self.R_pnn = 1.0 / 1000  # fraction of new phage in library
    self.myinterpolator = scipy.interpolate.interp1d(
        np.array([0 - 1, 0]),  # X
        np.array([0, 1]).T,  # Y
        kind="linear",
        bounds_error=False,
        fill_value=0)
    self.d = 13.3  # lysis time delay
    self.dd = 13
    # dt *=10
    # two queues pre-filled with 101 zeros each
    self.q_h_inf_poo = deque([0] * 101)
    self.q_h_inf_pnn = deque([0] * 101)

    data = ddeint(self.model, self.initial_conditions, self.xs,
                  fargs=(self.d, ))

    bacteria = 'concentration [#bacteria/mL]'
    nutrient = 'concentration [g/mL]'
    phages = 'concentration [#phage/mL]'
    # (subplot position, [(state column, legend label or None)],
    #  y label, title) in the drawing order of the original code
    panels = [
        (1, [(0, "reactor A")], bacteria, 'Host Concentration A over Time'),
        (2, [(1, "reactor A")], nutrient, 'Nutrient A over Time'),
        (3, [(2, None)], bacteria, 'Host Concentration B over Time'),
        (4, [(3, None)], nutrient, 'Nutrient B over Time'),
        (5, [(4, None)], bacteria, 'Host infected OO over Time'),
        (6, [(5, None)], bacteria, 'Host infected NN over Time'),
        (8, [(6, None)], phages, 'Phage OO over Time'),
        (9, [(7, None)], phages, 'Phage NN over Time'),
        (7, [(6, "original"), (7, "new")], phages, 'Phage over Time'),
    ]

    plt.figure(figsize=(16, 16))
    for position, curves, ylabel, title in panels:
        plt.subplot(3, 3, position)
        labelled = False
        for column, label in curves:
            series = [data[t][column] for t in range(len(self.xs))]
            if label is None:
                plt.plot(self.xs, series)
            else:
                plt.plot(self.xs, series, label=label)
                labelled = True
        plt.xlabel('time [min]')
        plt.ylabel(ylabel)
        plt.title(title)
        # only panels that carry labelled curves get a legend
        if labelled:
            plt.legend()
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    plt.show()
sys.path.append('/home3/redwards/bioinformatics/phage_host') sys.path.append('/home3/redwards/bioinformatics/Modules') from phage import Phage import re import os import taxon ''' Code to add all the phage hosts taxonomic heirarchy to the phage host files''' wanted = ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] manual = {'Acinetobacter genomosp.' : '471', 'Actinobacillus actinomycetemcomitans' : '714', 'alpha proteobacterium' : '34025', 'Bacillus clarkii' : '79879', 'Brevibacterium flavum' : '92706', 'Celeribacter sp.' : '875171', 'Escherichia sp.' : '237777', 'Geobacillus sp.' : '340407', 'Gordonia rubropertincta' : '36822', 'Iodobacter sp.' : '641420', 'Listeria sp.' : '592375', 'Marinomonas sp.' : '127794', 'Methanobacterium thermoautotrophicum' : '145262', 'methicillin-resistant Staphylococcus' : '1280', 'Nitrincola sp.' : '459834', 'Persicivirga sp.' : '859306', 'Salisaeta sp.' : '1392396', 'Sulfitobacter sp.' : '191468'} phage = Phage() host = phage.phageHost() taxa = taxon.readNodes() names,blastname,genbankname,synonym = taxon.extendedNames() divs = taxon.readDivisions() name2id = {names[x].name:x for x in names} name2id.update({blastname[x].name:x for x in blastname}) name2id.update({genbankname[x].name:x for x in genbankname}) name2id.update({synonym[x].name:x for x in synonym}) for id in host: if host[id] in manual: i = manual[host[id]]
#!/usr/bin/python '''Read the kmer counts from a series of files and plot a PCA''' ## start with reading the files and adding the kmer counts to the organism names import sys sys.path.append('/home3/redwards/bioinformatics/phage_host') from phage import Phage import re import os phage = Phage() ## we only choose those hosts with 5 phages that infect them. #host = phage.phageHost() bacteria = phage.completeBacteria() try: dir = sys.argv[1] outf = sys.argv[2] merWanted = sys.argv[3] except: sys.exit(sys.argv[0] + " <directory of kmer counts> <file to write output table to> <kmer size>") count={} allkmers={} organismId={} for file in os.listdir(dir): # our file names look kmers/165.kmers/NC_008025.1.3kmer.tsv match = re.match('(.*)\.\d+\.(\d+)kmer.tsv', file) if match ==None:
'Celeribacter sp.': '875171', 'Escherichia sp.': '237777', 'Geobacillus sp.': '340407', 'Gordonia rubropertincta': '36822', 'Iodobacter sp.': '641420', 'Listeria sp.': '592375', 'Marinomonas sp.': '127794', 'Methanobacterium thermoautotrophicum': '145262', 'methicillin-resistant Staphylococcus': '1280', 'Nitrincola sp.': '459834', 'Persicivirga sp.': '859306', 'Salisaeta sp.': '1392396', 'Sulfitobacter sp.': '191468' } phage = Phage() host = phage.phageHost() taxa = taxon.readNodes() names, blastname, genbankname, synonym = taxon.extendedNames() divs = taxon.readDivisions() name2id = {names[x].name: x for x in names} name2id.update({blastname[x].name: x for x in blastname}) name2id.update({genbankname[x].name: x for x in genbankname}) name2id.update({synonym[x].name: x for x in synonym}) for id in host: if host[id] in manual: i = manual[host[id]] else:
else: return np.exp(-0.5 * (np.square(x - mu) / sigma_r)) # gaussian r: ~(39.7, 10) new_host = Host( c0=10**5, g_max=0.036, #lit: 0.012 yield_coeff=0.000000000001, half_sat=0.00000125, death_rate=0.001, t_dep=temperature_dependency_new_host, ) original_phage = Phage( c0=10**9, adsorption_rate=0.000000000001, burst_size=100, death_rate=0.00272, ) new_phage = Phage( c0=10**6, adsorption_rate=0.0000000001, burst_size=100, death_rate=0.00272, ) s0 = 0.0000025 #stock concentration of nutrient (g/mL) #0.0000025 R_pnn = 1 / 1000 # fraction of new phage in library def c_nutr_b(tt, c_host_b, c_inf_poo, c_inf_pnn, c_nutr_a): return -new_host.yield_coeff * new_host.per_cell_growth_rate(
def get_phagename_and_refseq(row, phage_finder):
    """Return the ``(name, refseq)`` pair of the Phage built from ``row``.

    A ``Phage`` is constructed from ``row`` and ``phage_finder`` and its
    ``name`` and ``refseq`` attributes are read, in that order.
    """
    record = Phage(row, phage_finder)
    name = record.name
    refseq = record.refseq
    return name, refseq
'''
Calculate the distance between two codon usages. We have two files, the
first with just the phages and the second with their hosts. Then we need
to calculate which of the hosts is closest
'''

import os
import sys
sys.path.append('/home3/redwards/bioinformatics/Modules')
import numpy as np
import scipy
from phage import Phage

phage = Phage()
bctG = set(phage.completeBacteriaIDs())
phgG = set(phage.phageIDs())

remove_ambiguous = True  # do we want ambiguous bases or not

# every one of the 64 unambiguous codons: all three-letter words over ACGT
codons = {
    first + second + third
    for first in 'ACGT'
    for second in 'ACGT'
    for third in 'ACGT'
}
'''
Generate a list of all pairwise comparisons of the exact matches
'''

import sys
import re
from phage import Phage

phage = Phage()

try:
    f = sys.argv[1]
except IndexError:
    sys.exit("Exact match file, probably phage.kmers.bacteria.rc.txt")

bg = phage.completeBacteriaIDs()
pg = phage.phageIDs()

# matches[phage id][bacteria id] starts at zero for every pair
matches = {p: {b: 0 for b in bg} for p in pg}

with open(f, 'r') as fin:
    for l in fin:
        p = l.strip().split("\t")
        m = re.findall(r'NC_\d+', l)
        if len(m) != 2:
            # lines without exactly two NC ids are skipped silently
            #sys.stderr.write("Error parsing two NC ids from " + l)
            continue
'''
For blastn searches we are going to calculate the percent coverage of
the phage genome and score the longest coverage as the best hit.

It doesn't matter where the hits are on the bacterial genome.

We are going to use a cutoff of 0.001 E value
'''

import sys
import os
import re
from phage import Phage

phage = Phage()

try:
    blastf = sys.argv[1]
except IndexError:
    # the separating space belongs before the placeholder, not inside it
    sys.exit(sys.argv[0] + " <blast file>")

# read the fasta file of phages to get the lengths
lens = phage.phageSequenceLengths()
sys.stderr.write("Found " + str(len(lens)) + " sequences\n")

# one (initially empty) hit table per phage
hits = {i: {} for i in lens}

with open(blastf, 'r') as fin:
    for l in fin:
        p = l.strip().split("\t")
        # column 11 of the tab-separated line is read as the E value
        e = float(p[10])
        if e > 0.001:
            continue
        m = re.findall(r'(NC_\d+)', p[0])
        if not m:
            sys.stderr.write("WARNING: No phage found in " + p[0] + "\n")