def __init__(self, S, reaction_ids, compound_ids, fluxes=None, name=None):
    """Initialize the stoichiometric model.

    Args:
        S: the stoichiometric matrix. As enforced by the shape check
            below, compounds are on the rows and reactions are on the
            columns.
        reaction_ids: the ids/names of the reactions (columns of S).
        compound_ids: the ids/names of the compounds (rows of S).
        fluxes: the list of relative fluxes through all reactions.
            If not supplied, a (1, Nr) array of ones is used.
        name: a string name for this model.

    Raises:
        ValueError: if the shape of S does not agree with the number of
            reaction ids or compound ids.
    """
    self.kegg = Kegg.getInstance()
    self.S = S
    self.reaction_ids = reaction_ids
    self.compound_ids = compound_ids
    self.Nr = len(self.reaction_ids)
    self.Nc = len(self.compound_ids)
    self.name = name
    # NOTE(review): name defaults to None; whether util.slugify accepts
    # None is not visible from here -- confirm.
    self.slug_name = util.slugify(self.name)

    # Decide on the default explicitly instead of wrapping None in an
    # array first and overwriting it afterwards.
    if fluxes is None:
        self.fluxes = np.ones((1, self.Nr))
    else:
        self.fluxes = np.array(fluxes)

    expected_Nc, expected_Nr = self.S.shape
    if self.Nr != expected_Nr:
        raise ValueError('Number of columns does not match number of reactions')
    if self.Nc != expected_Nc:
        raise ValueError('Number of rows does not match number of compounds')
def __init__(self, label):
    """Create an empty record for the given label.

    All per-condition mappings start out empty; the output figure file
    names are derived from the slugified label.

    Args:
        label: the label of this record.
    """
    self.label = label

    # Per-condition data as read, before any processing.
    self.conditions_to_raw_culture_levels = {}
    self.conditions_to_raw_reporter_levels = {}
    self.conditions_to_raw_times = {}

    # Per-condition processed data.
    self.conditions_to_culture_levels = {}
    self.conditions_to_reporter_levels = {}
    self.conditions_to_times = {}

    # Per-condition activities.
    self.conditions_to_activities = {}
    self.conditions_to_smooth_activities = {}

    # File names for the figures, all sharing the slugified label.
    slug = util.slugify(self.label)
    self.slug_label = slug
    self.raw_levels_fname = '%s_raw_levels.png' % slug
    self.levels_fname = '%s_levels.png' % slug
    self.vs_bg_fname = '%s_vs_bg.png' % slug
    self.activity_fname = '%s_activity.png' % slug
def __init__(
    self,
    model,
    thermodynamic_data,
    metabolite_concentration_bounds,
    optimization_status=OptimizationStatus.Successful(),
    optimal_value=None,
    optimal_ln_metabolite_concentrations=None,
):
    """Store an optimization result together with derived quantities.

    Args:
        model: the model that was optimized. Its stoichiometric matrix,
            compound ids, reaction ids, fluxes and name are read here.
        thermodynamic_data: source of the dGr0' values for the model.
        metabolite_concentration_bounds: concentration bounds; used to
            look up the concentrations for the dGr_bio calculation.
        optimization_status: status object of the optimization.
        optimal_value: the optimal value found, if any.
        optimal_ln_metabolite_concentrations: log concentrations at the
            optimum, if any. When None, the concentration-dependent
            attributes are left as None.
    """
    # Plain inputs.
    self.model = model
    self.thermo = thermodynamic_data
    self.bounds = metabolite_concentration_bounds

    stoich = model.GetStoichiometricMatrix()
    self.S = stoich
    # Rows of S are compounds, columns are reactions.
    self.Ncompounds, self.Nreactions = stoich.shape

    self.status = optimization_status
    self.opt_val = optimal_value
    self.ln_concentrations = optimal_ln_metabolite_concentrations

    dgr0 = np.array(thermodynamic_data.GetDGrTagZero_ForModel(self.model))
    self.dGr0_tag = dgr0
    self.dGr0_tag_list = list(dgr0.flatten())

    self.compound_ids = self.model.GetCompoundIDs()
    self.reaction_ids = self.model.GetReactionIDs()
    self.fluxes = self.model.GetFluxes()

    # Output file names all share the slugified model name.
    slug = util.slugify(model.name)
    self.slug_name = slug
    self.pathway_graph_filename = "%s_graph.svg" % slug
    self.thermo_profile_filename = "%s_thermo_profile.png" % slug
    self.conc_profile_filename = "%s_conc_profile.png" % slug

    self.kegg = Kegg.getInstance()

    # Concentration-dependent values; filled in below when possible.
    self.concentrations = None
    self.dGr_tag = None
    self.dGr_tag_list = None
    self.dGr_bio = None
    self.dGr_bio_list = None

    if self.ln_concentrations is None or self.dGr0_tag is None:
        return

    ln_conc = self.ln_concentrations
    self.concentrations = np.exp(ln_conc)

    # dGr' at the optimal concentrations.
    # NOTE(review): the `*` with S presumably relies on matrix-typed
    # operands for a matrix product -- confirm against callers.
    at_optimum = RT * ln_conc * self.S
    self.dGr_tag = np.array(self.dGr0_tag + at_optimum)
    self.dGr_tag_list = list(self.dGr_tag.flatten())

    # dGr' at the bounds' concentrations, defaulting to 1e-3.
    reference_concs = self.bounds.GetBoundsWithDefault(
        self.compound_ids, default=1e-3
    )
    at_reference = RT * np.dot(np.log(reference_concs), self.S)
    self.dGr_bio = np.array(self.dGr0_tag + at_reference)
    self.dGr_bio_list = list(self.dGr_bio.flatten())
def __init__(self, model, thermodynamic_data,
             metabolite_concentration_bounds,
             optimization_status=OptimizationStatus.Successful(),
             optimal_value=None,
             optimal_ln_metabolite_concentrations=None):
    """Store an optimization result together with derived quantities.

    Args:
        model: the model that was optimized. Its stoichiometric matrix,
            compound ids, reaction ids, fluxes and name are read here.
        thermodynamic_data: source of the dGr0' values for the model.
        metabolite_concentration_bounds: concentration bounds; used to
            look up the concentrations for the dGr_bio calculation.
        optimization_status: status object of the optimization.
        optimal_value: the optimal value found, if any.
        optimal_ln_metabolite_concentrations: log concentrations at the
            optimum, if any. When None, the concentration-dependent
            attributes are left as None.
    """
    # Plain inputs.
    self.model = model
    self.thermo = thermodynamic_data
    self.bounds = metabolite_concentration_bounds

    stoich = model.GetStoichiometricMatrix()
    self.S = stoich
    # Rows of S are compounds, columns are reactions.
    self.Ncompounds, self.Nreactions = stoich.shape

    self.status = optimization_status
    self.opt_val = optimal_value
    self.ln_concentrations = optimal_ln_metabolite_concentrations

    dgr0 = np.array(thermodynamic_data.GetDGrTagZero_ForModel(self.model))
    self.dGr0_tag = dgr0
    self.dGr0_tag_list = list(dgr0.flatten())

    self.compound_ids = self.model.GetCompoundIDs()
    self.reaction_ids = self.model.GetReactionIDs()
    self.fluxes = self.model.GetFluxes()

    # Output file names all share the slugified model name.
    slug = util.slugify(model.name)
    self.slug_name = slug
    self.pathway_graph_filename = '%s_graph.svg' % slug
    self.thermo_profile_filename = '%s_thermo_profile.png' % slug
    self.conc_profile_filename = '%s_conc_profile.png' % slug

    self.kegg = Kegg.getInstance()

    # Concentration-dependent values; filled in below when possible.
    self.concentrations = None
    self.dGr_tag = None
    self.dGr_tag_list = None
    self.dGr_bio = None
    self.dGr_bio_list = None

    if self.ln_concentrations is None or self.dGr0_tag is None:
        return

    ln_conc = self.ln_concentrations
    self.concentrations = np.exp(ln_conc)

    # dGr' at the optimal concentrations.
    # NOTE(review): the `*` with S presumably relies on matrix-typed
    # operands for a matrix product -- confirm against callers.
    at_optimum = RT * ln_conc * self.S
    self.dGr_tag = np.array(self.dGr0_tag + at_optimum)
    self.dGr_tag_list = list(self.dGr_tag.flatten())

    # dGr' at the bounds' concentrations, defaulting to 1e-3.
    reference_concs = self.bounds.GetBoundsWithDefault(self.compound_ids,
                                                       default=1e-3)
    at_reference = RT * np.dot(np.log(reference_concs), self.S)
    self.dGr_bio = np.array(self.dGr0_tag + at_reference)
    self.dGr_bio_list = list(self.dGr_bio.flatten())
def MakePerPlateFigures(self, dirname):
    """Save one activity heat map per plate and return the file names.

    For every plate, the smoothed activities are scaled by each row's
    maximal activity and resampled onto a grid of culture-level
    fractions (0.0 to 1.0 in steps of 0.001) before being drawn.

    Args:
        dirname: directory the PNG files are written to.

    Returns:
        A list with the file name (not the full path) of every figure
        saved, one entry per plate.
    """
    saved = []
    for condition, plates in self.plates.iteritems():
        for plate in plates:
            row_order = np.argsort(plate.filtered_labels)

            # Scale every row of activities by 1 / (its max activity).
            smooth = plate.smooth_filtered_activities
            peak = plate.filtered_max_activities
            scaled_activity = np.dot(np.diag(1/peak), smooth)

            # Overwrite everything after the point closest to 1.0 with
            # 100 so it is never the nearest match below.
            # NOTE: this writes into the plate's own array.
            scaled_culture = plate.scaled_culture_levels
            closest_to_one = np.argmin(np.abs(scaled_culture - 1.0), 1)
            for row, col in enumerate(closest_to_one):
                scaled_culture[row, col+1:] = 100

            n_rows, _ = scaled_activity.shape
            od_fracs = np.arange(0.0, 1.0, 0.001)
            activity_per_od_frac = np.zeros((n_rows, len(od_fracs)))
            for col, frac in enumerate(od_fracs):
                # Per row: index whose culture level is nearest to frac.
                nearest = np.argmin(np.abs(scaled_culture - frac), 1)
                for row in row_order:
                    activity_per_od_frac[row, col] = (
                        scaled_activity[row, nearest[row]])

            pylab.figure()
            pylab.title(condition)
            pylab.imshow(activity_per_od_frac, aspect='auto')

            # NOTE(review): the name depends on the condition only, so
            # several plates of one condition share a file name.
            fname = '%s.png' % util.slugify(condition)
            pylab.savefig(path.join(dirname, fname), format='png')
            saved.append(fname)

    return saved
def MakePerPlateFigures(self, dirname):
    """Save one activity heat map per plate and return the file names.

    For every plate, the smoothed activities are scaled by each row's
    maximal activity and resampled onto a grid of culture-level
    fractions (0.0 to 1.0 in steps of 0.001) before being drawn.

    Args:
        dirname: directory the PNG files are written to.

    Returns:
        A list with the file name (not the full path) of every figure
        saved, one entry per plate.
    """
    saved = []
    for condition, plates in self.plates.iteritems():
        for plate in plates:
            row_order = np.argsort(plate.filtered_labels)

            # Scale every row of activities by 1 / (its max activity).
            smooth = plate.smooth_filtered_activities
            peak = plate.filtered_max_activities
            scaled_activity = np.dot(np.diag(1 / peak), smooth)

            # Overwrite everything after the point closest to 1.0 with
            # 100 so it is never the nearest match below.
            # NOTE: this writes into the plate's own array.
            scaled_culture = plate.scaled_culture_levels
            closest_to_one = np.argmin(np.abs(scaled_culture - 1.0), 1)
            for row, col in enumerate(closest_to_one):
                scaled_culture[row, col + 1:] = 100

            n_rows, _ = scaled_activity.shape
            od_fracs = np.arange(0.0, 1.0, 0.001)
            activity_per_od_frac = np.zeros((n_rows, len(od_fracs)))
            for col, frac in enumerate(od_fracs):
                # Per row: index whose culture level is nearest to frac.
                nearest = np.argmin(np.abs(scaled_culture - frac), 1)
                for row in row_order:
                    activity_per_od_frac[row, col] = (
                        scaled_activity[row, nearest[row]])

            pylab.figure()
            pylab.title(condition)
            pylab.imshow(activity_per_od_frac, aspect='auto')

            # NOTE(review): the name depends on the condition only, so
            # several plates of one condition share a file name.
            fname = '%s.png' % util.slugify(condition)
            pylab.savefig(path.join(dirname, fname), format='png')
            saved.append(fname)

    return saved
def Main(): options, _ = MakeOpts().parse_args(sys.argv) assert options.tree_filename and options.tree_format assert options.pathways_filename and options.genome_db_filename assert options.output_filename print 'Reading tree from', options.tree_filename tree = dendropy.Tree() tree.read_from_path(options.tree_filename, options.tree_format, extract_comment_metadata=True) leaves = tree.leaf_nodes() print 'Tree has %d leaf nodes' % len(leaves) pathways = pathway.LoadPathways(options.pathways_filename) db = genome_db.GenomeDB(options.genome_db_filename) pathway_names = [util.slugify(p.name) for p in pathways] org_2_pathways = {} for path in pathways: org_counts = Counter() for enz_set in path.enzyme_sets: orgs_w_enz = set() for ec in enz_set: orgs_w_enz.update(list(db.OrganismsForEC(ec))) org_counts += Counter(orgs_w_enz) orgs_w_pathway = [ o for o, c in org_counts.iteritems() if c == len(path.enzyme_sets) ] orgs_w_pathway = filter(None, map(db.KEGG2NCBI, orgs_w_pathway)) for org in orgs_w_pathway: org_2_pathways.setdefault(org, set()).add(util.slugify(path.name)) # Find the organisms that have pathway tags. 
all_labels = set([l.taxon.label for l in leaves]) pathway_orgs = set(org_2_pathways.keys()) intersect = all_labels.intersection(pathway_orgs) print len(intersect), 'pathway orgs found' print len(pathway_orgs) - len(intersect), 'pathway orgs not found' # Find organisms that are heterotrophs if options.only_heterotrophs: print 'Pruning non-heterotrophs' q = db.db.Execute( 'SELECT ncbi_taxon_id, energy_category from organisms') ncbi_to_keep = set() for row in q: ncbi_id, energy_cat = row if energy_cat and energy_cat.lower() == 'organic': ncbi_to_keep.add(ncbi_id.strip()) tree.retain_taxa_with_labels(ncbi_to_keep) leaves = tree.leaf_nodes() print 'Tree now contains', len(leaves), 'leaves' lengths = [] for e in tree.leaf_edge_iter(): lengths.append(e.length) lengths = pylab.array(lengths) below_thresh = pylab.find(lengths < options.edge_length_threshold).size pct_below = 100.0 * float(below_thresh) / float(len(lengths)) print 'Median length', pylab.median(lengths) print 'Mean length', pylab.mean(lengths) print pct_below, '% below threshold' print 'Pruning leaves' for l in tree.leaf_nodes(): label = l.taxon.label pathways = org_2_pathways.get(label, set()) l.pathways = pathways for pname in pathways: setattr(l, pname, True) l.annotate(pname) l.count = 1 l.annotate('count') while True: changed = False for e in tree.leaf_edge_iter(): if (e.tail_node is not None and e.length < options.edge_length_threshold): changed |= MaybeMergeChildren(e.tail_node) if not changed: break tree.write_to_path(options.output_filename, 'nexus', suppress_annotations=True) leaves = tree.leaf_nodes() print 'Tree now has %d leaf nodes' % len(leaves) colormap = { 'upper_emp_unique': '#008837', 'upper_ed_unique': '#7b3294', 'pts': '#868800', 'glucokinase': '#002288' } default_color = '#c0c3c7' name_2_count = {} path_vectors = {} for name in pathway_names: fname = util.slugify(name) + '.csv' f = open(fname, 'w') w = csv.writer(f) v = [] for leaf in leaves: taxon_id = leaf.taxon.label pathways = 
leaf.pathways d = leaf.annotations() value = d.get(name, (False, None))[0] color = default_color if value is True: color = colormap[name] name_2_count[name] = name_2_count.get(name, 0) + 1 v.append(1) else: v.append(0) w.writerow([taxon_id, color, value]) path_vectors[name] = pylab.array(v) f.close() pts_vector = path_vectors['pts'] glk_vector = path_vectors['glucokinase'] ed_vector = path_vectors['upper_ed_unique'] emp_vector = path_vectors['upper_emp_unique'] ed_only = np.logical_and(ed_vector, np.logical_not(emp_vector)) emp_only = np.logical_and(emp_vector, np.logical_not(ed_vector)) print 'ED, EMP' r, p_val = stats.pearsonr(ed_vector, emp_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'ED, PTS' r, p_val = stats.pearsonr(ed_vector, pts_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'EMP, PTS' r, p_val = stats.pearsonr(emp_vector, pts_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'ED, GLK' r, p_val = stats.pearsonr(ed_vector, glk_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'EMP, GLK' r, p_val = stats.pearsonr(emp_vector, glk_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'ED only, GLK' r, p_val = stats.pearsonr(ed_only, glk_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'ED only, PTS' r, p_val = stats.pearsonr(ed_only, pts_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'EMP only, GLK' r, p_val = stats.pearsonr(emp_only, glk_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'EMP only, PTS' r, p_val = stats.pearsonr(emp_only, pts_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val nleaves = len(leaves) for name, count in name_2_count.iteritems(): pct = 100.0 * float(count) / float(nleaves) print '%.2f%% (%d of %d) have pathway %s' % (pct, count, nleaves, str(name)) v_or_accumulator = pylab.zeros(nleaves) v_and_accumulator = pylab.ones(nleaves) for v in path_vectors.values(): 
v_or_accumulator = np.logical_or(v_or_accumulator, v) v_and_accumulator = np.logical_and(v_and_accumulator, v) total_w_any = pylab.where(v_or_accumulator == True)[0].size total_w_all = pylab.where(v_and_accumulator == True)[0].size any_pct = 100.0 * float(total_w_any) / float(nleaves) all_pct = 100.0 * float(total_w_all) / float(nleaves) print '%.2f%% (%d of %d) have some pathway' % (any_pct, total_w_any, nleaves) print '%.2f%% (%d of %d) have all pathways' % (all_pct, total_w_any, nleaves) fname = 'Node_Counts.csv' f = open(fname, 'w') w = csv.writer(f) for leaf in leaves: taxon = leaf.taxon taxon_id = taxon.label w.writerow([taxon_id, leaf.count]) f.close()
def Main(): options, _ = MakeOpts().parse_args(sys.argv) assert options.tree_filename and options.tree_format assert options.pathways_filename and options.genome_db_filename assert options.output_filename print 'Reading tree from', options.tree_filename tree = dendropy.Tree() tree.read_from_path(options.tree_filename, options.tree_format, extract_comment_metadata=True) leaves = tree.leaf_nodes() print 'Tree has %d leaf nodes' % len(leaves) pathways = pathway.LoadPathways(options.pathways_filename) db = genome_db.GenomeDB(options.genome_db_filename) pathway_names = [util.slugify(p.name) for p in pathways] org_2_pathways = {} for path in pathways: org_counts = Counter() for enz_set in path.enzyme_sets: orgs_w_enz = set() for ec in enz_set: orgs_w_enz.update(list(db.OrganismsForEC(ec))) org_counts += Counter(orgs_w_enz) orgs_w_pathway = [o for o,c in org_counts.iteritems() if c == len(path.enzyme_sets)] orgs_w_pathway = filter(None, map(db.KEGG2NCBI, orgs_w_pathway)) for org in orgs_w_pathway: org_2_pathways.setdefault(org, set()).add(util.slugify(path.name)) print 'Finding oxygen requirements' org_2_oxy_req = {} for row in db.SelectOrganismFields(['ncbi_taxon_id', 'broad_oxygen_requirement']): ncbi_id, oxy_req = row org_2_oxy_req[ncbi_id] = oxy_req observed_oxygen_reqs = set(org_2_oxy_req.values()) print 'Observed oxygen requirements', observed_oxygen_reqs # Find the organisms that have pathway tags. 
all_labels = set([l.taxon.label for l in leaves]) pathway_orgs = set(org_2_pathways.keys()) intersect = all_labels.intersection(pathway_orgs) print len(intersect), 'pathway orgs found' print len(pathway_orgs) - len(intersect), 'pathway orgs not found' # Find organisms that are heterotrophs if options.only_heterotrophs: print 'Pruning non-heterotrophs' q = db.db.Execute('SELECT ncbi_taxon_id, energy_category from organisms') ncbi_to_keep = set() for row in q: ncbi_id, energy_cat = row if energy_cat and energy_cat.lower() == 'organic': ncbi_to_keep.add(ncbi_id.strip()) tree.retain_taxa_with_labels(ncbi_to_keep) leaves = tree.leaf_nodes() print 'Tree now contains', len(leaves), 'leaves' lengths = [] for e in tree.leaf_edge_iter(): lengths.append(e.length) lengths = pylab.array(lengths) below_thresh = pylab.find(lengths < options.edge_length_threshold).size pct_below = 100.0 * float(below_thresh) / float(len(lengths)) print 'Median length', pylab.median(lengths) print 'Mean length', pylab.mean(lengths) print pct_below, '% below threshold' print 'Pruning leaves' for l in tree.leaf_nodes(): label = l.taxon.label pathways = org_2_pathways.get(label, set()) l.pathways = pathways for pname in pathways: setattr(l, pname, True) l.annotate(pname) oxy_req = org_2_oxy_req.get(label, None) l.oxygen_req = {oxy_req: 1} l.annotate('oxygen_req') l.count = 1 l.annotate('count') while True: changed = False for e in tree.leaf_edge_iter(): if (e.tail_node is not None and e.length < options.edge_length_threshold): changed |= MaybeMergeChildren(e.tail_node) if not changed: break tree.write_to_path(options.output_filename, 'nexus', suppress_annotations=True) leaves = tree.leaf_nodes() print 'Tree now has %d leaf nodes' % len(leaves) colormap = {'upper_emp_unique': '#008837', 'upper_ed_unique': '#7b3294', 'nonp_ed_unique': '#868800'} default_color = '#c0c3c7' name_2_count = {} path_vectors = {} for name in pathway_names: fname = util.slugify(name) + '.csv' f = open(fname, 'w') w = 
csv.writer(f) v = [] for leaf in leaves: taxon_id = leaf.taxon.label pathways = leaf.pathways d = leaf.annotations() value = d.get(name, (False, None))[0] color = default_color if value is True: color = colormap.get(name, 'default') name_2_count[name] = name_2_count.get(name, 0) + 1 v.append(1) else: v.append(0) w.writerow([taxon_id, color, value]) path_vectors[name] = pylab.array(v) f.close() ed_vector = path_vectors['upper_ed_unique'] emp_vector = path_vectors['upper_emp_unique'] r, p_val = stats.pearsonr(ed_vector, emp_vector) print 'Pearson correlation coefficient (r)', r print 'R^2', r**2 print 'p-value', p_val mat_shape = (len(observed_oxygen_reqs), 4) count_mat = np.matrix(np.zeros(mat_shape)) prob_mat = np.matrix(np.zeros(mat_shape)) oxy_req_idx_map = dict((i, k) for i, k in enumerate(sorted(observed_oxygen_reqs))) column_labels = {0: 'None', 1: 'EMP Only', 2: 'ED Only', 3: 'Both'} for leaf_idx, leaf in enumerate(leaves): d = leaf.annotations() oxygen_reqs = d.get('oxygen_req', (None, None))[0] total_count = float(sum(oxygen_reqs.values())) ed_presence = ed_vector[leaf_idx] emp_presence = emp_vector[leaf_idx] for i, oxy_req in oxy_req_idx_map.iteritems(): oxy_frac = oxygen_reqs.get(oxy_req, 0) / total_count if ed_presence and emp_presence: count_mat[i, 3] += oxy_frac elif ed_presence: count_mat[i, 2] += oxy_frac elif emp_presence: count_mat[i, 1] += oxy_frac else: count_mat[i, 0] += oxy_frac total_samples = float(np.sum(count_mat)) ed_samples = float(np.sum(ed_vector)) emp_samples = float(np.sum(emp_vector)) ed_prob = ed_samples / total_samples emp_prob = emp_samples / total_samples for i, oxy_req in oxy_req_idx_map.iteritems(): oxy_req_count = float(np.sum(count_mat[i,:])) oxy_req_prob = oxy_req_count / total_samples # Probability of neither pathway prob_mat[i, 0] = (1-ed_prob) * (1-emp_prob) * oxy_req_prob # Probability of EMP only prob_mat[i, 1] = (1-ed_prob) * (emp_prob) * oxy_req_prob # Probability of ED only prob_mat[i, 2] = (ed_prob) * (1-emp_prob) * 
oxy_req_prob # Probability of both prob_mat[i, 3] = (ed_prob) * (emp_prob) * oxy_req_prob p_vals = mystats.CalcPValues(count_mat, prob_mat) print 'Counts' print count_mat print 'Probabilities assuming random sampling' print prob_mat print 'Mappings' print oxy_req_idx_map print column_labels print 'P-values' print p_vals # Plot the p-values pylab.figure() xs = sorted(column_labels.keys()) xticks = [column_labels[i] for i in xs] ys = sorted(oxy_req_idx_map.keys()) yticks = [oxy_req_idx_map[j] for j in ys] sigs = p_vals < 0.05 super_sigs = p_vals < 0.001 rows, cols = sigs.shape for i in xrange(rows): for j in xrange(cols): if super_sigs[i,j]: print oxy_req_idx_map[i], 'x', column_labels[j], print '**', p_vals[i,j] pylab.text(j, i, '**', color='w') elif sigs[i,j]: print oxy_req_idx_map[i], 'x', column_labels[j], print '*', p_vals[i,j] pylab.text(j, i, '*', color='w') pylab.imshow(p_vals, interpolation='nearest') pylab.xticks(xs, xticks) pylab.yticks(ys, yticks) pylab.colorbar() # Plot the bar plot breakdown. 
restricted_counts = np.matrix(np.zeros((3,3))) allowed_genotypes = ['ED Only', 'Both', 'EMP Only'] allowed_phenotypes = ['anaerobe', 'facultative', 'aerobe'] for j, genotype in column_labels.iteritems(): if genotype not in allowed_genotypes: continue new_j = allowed_genotypes.index(genotype) for i, phenotype in oxy_req_idx_map.iteritems(): if phenotype not in allowed_phenotypes: continue new_i = allowed_phenotypes.index(phenotype) restricted_counts[new_i, new_j] = count_mat[i,j] pcts_matrix = restricted_counts / np.sum(restricted_counts, 1) print 'Phenotypes (rows)' print allowed_phenotypes print 'Genotypes (cols)' print allowed_genotypes print 'Counts of interesting categories' print restricted_counts print 'PCTs including interesting categories' print pcts_matrix * 100.0 print 'Effective number of organisms', np.sum(restricted_counts) colors = ['#37DD6F', '#FF5D40', '#4186D3'] pylab.figure() current_bottom = pylab.zeros(3) rows, cols = pcts_matrix.shape for j in xrange(cols): heights = np.array(pcts_matrix[:,j].flat) xs = np.arange(heights.size) pylab.bar(xs, heights, bottom=current_bottom, color=colors[j], edgecolor='w', label=allowed_genotypes[j], align='center') current_bottom += heights xs = pylab.arange(3) + 0.5 pylab.xticks(xs, allowed_phenotypes) pylab.legend() pylab.show() colormap = {'Organic': '#ff0000', 'Inorganic': '#00ff00', 'aerobe': '#2861e4', 'anaerobe': '#e44228', 'facultative': '#e4c328'} fname = 'trophism.csv' f = open(fname, 'w') w = csv.writer(f) for l in leaves: label = l.taxon.label cat = db.NCBI2EnergyCategory(label) color = colormap.get(cat, default_color) w.writerow([label, color, cat]) f.close() fname = 'oxy_req.csv' f = open(fname, 'w') w = csv.writer(f) for l in leaves: label = l.taxon.label cat = db.NCBI2BroadOxygenReq(label) color = colormap.get(cat, default_color) w.writerow([label, color, cat]) f.close() nleaves = len(leaves) for name, count in name_2_count.iteritems(): pct = 100.0 * float(count) / float(nleaves) print '%.2f%% (%d 
of %d) have pathway %s' % (pct, count, nleaves, str(name)) v_or_accumulator = pylab.zeros(nleaves) v_and_accumulator = pylab.ones(nleaves) for v in path_vectors.values(): v_or_accumulator = np.logical_or(v_or_accumulator, v) v_and_accumulator = np.logical_and(v_and_accumulator, v) total_w_any = pylab.where(v_or_accumulator == True)[0].size total_w_all = pylab.where(v_and_accumulator == True)[0].size any_pct = 100.0 * float(total_w_any) / float(nleaves) all_pct = 100.0 * float(total_w_all) / float(nleaves) print '%.2f%% (%d of %d) have some pathway' % (any_pct, total_w_any, nleaves) print '%.2f%% (%d of %d) have all pathways' % (all_pct, total_w_all, nleaves) fname = 'Node_Counts.csv' f = open(fname, 'w') w = csv.writer(f) for leaf in leaves: taxon = leaf.taxon taxon_id = taxon.label w.writerow([taxon_id, leaf.count]) f.close()
def Main(): options, _ = MakeOpts().parse_args(sys.argv) assert options.tree_filename and options.tree_format assert options.pathways_filename and options.genome_db_filename assert options.output_filename print 'Reading tree from', options.tree_filename tree = dendropy.Tree() tree.read_from_path(options.tree_filename, options.tree_format, extract_comment_metadata=True) leaves = tree.leaf_nodes() print 'Tree has %d leaf nodes' % len(leaves) pathways = pathway.LoadPathways(options.pathways_filename) db = genome_db.GenomeDB(options.genome_db_filename) pathway_names = [util.slugify(p.name) for p in pathways] org_2_pathways = {} for path in pathways: org_counts = Counter() for enz_set in path.enzyme_sets: orgs_w_enz = set() for ec in enz_set: orgs_w_enz.update(list(db.OrganismsForEC(ec))) org_counts += Counter(orgs_w_enz) orgs_w_pathway = [o for o,c in org_counts.iteritems() if c == len(path.enzyme_sets)] orgs_w_pathway = filter(None, map(db.KEGG2NCBI, orgs_w_pathway)) for org in orgs_w_pathway: org_2_pathways.setdefault(org, set()).add(util.slugify(path.name)) # Find the organisms that have pathway tags. 
all_labels = set([l.taxon.label for l in leaves]) pathway_orgs = set(org_2_pathways.keys()) intersect = all_labels.intersection(pathway_orgs) print len(intersect), 'pathway orgs found' print len(pathway_orgs) - len(intersect), 'pathway orgs not found' # Find organisms that are heterotrophs if options.only_heterotrophs: print 'Pruning non-heterotrophs' q = db.db.Execute('SELECT ncbi_taxon_id, energy_category from organisms') ncbi_to_keep = set() for row in q: ncbi_id, energy_cat = row if energy_cat and energy_cat.lower() == 'organic': ncbi_to_keep.add(ncbi_id.strip()) tree.retain_taxa_with_labels(ncbi_to_keep) leaves = tree.leaf_nodes() print 'Tree now contains', len(leaves), 'leaves' lengths = [] for e in tree.leaf_edge_iter(): lengths.append(e.length) lengths = pylab.array(lengths) below_thresh = pylab.find(lengths < options.edge_length_threshold).size pct_below = 100.0 * float(below_thresh) / float(len(lengths)) print 'Median length', pylab.median(lengths) print 'Mean length', pylab.mean(lengths) print pct_below, '% below threshold' print 'Pruning leaves' for l in tree.leaf_nodes(): label = l.taxon.label pathways = org_2_pathways.get(label, set()) l.pathways = pathways for pname in pathways: setattr(l, pname, True) l.annotate(pname) l.count = 1 l.annotate('count') while True: changed = False for e in tree.leaf_edge_iter(): if (e.tail_node is not None and e.length < options.edge_length_threshold): changed |= MaybeMergeChildren(e.tail_node) if not changed: break tree.write_to_path(options.output_filename, 'nexus', suppress_annotations=True) leaves = tree.leaf_nodes() print 'Tree now has %d leaf nodes' % len(leaves) colormap = {'upper_emp_unique': '#008837', 'upper_ed_unique': '#7b3294', 'pts': '#868800', 'glucokinase': '#002288'} default_color = '#c0c3c7' name_2_count = {} path_vectors = {} for name in pathway_names: fname = util.slugify(name) + '.csv' f = open(fname, 'w') w = csv.writer(f) v = [] for leaf in leaves: taxon_id = leaf.taxon.label pathways = 
leaf.pathways d = leaf.annotations() value = d.get(name, (False, None))[0] color = default_color if value is True: color = colormap[name] name_2_count[name] = name_2_count.get(name, 0) + 1 v.append(1) else: v.append(0) w.writerow([taxon_id, color, value]) path_vectors[name] = pylab.array(v) f.close() pts_vector = path_vectors['pts'] glk_vector = path_vectors['glucokinase'] ed_vector = path_vectors['upper_ed_unique'] emp_vector = path_vectors['upper_emp_unique'] ed_only = np.logical_and(ed_vector, np.logical_not(emp_vector)) emp_only = np.logical_and(emp_vector, np.logical_not(ed_vector)) print 'ED, EMP' r, p_val = stats.pearsonr(ed_vector, emp_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'ED, PTS' r, p_val = stats.pearsonr(ed_vector, pts_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'EMP, PTS' r, p_val = stats.pearsonr(emp_vector, pts_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'ED, GLK' r, p_val = stats.pearsonr(ed_vector, glk_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'EMP, GLK' r, p_val = stats.pearsonr(emp_vector, glk_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'ED only, GLK' r, p_val = stats.pearsonr(ed_only, glk_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'ED only, PTS' r, p_val = stats.pearsonr(ed_only, pts_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'EMP only, GLK' r, p_val = stats.pearsonr(emp_only, glk_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val print 'EMP only, PTS' r, p_val = stats.pearsonr(emp_only, pts_vector) print 'R', r print 'R^2', r**2 print 'p-value', p_val nleaves = len(leaves) for name, count in name_2_count.iteritems(): pct = 100.0 * float(count) / float(nleaves) print '%.2f%% (%d of %d) have pathway %s' % (pct, count, nleaves, str(name)) v_or_accumulator = pylab.zeros(nleaves) v_and_accumulator = pylab.ones(nleaves) for v in path_vectors.values(): 
v_or_accumulator = np.logical_or(v_or_accumulator, v) v_and_accumulator = np.logical_and(v_and_accumulator, v) total_w_any = pylab.where(v_or_accumulator == True)[0].size total_w_all = pylab.where(v_and_accumulator == True)[0].size any_pct = 100.0 * float(total_w_any) / float(nleaves) all_pct = 100.0 * float(total_w_all) / float(nleaves) print '%.2f%% (%d of %d) have some pathway' % (any_pct, total_w_any, nleaves) print '%.2f%% (%d of %d) have all pathways' % (all_pct, total_w_any, nleaves) fname = 'Node_Counts.csv' f = open(fname, 'w') w = csv.writer(f) for leaf in leaves: taxon = leaf.taxon taxon_id = taxon.label w.writerow([taxon_id, leaf.count]) f.close()
class GenomeDB(object):
    """Sqlite-backed store of organism metadata and organism enzymes.

    Two tables are used: 'organisms' (one column per slugified CSV
    header plus 'broad_oxygen_requirement') and 'organism_enzymes'
    (organism, EC).
    """

    OXY_REQ = 'Oxygen Requirement'

    CSV_HEADERS = [
        'Genome Name', 'Strain', 'Genome Status', 'KEGG ID',
        'NCBI Taxon ID', 'Project ID', 'RefSeq Project ID',
        'Super Kingdom', 'Genus', 'Gram Stain', 'Shape', 'Motility',
        'Pathogenic in', 'Genome Size', 'GC Content', 'Salinity',
        'Temperature Range', 'Habitat', OXY_REQ, 'Energy Source',
        'Energy Category', 'Metabolism', 'Sequencing Center'
    ]

    # Maps a table column name back to the CSV header it is read from.
    CSV_HEADER_MAPPING = dict((util.slugify(h), h) for h in CSV_HEADERS)
    CSV_HEADER_MAPPING['phylogenetic_group'] = 'Group'
    CSV_HEADER_MAPPING['phylogenetic_order'] = 'Order'

    # 'broad_oxygen_requirement' must stay last: Populate fills it in by
    # overwriting the final element of each row.
    ORG_TABLE_HEADERS = ([util.slugify(h) for h in CSV_HEADERS] +
                         ['broad_oxygen_requirement'])
    ENZ_TABLE_HEADERS = ['organism', 'EC']

    def __init__(self, db_filename):
        """Open the sqlite database stored in db_filename."""
        self.db = database.SqliteDatabase(db_filename)

    def _InitTables(self):
        """Create both tables, keeping them if they already exist."""
        self.db.CreateTable('organisms', self.ORG_TABLE_HEADERS,
                            drop_if_exists=False)
        self.db.CreateTable('organism_enzymes', self.ENZ_TABLE_HEADERS,
                            drop_if_exists=False)

    def _OrganismField(self, field, key_column, key):
        """Return `field` of the first organism whose key_column == key.

        Returns None when no organism matches.
        """
        # NOTE(review): the key is interpolated into the SQL text. If it
        # can ever come from untrusted input, switch to bound parameters
        # (not done here because the Execute wrapper's signature is not
        # visible from this file).
        query = "SELECT %s FROM organisms WHERE %s='%s'" % (
            field, key_column, key)
        rows = list(self.db.Execute(query))
        if not rows:
            return None
        return rows[0][0]

    def OrganismsForEC(self, ec):
        """Yield every organism recorded as having the enzyme `ec`."""
        q = self.db.Execute(
            "SELECT organism FROM organism_enzymes WHERE EC='%s'" % ec)
        for i in q:
            yield i[0]

    def KEGG2NCBI(self, kegg_id):
        """Return the NCBI taxon id for a KEGG id, or None."""
        return self._OrganismField('ncbi_taxon_id', 'kegg_id', kegg_id)

    def NCBI2EnergyCategory(self, ncbi_taxon):
        """Return the energy category for an NCBI taxon id, or None."""
        return self._OrganismField('energy_category', 'ncbi_taxon_id',
                                   ncbi_taxon)

    def NCBI2BroadOxygenReq(self, ncbi_taxon):
        """Return the broad oxygen requirement for an NCBI taxon id."""
        return self._OrganismField('broad_oxygen_requirement',
                                   'ncbi_taxon_id', ncbi_taxon)

    def KEGG2BroadOxygenReq(self, kegg_id):
        """Return the broad oxygen requirement for a KEGG id, or None."""
        return self._OrganismField('broad_oxygen_requirement', 'kegg_id',
                                   kegg_id)

    def KEGG2EnergyCategory(self, kegg_id):
        """Return the energy category for a KEGG id, or None."""
        return self._OrganismField('energy_category', 'kegg_id', kegg_id)

    def KEGG2EnergySource(self, kegg_id):
        """Return the energy source for a KEGG id, or None."""
        return self._OrganismField('energy_source', 'kegg_id', kegg_id)

    def KEGG2Metabolism(self, kegg_id):
        """Return the metabolism for a KEGG id, or None."""
        return self._OrganismField('metabolism', 'kegg_id', kegg_id)

    def SelectOrganismFields(self, fields):
        """Return the result of selecting `fields` for all organisms."""
        query = 'SELECT %s FROM organisms' % ', '.join(fields)
        return self.db.Execute(query)

    @staticmethod
    def GetBroadyOxyReq(req_str):
        """Map a free-text oxygen requirement to a broad category.

        Args:
            req_str: the oxygen requirement text; may be None or empty.

        Returns:
            None for a missing/empty string, otherwise one of
            'anaerobe', 'facultative', 'microaerophile' or 'aerobe'.

        Raises:
            ValueError: if the text matches none of the categories.
        """
        if not req_str:
            return None

        l_req = req_str.lower()
        # Order matters: 'anaerobe' and 'microaerophile' both contain
        # 'aero', so the more specific patterns are tested first.
        if 'anaer' in l_req:
            return 'anaerobe'
        if 'facult' in l_req:
            return 'facultative'
        if 'microaero' in l_req:
            return 'microaerophile'
        if 'aerob' in l_req:
            return 'aerobe'

        raise ValueError('Couldnt Parse!')

    def Populate(self, filename):
        """Populates the database from files.

        Organism rows come from the CSV file `filename`; organism
        enzymes come from the Kegg instance's EC-to-enzyme map.

        Args:
            filename: path of the CSV file with the organism data.
        """
        self._InitTables()

        with open(filename) as f:
            for row in csv.DictReader(f):
                insert_row = []
                for table_header in self.ORG_TABLE_HEADERS:
                    if table_header not in self.CSV_HEADER_MAPPING:
                        insert_row.append(None)
                        continue

                    csv_header = self.CSV_HEADER_MAPPING[table_header]
                    val = row.get(csv_header, None)
                    if val and val.strip():
                        insert_row.append(val)
                    else:
                        insert_row.append(None)

                # An unrecognised requirement is stored as missing
                # instead of aborting the whole import.
                try:
                    broad_req = self.GetBroadyOxyReq(
                        row.get(self.OXY_REQ, None))
                except ValueError:
                    broad_req = None
                insert_row[-1] = broad_req

                self.db.Insert('organisms', insert_row)

        k = Kegg.getInstance(loadFromAPI=False)
        enzyme_map = k.ec2enzyme_map
        for ec, enzyme in enzyme_map.iteritems():
            for org in enzyme.genes.keys():
                self.db.Insert('organism_enzymes', [org.lower(), ec])