def add_bigg_metabolites(bigg_list, model):
    """ Create a COBRA metabolite from a BiGG metabolite.

    Parameters
    ----------
    bigg_list : list of dict
        List of dictionaries with BiGG metabolite data
    model : cobra.core.Model
        Model to add metabolites to
    """

    # Create a Metabolite object for each BiGG metabolite.
    metabolites = DictList()
    for bigg_metabolite in bigg_list:
        # Available data is different for a metabolite from an organism model
        # versus a metabolite from the universal model.
        if 'compartment_bigg_id' in bigg_metabolite:
            compartment = bigg_metabolite['compartment_bigg_id']
        elif 'compartments_in_models' in bigg_metabolite:
            compartment = bigg_metabolite['compartments_in_models'][0]['bigg_id']
        else:
            raise ValueError('BiGG metabolite {0} does not have a compartment'
                             .format(bigg_metabolite['bigg_id']))

        metabolite = Metabolite(id='{0}_{1}'.format(bigg_metabolite['bigg_id'], compartment),
                                name=bigg_metabolite['name'],
                                compartment=compartment)
        try:
            metabolite.formula = bigg_metabolite['formula']
        except KeyError:
            try:
                if len(bigg_metabolite['formulae']) > 0:
                    metabolite.formula = bigg_metabolite['formulae'][0]
            except KeyError:
                pass
        try:
            metabolite.charge = bigg_metabolite['charge']
        except KeyError:
            try:
                if len(bigg_metabolite['charges']) > 0:
                    metabolite.charge = bigg_metabolite['charges'][0]
            except KeyError:
                pass
        if len(bigg_metabolite['database_links']) > 0:
            metabolite.notes['aliases'] = bigg_metabolite['database_links']
        metabolites.append(metabolite)

        if compartment not in model.compartments:
            try:
                model.compartments[compartment] = bigg_metabolite['compartment_name']
            except KeyError:
                model.compartments[compartment] = 'unknown'

    # Add all of the metabolites to the model.
    model.add_metabolites(metabolites)
    return
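# Usage sketch for the function above. The dict is NOT real BiGG API output;
# it only mirrors the fields the function reads. Assumes cobra is installed
# and that Metabolite/DictList are imported in the function's home module.
from cobra import Model

example_bigg_list = [{
    'bigg_id': 'glc__D',
    'name': 'D-Glucose',
    'compartment_bigg_id': 'c',
    'compartment_name': 'cytosol',
    'formulae': ['C6H12O6'],
    'charges': [0],
    'database_links': {},
}]
example_model = Model('example')
add_bigg_metabolites(example_bigg_list, example_model)
# The metabolite ID is built as '<bigg_id>_<compartment>':
assert example_model.metabolites.get_by_id('glc__D_c').formula == 'C6H12O6'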
def test_independent():
    a = DictList([Object("o1"), Object("o2")])
    b = DictList()
    assert "o1" in a
    assert "o1" not in b
    b.append(Object("o3"))
    assert "o3" not in a
    assert "o3" in b
def test_init_copy(dict_list):
    obj, test_list = dict_list
    test_list.append(Object("test2"))
    copied = DictList(test_list)
    assert test_list is not copied
    assert isinstance(copied, test_list.__class__)
    assert len(test_list) == len(copied)
    for i, v in enumerate(test_list):
        assert test_list[i].id == copied[i].id
        assert i == copied.index(v.id)
        assert test_list[i] is copied[i]
        assert v is copied.get_by_id(v.id)
@property
def subunits(self):
    """DictList: Subunits represented as a DictList of Protein objects"""
    # TODO: [VizRecon]
    # TODO: will need to adapt this to allow for input of previously created Protein objects
    subunits = DictList()
    for s in self.subunit_dict:
        subunits.append(Protein(ident=s,
                                description='Subunit of complex {}'.format(self.id),
                                root_dir=self.complex_dir,
                                pdb_file_type=self.pdb_file_type))
    return subunits
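# Pattern sketch: the property above rebuilds a DictList keyed by subunit ID
# on every access. The same pattern with plain cobra Objects standing in for
# ssbio Protein objects (the UniProt-style IDs below are hypothetical):
from cobra.core import DictList, Object

subunit_dict = {'P0A6F5': 14, 'P0A6F9': 7}  # id -> stoichiometry
subunits = DictList(Object(ident) for ident in subunit_dict)
assert subunits.has_id('P0A6F5')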
def filter_out_spontaneous_genes(genes, custom_spont_id=None):
    """Return the DictList of genes that are not spontaneous in a model.

    Args:
        genes (DictList): Genes DictList
        custom_spont_id (str): Optional custom spontaneous ID if it does not match the
            regular expression ``[Ss](_|)0001``

    Returns:
        DictList: Genes excluding ones that are spontaneous

    """
    new_genes = DictList()
    for gene in genes:
        if not is_spontaneous(gene, custom_id=custom_spont_id):
            new_genes.append(gene)
    return new_genes
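# Usage sketch, assuming ``is_spontaneous`` implements the ``[Ss](_|)0001``
# check documented above (it lives alongside this function):
from cobra import Gene
from cobra.core import DictList

genes = DictList([Gene('b0001'), Gene('s0001')])  # 's0001' matches the spontaneous pattern
non_spontaneous = filter_out_spontaneous_genes(genes)
assert [g.id for g in non_spontaneous] == ['b0001']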
def test_sort_and_reverse():
    dl = DictList(Object("test%d" % i) for i in reversed(range(10)))
    assert dl[0].id == "test9"
    dl.sort()
    assert len(dl) == 10
    assert dl[0].id == "test0"
    assert dl.index("test0") == 0
    dl.reverse()
    assert dl[0].id == "test9"
    assert dl.index("test0") == 9
def __init__(self, ident, description=None, chains=None, mapped_chains=None,
             is_experimental=False, structure_path=None, file_type=None):
    """Initialize a StructProp object.

    Args:
        ident (str): Unique identifier for this structure
        description (str): Optional human-readable description
        chains (str, list): Chain ID or list of IDs
        mapped_chains (str, list): A chain ID or IDs to indicate what chains should be analyzed
        is_experimental (bool): Flag to indicate if structure is an experimental or computational model
        structure_path (str): Path to structure file
        file_type (str): Type of structure file - ``pdb``, ``pdb.gz``, ``mmcif``, ``cif``,
            ``cif.gz``, ``xml.gz``, ``mmtf``, ``mmtf.gz``

    """
    Object.__init__(self, id=ident, description=description)

    self.is_experimental = is_experimental

    # Chain information
    # chains is a DictList of ChainProp objects
    # If you run self.parse_structure(), all chains will be parsed and stored here
    # Use mapped_chains below to keep track of chains you are interested in
    self.chains = DictList()
    if chains:
        self.add_chain_ids(chains)
    # mapped_chains is an ordered list of mapped chain IDs which would come from BLAST
    # or the best_structures API
    self.mapped_chains = []
    if mapped_chains:
        self.add_mapped_chain_ids(mapped_chains)

    # File information
    self.file_type = file_type
    self._structure_dir = None
    self.structure_file = None
    if structure_path:
        self.load_structure_path(structure_path, file_type)
def filter_reaction_by_subsystems(self):
    # Group reactions by their subsystem annotation.
    subsystem2reactions = {}
    for reaction in self.reactions:
        subsystem2reactions.setdefault(reaction.subsystem, [])
        subsystem2reactions[reaction.subsystem].append(reaction)

    fva_reactions = DictList()
    for subsys, reactions in subsystem2reactions.items():
        # Rank reactions by how connected their metabolites are to the rest
        # of the model, most connected first.
        reactions = sorted(
            reactions,
            key=lambda x: sum(len([r for r in m.reactions if r != x])
                              for m in x.metabolites),
            reverse=True)
        # Keep at most three representative reactions per subsystem.
        for reaction in reactions[:3]:
            fva_reactions.append(reaction)
    return fva_reactions
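# Behavior sketch: at most three reactions per subsystem survive, ranked by
# metabolite connectivity. The method only touches ``self.reactions``, so it
# is called unbound on a toy cobra Model here purely for illustration (the
# host class it belongs to is not shown in this snippet):
from cobra import Model, Metabolite, Reaction

toy = Model('toy')
shared = Metabolite('m_c', compartment='c')
for i in range(5):
    rxn = Reaction('R{}'.format(i))
    rxn.subsystem = 'Glycolysis'
    rxn.add_metabolites({shared: -1.0})
    toy.add_reactions([rxn])

picked = filter_reaction_by_subsystems(toy)
assert len(picked) == 3  # five candidates, capped at three per subsystem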
def add_bigg_reactions(bigg_list, model, ignore_pseudo_reactions=True):
    """ Create a COBRA reaction from a BiGG reaction.

    Parameters
    ----------
    bigg_list : list of dict
        List of dictionaries with BiGG reaction data
    model : cobra.core.Model
        Model to add reactions to
    ignore_pseudo_reactions : bool, optional
        When True, do not include pseudo reactions
    """

    # Create a Reaction object for each BiGG reaction.
    reactions = DictList()
    for bigg_reaction in bigg_list:
        if bigg_reaction['pseudoreaction'] and ignore_pseudo_reactions:
            continue
        reaction = Reaction(id=bigg_reaction['bigg_id'], name=bigg_reaction['name'])
        reaction.notes['aliases'] = bigg_reaction['database_links']
        metabolites = dict()
        for met in bigg_reaction['metabolites']:
            metabolite = model.metabolites.get_by_id(
                '{0}_{1}'.format(met['bigg_id'], met['compartment_bigg_id']))
            metabolites[metabolite] = met['stoichiometry']
        reaction.add_metabolites(metabolites)
        try:
            reaction.bounds = (bigg_reaction['results'][0]['lower_bound'],
                               bigg_reaction['results'][0]['upper_bound'])
        except KeyError:
            if '⇌' in bigg_reaction['reaction_string']:
                reaction.bounds = (-1000.0, 1000.0)
            else:
                warn('Unknown direction symbol in reaction string {0}'
                     .format(bigg_reaction['reaction_string']))
        reactions.append(reaction)

    # Add all of the reactions to the model.
    model.add_reactions(reactions)
    return
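# Continuing the add_bigg_metabolites sketch above: a minimal reaction dict
# with only the fields this function reads (again, not real BiGG API output).
example_reaction_list = [{
    'bigg_id': 'GLCt1',
    'name': 'Glucose transport',
    'pseudoreaction': False,
    'database_links': {},
    'metabolites': [{'bigg_id': 'glc__D', 'compartment_bigg_id': 'c',
                     'stoichiometry': -1.0}],
    'results': [{'lower_bound': -1000.0, 'upper_bound': 1000.0}],
}]
add_bigg_reactions(example_reaction_list, example_model)
assert example_model.reactions.get_by_id('GLCt1').bounds == (-1000.0, 1000.0)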
def __json_decode__(self, **attrs):
    """Build a model from a dict."""
    Model.__init__(self)
    if 'reactions' not in attrs:
        raise Exception('JSON object has no reactions attribute. Cannot load.')
    self.add_metabolites([
        cobra.io.dict.metabolite_from_dict(metabolite)
        for metabolite in attrs['metabolites']
    ])
    self.genes = DictList(attrs['genes'])
    self.add_reactions([
        cobra.io.dict.reaction_from_dict(reaction, self)
        for reaction in attrs['reactions']
    ])
    for k, v in attrs.items():
        if k in {'id', 'name', 'notes', 'compartments', 'annotation'}:
            setattr(self, k, v)
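# Shape sketch: the ``attrs`` this hook consumes match what cobra's dict
# serializer emits. The hook itself is assumed to be invoked by a JSON
# library (e.g. the json_tricks-style __json_encode__/__json_decode__
# protocol) with the stored attributes as keyword arguments.
from cobra import Model
from cobra.io.dict import model_to_dict

attrs = model_to_dict(Model('demo'))
assert {'id', 'metabolites', 'genes', 'reactions'} <= set(attrs)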
def test_set():
    obj_list = DictList(Object("test%d" % i) for i in range(10))
    obj_list[4] = Object("testa")
    assert obj_list.index("testa") == 4
    assert obj_list[4].id == "testa"
    obj_list[5:7] = [Object("testb"), Object("testc")]
    assert obj_list.index("testb") == 5
    assert obj_list[5].id == "testb"
    assert obj_list.index("testc") == 6
    assert obj_list[6].id == "testc"
    # Even if the object is unique, if it is present twice in the new
    # list, it should still raise an exception
    with pytest.raises(ValueError):
        obj_list.__setitem__(slice(5, 7), [Object("testd"), Object("testd")])
def __init__(self, ident, description=None, chains=None, mapped_chains=None,
             structure_path=None, file_type=None):
    StructProp.__init__(self, ident, description=description, chains=chains,
                        mapped_chains=mapped_chains, is_experimental=True,
                        structure_path=structure_path, file_type=file_type)
    self.experimental_method = None
    self.resolution = None
    self.date = None
    self.taxonomy_name = None
    self.biological_assemblies = DictList()
    """DictList: A list for storing Bioassembly objects related to this PDB ID"""
def test_removal():
    obj_list = DictList(Object("test%d" % i) for i in range(2, 10))
    del obj_list[3]
    assert "test5" not in obj_list
    assert obj_list.index(obj_list[-1]) == len(obj_list) - 1
    assert len(obj_list) == 7
    del obj_list[3:5]
    assert "test6" not in obj_list
    assert "test7" not in obj_list
    assert obj_list.index(obj_list[-1]) == len(obj_list) - 1
    assert len(obj_list) == 5
    removed = obj_list.pop(1)
    assert obj_list.index(obj_list[-1]) == len(obj_list) - 1
    assert removed.id == "test3"
    assert "test3" not in obj_list
    assert len(obj_list) == 4
    removed = obj_list.pop()
    assert removed.id == "test9"
    assert removed.id not in obj_list
    assert len(obj_list) == 3
@pytest.fixture
def dict_list():
    # Shared fixture for the DictList tests above: one object in one list.
    obj = Object("test1")
    test_list = DictList()
    test_list.append(obj)
    return obj, test_list
class KeggDatabase(object):
    """ Base class for managing a KEGG flat file database. """

    def __init__(self, filename):
        """ Initialize object.

        Parameters
        ----------
        filename : str
            Path to database file
        """
        self.filename = filename
        self.records = DictList()
        return

    def get_record(self, handle):
        """ Yield records from a database file, one at a time.

        A record is all lines up to and including the ``///`` terminator.

        Parameters
        ----------
        handle : file handle
            File handle of database file

        Yields
        ------
        list of str
            List of lines in the next record
        """
        record = list()
        for line in handle:
            record.append(line.strip('\n'))
            if line[:3] == '///':
                yield record
                record = list()
                continue

    def store(self, file_name=None):
        """ Save the database to a flat file.

        Parameters
        ----------
        file_name : str, optional
            Path to database file
        """
        if file_name is None:
            file_name = self.filename

        # Convert all of the record objects to flat file database records
        # and write to the file.
        self.records.sort()
        with open(file_name, 'w') as handle:
            for index in range(len(self.records)):
                for line in self.records[index].make_record():
                    handle.write(line + '\n')
        return

    def update(self, new_object):
        """ Update a record in the database (add new or replace existing record).

        Parameters
        ----------
        new_object : object
            Record object to add or replace
        """
        # Replace the current object if it already exists in the database.
        if self.records.has_id(new_object.id):
            self.records._replace_on_id(new_object)
            return

        # Add the new object to the database.
        self.records += [new_object]
        return

    def size(self):
        """ Get the number of records in the database.

        Returns
        -------
        int
            Number of records in database
        """
        return len(self.records)

    def has_id(self, id):
        """ Check if an ID exists in the database.

        Parameters
        ----------
        id : str
            ID to check

        Returns
        -------
        bool
            True when ID exists, otherwise False
        """
        return self.records.has_id(id)

    def get_by_id(self, id):
        """ Get a record with the specified ID.

        Parameters
        ----------
        id : str
            ID of record to return

        Returns
        -------
        object
            Object with specified ID
        """
        return self.records.get_by_id(id)
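# Usage sketch for the class above: it expects record objects exposing an
# ``id`` attribute and a ``make_record()`` method. ``FlatRecord`` is a
# hypothetical minimal record type, and the path is hypothetical too.
class FlatRecord(object):
    def __init__(self, id):
        self.id = id

    def make_record(self):
        return ['ENTRY       ' + self.id, '///']

db = KeggDatabase('/tmp/demo.kegg')
db.update(FlatRecord('R00001'))
assert db.has_id('R00001') and db.size() == 1
db.store()  # writes one flat-file record to /tmp/demo.kegg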
class Ensemble(Object):
    """
    Ensemble of metabolic models

    Parameters
    ----------
    identifier : string
        The identifier to associate with the ensemble as a string.
    list_of_models : list of cobra.core.model.Model
        Either a list of existing Model objects, in which case a new Model
        object is instantiated and an ensemble is constructed using the list
        of Models, or None/empty list, in which case an ensemble is created
        with empty attributes.
    name : string
        Human-readable name for the ensemble

    Attributes
    ----------
    base_model : Model
        A cobra.core.Model that contains all variable and invariable
        components of an ensemble.
    members : DictList
        A DictList where the key is the member identifier and the value is a
        medusa.core.member.Member object
    features : DictList
        A DictList where the key is the feature identifier and the value is a
        medusa.core.feature.Feature object
    """

    def __init__(self, list_of_models=None, identifier=None, name=None):
        Object.__init__(self, identifier, name)
        # Use None as the default rather than a mutable default argument,
        # which would be shared across all calls.
        if list_of_models is None:
            list_of_models = []
        if len(list_of_models) > 1:
            if not all(isinstance(x, Model) for x in list_of_models):
                raise AttributeError(
                    "list_of_models may only contain cobra.core.Model objects")
            if len([model.id for model in list_of_models]) > \
                    len(set([model.id for model in list_of_models])):
                raise AssertionError(
                    "Ensemble members cannot have duplicate model ids.")
            self.features = DictList()
            self._populate_features_base(list_of_models)

            self.members = DictList()
            self._populate_members(list_of_models)

        else:
            if len(list_of_models) == 0:
                self.base_model = Model(id_or_model=identifier + '_base_model',
                                        name=name)
            else:
                if not isinstance(list_of_models[0], Model):
                    raise AttributeError(
                        "list_of_models may only contain cobra.core.Model objects")
                self.base_model = list_of_models[0]

    def _populate_features_base(self, list_of_models):
        # Determine all reactions across all models and construct the base model
        all_reactions = set()
        base_model = list_of_models[0].copy()
        all_reactions = all_reactions | set(
            [rxn.id for rxn in base_model.reactions])
        for model in list_of_models:
            new_reactions = set([rxn.id for rxn in model.reactions]) - \
                all_reactions
            reactions_to_add = [
                model.reactions.get_by_id(rxn) for rxn in new_reactions
            ]
            base_model.add_reactions(reactions_to_add)
            all_reactions = all_reactions | set(
                [rxn.id for rxn in model.reactions])
        all_reactions = list(all_reactions)

        # Determine reactions that vary in any model and construct a feature
        # for each unique parameter value for that reaction in the ensemble
        variable_reactions = []
        for reaction in all_reactions:
            rxn_vals = {}
            for model in list_of_models:
                rxn_vals[model.id] = {}
                if reaction in [x.id for x in model.reactions]:
                    rxn = model.reactions.get_by_id(reaction)
                    for reaction_attribute in REACTION_ATTRIBUTES:
                        rxn_vals[model.id][reaction_attribute] = \
                            getattr(rxn, reaction_attribute)
                else:
                    # for reactions not present in this model, select the default
                    for reaction_attribute in REACTION_ATTRIBUTES:
                        rxn_vals[model.id][reaction_attribute] = \
                            MISSING_ATTRIBUTE_DEFAULT[reaction_attribute]
            rxn_vals = pd.DataFrame(rxn_vals).T
            for reaction_attribute in REACTION_ATTRIBUTES:
                if len(rxn_vals[reaction_attribute].unique()) > 1:
                    rxn_from_base = base_model.reactions.get_by_id(reaction)
                    feature_id = rxn_from_base.id + '_' + reaction_attribute
                    states = rxn_vals[reaction_attribute].to_dict()
                    feature = Feature(ensemble=self,
                                      identifier=feature_id,
                                      name=rxn_from_base.name,
                                      base_component=rxn_from_base,
                                      component_attribute=reaction_attribute,
                                      states=states)
                    self.features += [feature]
                    variable_reactions.append(reaction)

        self.base_model = base_model

    def _populate_members(self, list_of_models):
        for model in list_of_models:
            model_states = dict()
            for feature in self.features:
                model_states[feature] = feature.get_model_state(model.id)
            member = Member(ensemble=self,
                            identifier=model.id,
                            name=model.name,
                            states=model_states)
            self.members += [member]

    def set_state(self, member):
        """Set the state of the base model to represent a single member"""
        # if member was passed as an id, get the actual member object
        if isinstance(member, str):
            member = self.members.get_by_id(member)

        for feature in self.features:
            if isinstance(feature.base_component, cobra.core.Reaction):
                setattr(feature.base_component,
                        feature.component_attribute,
                        feature.states[member.id])
            else:
                raise AttributeError(
                    "Only cobra.core.Reaction supported for base_component type")

    def to_pickle(self, filename):
        """ Save an ensemble as a pickled object. Pickling is currently the only
        supported method for saving and loading ensembles.

        Parameters
        ----------
        filename : String
            Location to save the pickle.
        """
        with open(filename, "wb") as outfile:
            dump(self, outfile, protocol=4)
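# Construction sketch for the class above: two copies of a toy model whose
# ids differ and whose R1 lower bound differs, so the ensemble gets two
# members and a variable feature for that bound. Assumes medusa's
# module-level REACTION_ATTRIBUTES includes 'lower_bound'.
from cobra import Model, Metabolite, Reaction

def _toy(model_id, lower_bound):
    model = Model(model_id)
    met = Metabolite('a_c', compartment='c')
    rxn = Reaction('R1', lower_bound=lower_bound, upper_bound=1000.0)
    rxn.add_metabolites({met: -1.0})
    model.add_reactions([rxn])
    return model

ensemble = Ensemble(list_of_models=[_toy('m1', -1000.0), _toy('m2', 0.0)],
                    identifier='demo_ensemble')
assert len(ensemble.members) == 2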
class StructProp(Object):
    """Generic class to represent information for a protein structure.

    The main utilities of this class are to:

    * Provide access to the 3D coordinates using a Biopython Structure object through the
      method ``parse_structure``.
    * Run predictions and computations on the structure
    * Analyze specific chains using the ``mapped_chains`` attribute
    * Provide wrapper methods to ``nglview`` to view the structure in a Jupyter notebook

    Args:
        ident (str): Unique identifier for this structure
        description (str): Optional human-readable description
        chains (str, list): Chain ID or list of IDs
        mapped_chains (str, list): A chain ID or IDs to indicate what chains should be analyzed
        is_experimental (bool): Flag to indicate if structure is an experimental or computational model
        structure_path (str): Path to structure file
        file_type (str): Type of structure file - ``pdb``, ``pdb.gz``, ``mmcif``, ``cif``,
            ``cif.gz``, ``xml.gz``, ``mmtf``, ``mmtf.gz``

    """

    def __init__(self, ident, description=None, chains=None, mapped_chains=None,
                 is_experimental=False, structure_path=None, file_type=None):
        Object.__init__(self, id=ident, description=description)

        self.is_experimental = is_experimental
        """bool: Flag to note if this structure is an experimental model or a homology model"""

        # Chain information
        # chains is a DictList of ChainProp objects
        # If you run self.parse_structure(), all chains will be parsed and stored here
        # Use mapped_chains below to keep track of chains you are interested in
        self.chains = DictList()
        """DictList: A DictList of ChainProp objects, each storing its chain sequence along
        with residue-specific annotations"""
        if chains:
            self.add_chain_ids(chains)
        # mapped_chains is an ordered list of mapped chain IDs which would come from BLAST
        # or the best_structures API
        self.mapped_chains = []
        """list: A simple list of chain IDs (strings) that will be used to subset analyses"""
        if mapped_chains:
            self.add_mapped_chain_ids(mapped_chains)

        self.parsed = False
        """bool: Simple flag to track if this structure has had its structure + chain sequences parsed"""
        # XTODO: rename to sequence_parsed or something similar

        # File information
        self.file_type = file_type
        """str: Type of structure file"""
        self._structure_dir = None
        self.structure_file = None
        """str: Name of the structure file"""
        if structure_path:
            self.load_structure_path(structure_path, file_type)

        self.structure = None
        """Structure: Biopython Structure object, only used if ``store_in_memory`` option of
        ``parse_structure`` is set to True"""

    @property
    def structure_dir(self):
        if not self._structure_dir:
            raise OSError('No structure folder set')
        return self._structure_dir

    @structure_dir.setter
    def structure_dir(self, path):
        if path and not op.exists(path):
            raise OSError('{}: folder does not exist'.format(path))
        self._structure_dir = path

    @property
    def structure_path(self):
        if not self.structure_file:
            raise OSError('{}: structure file not available'.format(self.id))
        path = op.join(self.structure_dir, self.structure_file)
        if not op.exists(path):
            raise ValueError('{}: file does not exist'.format(path))
        return path

    def load_structure_path(self, structure_path, file_type):
        """Load a structure file and provide pointers to its location

        Args:
            structure_path (str): Path to structure file
            file_type (str): Type of structure file
        """
        if not file_type:
            raise ValueError('File type must be specified')
        self.file_type = file_type
        self.structure_dir = op.dirname(structure_path)
        self.structure_file = op.basename(structure_path)

    def parse_structure(self, store_in_memory=False):
        """Read the 3D coordinates of a structure file and return it as a Biopython
        Structure object. Also create ChainProp objects in the chains attribute for
        each chain in the first model.

        Args:
            store_in_memory (bool): If the Biopython Structure object should be stored
                in the attribute ``structure``.

        Returns:
            Structure: Biopython Structure object
        """
        # TODO: perhaps add option to parse into ProDy object?
        if not self.structure_file:
            log.error('{}: no structure file, unable to parse'.format(self.id))
            return None
        else:
            # Add Biopython structure object
            structure = StructureIO(self.structure_path, self.file_type)

            # Add all chains to self.chains as ChainProp objects
            structure_chains = [x.id for x in structure.first_model.child_list]
            self.add_chain_ids(structure_chains)
            self.get_structure_seqs(structure.first_model)

            # Also add all chains to self.mapped_chains ONLY if there are none specified
            if not self.mapped_chains:
                self.add_mapped_chain_ids(structure_chains)

            self.parsed = True

            if store_in_memory:
                self.structure = structure

            return structure

    def clean_structure(self, out_suffix='_clean', outdir=None, force_rerun=False,
                        remove_atom_alt=True, keep_atom_alt_id='A', remove_atom_hydrogen=True,
                        add_atom_occ=True, remove_res_hetero=True, keep_chemicals=None,
                        keep_res_only=None, add_chain_id_if_empty='X', keep_chains=None):
        """Clean the structure file associated with this structure, and save it as a
        new file. Returns the file path.

        Args:
            out_suffix (str): Suffix to append to original filename
            outdir (str): Path to output directory
            force_rerun (bool): If structure should be re-cleaned if a clean file exists already
            remove_atom_alt (bool): Remove alternate positions
            keep_atom_alt_id (str): If removing alternate positions, which alternate ID to keep
            remove_atom_hydrogen (bool): Remove hydrogen atoms
            add_atom_occ (bool): Add atom occupancy fields if not present
            remove_res_hetero (bool): Remove all HETATMs
            keep_chemicals (str, list): If removing HETATMs, keep specified chemical names
            keep_res_only (str, list): Keep ONLY specified resnames, deletes everything else!
            add_chain_id_if_empty (str): Add a chain ID if not present
            keep_chains (str, list): Keep only these chains

        Returns:
            str: Path to cleaned PDB file

        """
        if not self.structure_file:
            log.error('{}: no structure file, unable to clean'.format(self.id))
            return None

        clean_pdb_file = ssbio.protein.structure.utils.cleanpdb.clean_pdb(
            self.structure_path, out_suffix=out_suffix, outdir=outdir,
            force_rerun=force_rerun, remove_atom_alt=remove_atom_alt,
            remove_atom_hydrogen=remove_atom_hydrogen, keep_atom_alt_id=keep_atom_alt_id,
            add_atom_occ=add_atom_occ, remove_res_hetero=remove_res_hetero,
            keep_chemicals=keep_chemicals, keep_res_only=keep_res_only,
            add_chain_id_if_empty=add_chain_id_if_empty, keep_chains=keep_chains)

        return clean_pdb_file

    def add_mapped_chain_ids(self, mapped_chains):
        """Add chains by ID into the mapped_chains attribute

        Args:
            mapped_chains (str, list): Chain ID or list of IDs
        """
        mapped_chains = ssbio.utils.force_list(mapped_chains)
        for c in mapped_chains:
            if c not in self.mapped_chains:
                self.mapped_chains.append(c)
                log.debug('{}: added to list of mapped chains'.format(c))
            else:
                log.debug('{}: chain already in list of mapped chains, not adding'.format(c))

    def add_chain_ids(self, chains):
        """Add chains by ID into the chains attribute

        Args:
            chains (str, list): Chain ID or list of IDs
        """
        chains = ssbio.utils.force_list(chains)
        for c in chains:
            if self.chains.has_id(c):
                log.debug('{}: chain already present'.format(c))
            else:
                chain_prop = ChainProp(ident=c, pdb_parent=self.id)
                self.chains.append(chain_prop)
                log.debug('{}: added to chains list'.format(c))

    def get_structure_seqs(self, model):
        """Gather chain sequences and store in their corresponding ``ChainProp``
        objects in the ``chains`` attribute.

        Args:
            model (Model): Biopython Model object of the structure you would like to parse
        """
        # Don't overwrite existing ChainProp objects
        dont_overwrite = []
        chains = list(model.get_chains())
        for x in chains:
            if self.chains.has_id(x.id):
                if self.chains.get_by_id(x.id).seq_record:
                    dont_overwrite.append(x.id)
        if len(dont_overwrite) == len(chains):
            log.debug('Not writing structure sequences, already stored')
            return

        # Returns the structure's sequences with Xs added
        structure_seqs = ssbio.protein.structure.properties.residues.get_structure_seqrecords(model)
        log.debug('{}: gathered chain sequences'.format(self.id))

        # Associate with ChainProps
        for seq_record in structure_seqs:
            log.debug('{}: adding chain sequence to ChainProp'.format(seq_record.id))
            my_chain = self.chains.get_by_id(seq_record.id)
            my_chain.seq_record = seq_record

    def reset_chain_seq_records(self):
        for x in self.chains:
            x.reset_seq_record()

    def get_dict_with_chain(self, chain, only_keys=None, chain_keys=None,
                            exclude_attributes=None, df_format=False):
        """get_dict method which incorporates attributes found in a specific chain.
        Does not overwrite any attributes in the original StructProp.

        Args:
            chain: Chain ID whose attributes should be incorporated
            only_keys: Attributes of the StructProp to return
            chain_keys: Attributes of the chain to return
            exclude_attributes: Attributes to exclude
            df_format: If dictionary values should be formatted for a DataFrame

        Returns:
            dict: attributes of StructProp + the chain specified

        """
        # Choose attributes to return, return everything in the object if a list
        # is not specified
        if not only_keys:
            keys = list(self.__dict__.keys())
        else:
            keys = ssbio.utils.force_list(only_keys)

        # Remove keys you don't want returned
        if exclude_attributes:
            exclude_attributes = ssbio.utils.force_list(exclude_attributes)
            for x in exclude_attributes:
                if x in keys:
                    keys.remove(x)
        else:
            exclude_attributes = []
        exclude_attributes.extend(['mapped_chains', 'chains'])

        final_dict = {k: v for k, v in Object.get_dict(
            self, only_attributes=keys, exclude_attributes=exclude_attributes,
            df_format=df_format).items()}

        chain_prop = self.chains.get_by_id(chain)
        # Filter out keys that show up in StructProp
        if not chain_keys:
            chain_keys = [x for x in chain_prop.get_dict().keys() if x not in final_dict]

        chain_dict = chain_prop.get_dict(only_attributes=chain_keys, df_format=df_format)
        final_dict.update(chain_dict)

        return final_dict

    def find_disulfide_bridges(self, threshold=3.0):
        """Run Biopython's search_ss_bonds to find potential disulfide bridges for
        each chain and store in ChainProp.

        Will add a list of tuple pairs into the annotations field, which looks like this::

            [((' ', 79, ' '), (' ', 110, ' ')),
             ((' ', 174, ' '), (' ', 180, ' ')),
             ((' ', 369, ' '), (' ', 377, ' '))]

        Where each pair is a pair of cysteine residues close together in space.
        """
        if self.structure:
            parsed = self.structure
        else:
            parsed = self.parse_structure()
        if not parsed:
            log.error('{}: unable to open structure to find S-S bridges'.format(self.id))
            return

        disulfide_bridges = ssbio.protein.structure.properties.residues.search_ss_bonds(
            parsed.first_model, threshold=threshold)
        if not disulfide_bridges:
            log.debug('{}: no disulfide bridges found'.format(self.id))

        for chain, bridges in disulfide_bridges.items():
            self.chains.get_by_id(chain).seq_record.annotations['SSBOND-biopython'] = \
                disulfide_bridges[chain]
            log.debug('{}: found {} disulfide bridges'.format(chain, len(bridges)))
            log.debug('{}: stored disulfide bridges in the chain\'s seq_record annotations'.format(chain))

    def get_dssp_annotations(self, outdir, force_rerun=False):
        """Run DSSP on this structure and store the DSSP annotations in the
        corresponding ChainProp SeqRecords.

        Calculations are stored in the ChainProp's ``letter_annotations`` at the
        following keys:

        * ``SS-dssp``
        * ``RSA-dssp``
        * ``ASA-dssp``
        * ``PHI-dssp``
        * ``PSI-dssp``

        Args:
            outdir (str): Path to where DSSP dataframe will be stored.
            force_rerun (bool): If DSSP results should be recalculated

        TODO:
            * Also parse global properties, like total accessible surface area.
              Don't think Biopython parses those?

        """
        if self.structure:
            parsed = self.structure
        else:
            parsed = self.parse_structure()
        if not parsed:
            log.error('{}: unable to open structure to run DSSP'.format(self.id))
            return

        log.debug('{}: running DSSP'.format(self.id))
        dssp_results = ssbio.protein.structure.properties.dssp.get_dssp_df(
            model=parsed.first_model, pdb_file=self.structure_path,
            outdir=outdir, force_rerun=force_rerun)

        if dssp_results.empty:
            log.error('{}: unable to run DSSP'.format(self.id))
            return

        chains = dssp_results.chain.unique()
        dssp_summary = ssbio.protein.structure.properties.dssp.secondary_structure_summary(dssp_results)

        for chain in chains:
            ss = dssp_results[dssp_results.chain == chain].ss.tolist()
            exposure_rsa = dssp_results[dssp_results.chain == chain].exposure_rsa.tolist()
            exposure_asa = dssp_results[dssp_results.chain == chain].exposure_asa.tolist()
            phi = dssp_results[dssp_results.chain == chain].phi.tolist()
            psi = dssp_results[dssp_results.chain == chain].psi.tolist()

            chain_prop = self.chains.get_by_id(chain)
            chain_seq = chain_prop.seq_record

            # Making sure the X's are filled in
            ss = ssbio.protein.structure.properties.residues.match_structure_sequence(
                orig_seq=chain_seq, new_seq=ss, fill_with='-')
            exposure_rsa = ssbio.protein.structure.properties.residues.match_structure_sequence(
                orig_seq=chain_seq, new_seq=exposure_rsa, fill_with=float('Inf'))
            exposure_asa = ssbio.protein.structure.properties.residues.match_structure_sequence(
                orig_seq=chain_seq, new_seq=exposure_asa, fill_with=float('Inf'))
            phi = ssbio.protein.structure.properties.residues.match_structure_sequence(
                orig_seq=chain_seq, new_seq=phi, fill_with=float('Inf'))
            psi = ssbio.protein.structure.properties.residues.match_structure_sequence(
                orig_seq=chain_seq, new_seq=psi, fill_with=float('Inf'))

            chain_prop.seq_record.annotations.update(dssp_summary[chain])

            chain_prop.seq_record.letter_annotations['SS-dssp'] = ss
            chain_prop.seq_record.letter_annotations['RSA-dssp'] = exposure_rsa
            chain_prop.seq_record.letter_annotations['ASA-dssp'] = exposure_asa
            chain_prop.seq_record.letter_annotations['PHI-dssp'] = phi
            chain_prop.seq_record.letter_annotations['PSI-dssp'] = psi
            log.debug('{}: stored DSSP annotations in chain seq_record letter_annotations'.format(chain))

    def get_msms_annotations(self, outdir, force_rerun=False):
        """Run MSMS on this structure and store the residue depths/ca depths in the
        corresponding ChainProp SeqRecords."""
        # Now can run on Biopython Model objects exclusively thanks to Biopython updates
        # if self.file_type != 'pdb':
        #     raise ValueError('{}: unable to run MSMS with "{}" file type. Please change file type to "pdb"'
        #                      .format(self.id, self.file_type))
        if self.structure:
            parsed = self.structure
        else:
            parsed = self.parse_structure()
        if not parsed:
            log.error('{}: unable to open structure to run MSMS'.format(self.id))
            return

        log.debug('{}: running MSMS'.format(self.id))
        # PDB ID is currently set to the structure file so the output name is the same,
        # with _msms.df appended to it
        msms_results = ssbio.protein.structure.properties.msms.get_msms_df(
            model=parsed.first_model, pdb_id=self.structure_path,
            outdir=outdir, force_rerun=force_rerun)
        if msms_results.empty:
            log.error('{}: unable to run MSMS'.format(self.id))
            return

        chains = msms_results.chain.unique()

        for chain in chains:
            res_depths = msms_results[msms_results.chain == chain].res_depth.tolist()
            ca_depths = msms_results[msms_results.chain == chain].ca_depth.tolist()

            chain_prop = self.chains.get_by_id(chain)
            chain_seq = chain_prop.seq_record

            # Making sure the X's are filled in
            res_depths = ssbio.protein.structure.properties.residues.match_structure_sequence(
                orig_seq=chain_seq, new_seq=res_depths, fill_with=float('Inf'))
            ca_depths = ssbio.protein.structure.properties.residues.match_structure_sequence(
                orig_seq=chain_seq, new_seq=ca_depths, fill_with=float('Inf'))

            chain_prop.seq_record.letter_annotations['RES_DEPTH-msms'] = res_depths
            chain_prop.seq_record.letter_annotations['CA_DEPTH-msms'] = ca_depths
            log.debug('{}: stored residue depths in chain seq_record letter_annotations'.format(chain))

    def get_freesasa_annotations(self, outdir, include_hetatms=False, force_rerun=False):
        """Run ``freesasa`` on this structure and store the calculated properties in
        the corresponding ChainProps."""
        if self.file_type != 'pdb':
            log.error('{}: unable to run freesasa with "{}" file type. Please change file type to "pdb"'
                      .format(self.id, self.file_type))
            return

        # Parse the structure to store chain sequences
        if self.structure:
            parsed = self.structure
        else:
            parsed = self.parse_structure()
        if not parsed:
            log.error('{}: unable to open structure to run freesasa'.format(self.id))
            return

        # Set outfile name
        log.debug('{}: running freesasa'.format(self.id))
        if include_hetatms:
            outfile = '{}.freesasa_het.rsa'.format(self.id)
        else:
            outfile = '{}.freesasa_nohet.rsa'.format(self.id)

        # Run freesasa
        result = fs.run_freesasa(infile=self.structure_path, outfile=outfile,
                                 include_hetatms=include_hetatms,
                                 outdir=outdir, force_rerun=force_rerun)

        # Parse results
        result_parsed = fs.parse_rsa_data(result)
        prop_dict = defaultdict(lambda: defaultdict(list))
        for k, v in result_parsed.items():
            chain = k[0]
            for prop, calc in v.items():
                prop_dict[chain][prop].append(calc)

        # Reorganize and store results
        all_props = ['all_atoms_abs', 'all_atoms_rel', 'side_chain_abs', 'side_chain_rel',
                     'main_chain_abs', 'main_chain_rel', 'non_polar_abs', 'non_polar_rel',
                     'all_polar_abs', 'all_polar_rel']
        all_props_renamed = {'all_atoms_abs': 'ASA_ALL-freesasa',
                             'all_atoms_rel': 'RSA_ALL-freesasa',
                             'all_polar_abs': 'ASA_POLAR-freesasa',
                             'all_polar_rel': 'RSA_POLAR-freesasa',
                             'main_chain_abs': 'ASA_BACKBONE-freesasa',
                             'main_chain_rel': 'RSA_BACKBONE-freesasa',
                             'non_polar_abs': 'ASA_NONPOLAR-freesasa',
                             'non_polar_rel': 'RSA_NONPOLAR-freesasa',
                             'side_chain_abs': 'ASA_RESIDUE-freesasa',
                             'side_chain_rel': 'RSA_RESIDUE-freesasa'}

        # Rename dictionary keys based on if HETATMs were included
        if include_hetatms:
            suffix = '_het'
        else:
            suffix = '_nohet'
        for k, v in all_props_renamed.items():
            all_props_renamed[k] = v + suffix

        for chain in self.chains:
            for prop in all_props:
                prop_list = ssbio.protein.structure.properties.residues.match_structure_sequence(
                    orig_seq=chain.seq_record, new_seq=prop_dict[chain.id][prop],
                    fill_with=float('Inf'), ignore_excess=True)
                chain.seq_record.letter_annotations[all_props_renamed[prop]] = prop_list
            log.debug('{}: stored freesasa calculations in chain seq_record letter_annotations'.format(chain))

    def view_structure(self, only_chains=None, opacity=1.0, recolor=False, gui=False):
        """Use NGLviewer to display a structure in a Jupyter notebook

        Args:
            only_chains (str, list): Chain ID or IDs to display
            opacity (float): Opacity of the structure
            recolor (bool): If structure should be cleaned and recolored to silver
            gui (bool): If the NGLview GUI should show up

        Returns:
            NGLviewer object

        """
        # TODO: show_structure_file does not work for MMTF files - need to check for
        # that and load accordingly
        if ssbio.utils.is_ipynb():
            import nglview as nv
        else:
            raise EnvironmentError('Unable to display structure - not running in a Jupyter notebook environment')

        if not self.structure_file:
            raise ValueError("Structure file not loaded")

        only_chains = ssbio.utils.force_list(only_chains)
        to_show_chains = '( '
        for c in only_chains:
            to_show_chains += ':{} or'.format(c)
        to_show_chains = to_show_chains.strip(' or ')
        to_show_chains += ' )'

        if self.file_type == 'mmtf' or self.file_type == 'mmtf.gz':
            view = nv.NGLWidget()
            view.add_component(self.structure_path)
        else:
            view = nv.show_structure_file(self.structure_path, gui=gui)

        if recolor:
            view.clear_representations()
            if only_chains:
                view.add_cartoon(selection='{} and (not hydrogen)'.format(to_show_chains),
                                 color='silver', opacity=opacity)
            else:
                view.add_cartoon(selection='protein', color='silver', opacity=opacity)
        elif only_chains:
            view.clear_representations()
            view.add_cartoon(selection='{} and (not hydrogen)'.format(to_show_chains),
                             color='silver', opacity=opacity)

        return view

    def add_residues_highlight_to_nglview(self, view, structure_resnums, chain=None,
                                          res_color='red'):
        """Add a residue number or numbers to an NGLWidget view object.

        Args:
            view (NGLWidget): NGLWidget view object
            structure_resnums (int, list): Residue number(s) to highlight, structure numbering
            chain (str, list): Chain ID or IDs of which residues are a part. If not provided,
                all chains in the mapped_chains attribute will be used. If that is also
                empty, an exception is raised.
            res_color (str): Color to highlight residues with

        """
        if not chain:
            chain = self.mapped_chains
            if not chain:
                raise ValueError('Please input chain ID to display residue on')

        if isinstance(structure_resnums, list):
            structure_resnums = list(set(structure_resnums))
        elif isinstance(structure_resnums, int):
            structure_resnums = ssbio.utils.force_list(structure_resnums)
        else:
            raise ValueError('Input must either be a residue number or a list of residue numbers')

        to_show_chains = '( '
        for c in chain:
            to_show_chains += ':{} or'.format(c)
        to_show_chains = to_show_chains.strip(' or ')
        to_show_chains += ' )'

        to_show_res = '( '
        for m in structure_resnums:
            to_show_res += '{} or '.format(m)
        to_show_res = to_show_res.strip(' or ')
        to_show_res += ' )'

        log.info('Selection: {} and not hydrogen and {}'.format(to_show_chains, to_show_res))

        view.add_ball_and_stick(selection='{} and not hydrogen and {}'.format(to_show_chains, to_show_res),
                                color=res_color)

    def add_scaled_residues_highlight_to_nglview(self, view, structure_resnums, chain=None,
                                                 color='red', unique_colors=False,
                                                 opacity_range=(0.5, 1), scale_range=(.7, 10)):
        """Add a list of residue numbers (which may contain repeating residues) to a view,
        or add a dictionary of residue numbers to counts. Size and opacity of added
        residues are scaled by counts.

        Args:
            view (NGLWidget): NGLWidget view object
            structure_resnums (int, list, dict): Residue number(s) to highlight, or a
                dictionary of residue number to frequency count
            chain (str, list): Chain ID or IDs of which residues are a part. If not provided,
                all chains in the mapped_chains attribute will be used. If that is also
                empty, an exception is raised.
            color (str): Color to highlight residues with
            unique_colors (bool): If each mutation should be colored uniquely (will override
                color argument)
            opacity_range (tuple): Min/max opacity values (residues that have higher
                frequency counts will be opaque)
            scale_range (tuple): Min/max size values (residues that have higher frequency
                counts will be bigger)

        """
        # TODO: likely to move these functions to a separate nglview/utils folder since
        # they are not coupled to the structure
        # TODO: add color by letter_annotations!
        if not chain:
            chain = self.mapped_chains
            if not chain:
                raise ValueError('Please input chain ID to display residue on')
        else:
            chain = ssbio.utils.force_list(chain)

        if isinstance(structure_resnums, dict):
            opacity_dict = ssbio.utils.scale_calculator(opacity_range[0], structure_resnums,
                                                        rescale=opacity_range)
            scale_dict = ssbio.utils.scale_calculator(scale_range[0], structure_resnums,
                                                      rescale=scale_range)
        else:
            opacity_dict = {x: max(opacity_range) for x in ssbio.utils.force_list(structure_resnums)}
            scale_dict = {x: max(scale_range) for x in ssbio.utils.force_list(structure_resnums)}

        if isinstance(structure_resnums, list):
            structure_resnums = list(set(structure_resnums))
        elif isinstance(structure_resnums, dict):
            structure_resnums = list(structure_resnums.keys())
        elif isinstance(structure_resnums, int):
            structure_resnums = ssbio.utils.force_list(structure_resnums)
        else:
            raise ValueError('Input must either be a list of residue numbers or a dictionary '
                             'of residue numbers and their frequency.')

        colors = sns.color_palette("hls", len(structure_resnums)).as_hex()

        to_show_chains = '( '
        for c in chain:
            to_show_chains += ':{} or'.format(c)
        to_show_chains = to_show_chains.strip(' or ')
        to_show_chains += ' )'

        for i, x in enumerate(structure_resnums):
            if isinstance(x, tuple):
                to_show_res = '( '
                for mut in x:
                    to_show_res += '{} or '.format(mut)
                to_show_res = to_show_res.strip(' or ')
                to_show_res += ' )'
            else:
                to_show_res = x

            log.info('Selection: {} and not hydrogen and {}'.format(to_show_chains, to_show_res))

            if unique_colors:
                view.add_ball_and_stick(selection='{} and not hydrogen and {}'.format(to_show_chains, to_show_res),
                                        color=colors[i], opacity=opacity_dict[x], scale=scale_dict[x])
            else:
                view.add_ball_and_stick(selection='{} and not hydrogen and {}'.format(to_show_chains, to_show_res),
                                        color=color, opacity=opacity_dict[x], scale=scale_dict[x])

    def __json_decode__(self, **attrs):
        for k, v in attrs.items():
            if k == 'chains':
                setattr(self, k, DictList(v))
            else:
                setattr(self, k, v)
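# Instantiation sketch for StructProp (hypothetical identifier and file path;
# parse_structure and the annotation methods additionally require the file
# and external tools to exist on disk):
struct = StructProp(ident='1abc',
                    description='demo structure',
                    chains='A',
                    structure_path='/tmp/1abc.pdb',
                    file_type='pdb')
assert struct.chains.has_id('A')
# struct.parse_structure() would read the coordinates and fill in chain sequences.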
class ATLAS(Object): """Class to represent an ATLAS workflow to carry out multi-strain comparisons Main steps are: #. Strain-specific model construction based on orthologous genes & systems modeling #. Phylogenetic analysis to pick out important genes #. GEM-PRO of the "base strain" #. Structure property calculation & integrated structural systems analysis Each step may generate a report and also request additional files if something is missing """ def __init__(self, atlas_name, root_dir, reference_gempro, reference_genome_path=None, description=None): """Prepare a GEM-PRO model for ATLAS analysis Args: atlas_name (str): Name of your ATLAS project root_dir (str): Path to where the folder named after ``atlas_name`` will be created. reference_gempro (GEMPRO): GEM-PRO model to use as the reference genome reference_genome_path (str): Path to reference genome FASTA file description (str): Optional string to describe your project """ Object.__init__(self, id=atlas_name, description=description) # Create directories self._root_dir = None self.root_dir = root_dir self.strains = DictList() self.df_orthology_matrix = pd.DataFrame() # Mark if the orthology matrix has gene IDs (thus we need to retrieve seqs from the genome file) or if # it is in the orthology matrix itself self._orthology_matrix_has_sequences = False # Load the GEM-PRO (could be a model, could just be a list of genes) # Check if there is a genome file associated with this model - if not, write all sequences and use that self.reference_gempro = reference_gempro if not reference_genome_path and not self.reference_gempro.genome_path: self.reference_gempro.genome_path = self.reference_gempro.write_representative_sequences_file( outname=self.reference_gempro.id) else: self.reference_gempro.genome_path = reference_genome_path # TODO: must also check if reference_genome_path gene IDs can be matched to the reference_gempro # Also create an attribute self._empty_reference_gempro = None if self.reference_gempro.model: # If there is a SBML model associated with the GEMPRO, copy that model self._empty_reference_gempro = GEMPRO( gem_name='Copied reference GEM-PRO', gem=self.reference_gempro.model.copy()) # Reset the GenePro attributes for x in self._empty_reference_gempro.genes: x.reset_protein() else: # Otherwise, just copy the list of genes over and rename the IDs strain_genes = [x.id for x in self.reference_gempro.genes] if len(strain_genes) == 0: raise ValueError( 'GEM-PRO has no genes, unable to run multi-strain analysis' ) self._empty_reference_gempro = GEMPRO( gem_name='Copied reference GEM-PRO', genes_list=strain_genes) @property def root_dir(self): """str: Directory where ATLAS project folder named after the attribute ``base_dir`` is located""" return self._root_dir @root_dir.setter def root_dir(self, path): if not path: raise ValueError('No path specified') if not op.exists(path): raise ValueError('{}: folder does not exist'.format(path)) if self._root_dir: log.info( 'Changing root directory of project "{}" from {} to {}'.format( self.id, self.root_dir, path)) if not op.exists(op.join(path, self.id)): raise IOError( 'Project "{}" does not exist in folder {}'.format( self.id, path)) else: log.info('Creating project directory in folder {}'.format(path)) self._root_dir = path for d in [ self.base_dir, self.model_dir, self.data_dir, self.sequences_dir, self.sequences_by_gene_dir, self.sequences_by_organism_dir ]: ssbio.utils.make_dir(d) log.info('{}: project location'.format(self.base_dir)) @property def base_dir(self): """str: ATLAS project 
folder""" if self.root_dir: return op.join(self.root_dir, self.id) else: return None @property def model_dir(self): """str: Directory where strain-specific GEMs are stored""" if self.base_dir: return op.join(self.base_dir, 'model') else: return None @property def data_dir(self): """str: Directory where all data (dataframes and more) will be stored""" if self.base_dir: return op.join(self.base_dir, 'data') else: return None @property def sequences_dir(self): """str: Base directory for genome protein sequences and alignments""" if self.base_dir: return op.join(self.base_dir, 'sequences') else: return None @property def sequences_by_gene_dir(self): """str: Directory where all gene specific information and pairwise alignments are stored""" if self.sequences_dir: return op.join(self.sequences_dir, 'by_gene') else: return None @property def sequences_by_organism_dir(self): """str: Directory where all strain specific genome and BLAST files are stored""" if self.sequences_dir: return op.join(self.sequences_dir, 'by_organism') else: return None # def _copy_reference_gempro(self, new_id): # """Copy the base strain GEM-PRO into a new GEM-PRO with a specified ID. # # Appends the model to the strains attribute. # # Args: # new_id (str): New ID to be assigned to the copied model # # Returns: # GEMPRO: copied GEM-PRO to represent the new strain # # """ # logging.disable(logging.WARNING) # if self.reference_gempro.model: # # If there is a SBML model associated with the GEMPRO, copy that model # copied_model = GEMPRO(gem_name=new_id, gem=self._model_to_copy.model.copy()) # copied_model.model.id = new_id # else: # # Otherwise, just copy the list of genes over and rename the IDs # strain_genes = [x.id for x in self._model_to_copy.genes] # copied_model = GEMPRO(gem_name=new_id, genes_list=strain_genes) # # Re-enable logging # logging.disable(logging.NOTSET) # # self.strains.append(copied_model) # log.debug('{}: new model copied from base model'.format(new_id)) # # return self.strains.get_by_id(new_id) def load_strain(self, strain_id, strain_genome_file): """Load a strain as a new GEM-PRO by its ID and associated genome file. Stored in the ``strains`` attribute. Args: strain_id (str): Strain ID strain_genome_file (str): Path to strain genome file """ logging.disable(logging.WARNING) strain_gp = GEMPRO(gem_name=strain_id, genome_path=strain_genome_file) logging.disable(logging.NOTSET) self.strains.append(strain_gp) return self.strains.get_by_id(strain_id) def download_patric_genomes(self, ids, force_rerun=False): """Download genome files from PATRIC given a list of PATRIC genome IDs and load them as strains. 
Args: ids (str, list): PATRIC ID or list of PATRIC IDs force_rerun (bool): If genome files should be downloaded again even if they exist """ ids = ssbio.utils.force_list(ids) counter = 0 log.info('Downloading sequences from PATRIC...') for patric_id in tqdm(ids): f = ssbio.databases.patric.download_coding_sequences( patric_id=patric_id, seqtype='protein', outdir=self.sequences_by_organism_dir, force_rerun=force_rerun) if f: self.load_strain(patric_id, f) counter += 1 log.debug('{}: downloaded sequence'.format(patric_id)) else: log.warning( '{}: unable to download sequence'.format(patric_id)) log.info( 'Created {} new strain GEM-PROs, accessible at "strains" attribute' .format(counter)) def get_orthology_matrix(self, pid_cutoff=None, bitscore_cutoff=None, evalue_cutoff=None, filter_condition='OR', remove_strains_with_no_orthology=True, remove_strains_with_no_differences=False, remove_genes_not_in_base_model=True): """Create the orthology matrix by finding best bidirectional BLAST hits. Genes = rows, strains = columns Runs run_makeblastdb, run_bidirectional_blast, and calculate_bbh for protein sequences. Args: pid_cutoff (float): Minimum percent identity between BLAST hits to filter for in the range [0, 100] bitscore_cutoff (float): Minimum bitscore allowed between BLAST hits evalue_cutoff (float): Maximum E-value allowed between BLAST hits filter_condition (str): 'OR' or 'AND', how to combine cutoff filters. 'OR' gives more results since it is less stringent, as you will be filtering for hits with (>80% PID or >30 bitscore or <0.0001 evalue). remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model. Default is False because since orthology is found using a PID cutoff, all genes may be present but differences may be on the sequence level. remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our base model. This happens if we use a genome file for our model that has other genes in it. Returns: DataFrame: Orthology matrix calculated from best bidirectional BLAST hits. """ # TODO: document and test other cutoffs # Get the path to the reference genome r_file = self.reference_gempro.genome_path bbh_files = {} log.info( 'Running bidirectional BLAST and finding best bidirectional hits (BBH)...' 
) for strain_gempro in tqdm(self.strains): g_file = strain_gempro.genome_path # Run bidirectional BLAST log.debug('{} vs {}: Running bidirectional BLAST'.format( self.reference_gempro.id, strain_gempro.id)) r_vs_g, g_vs_r = ssbio.protein.sequence.utils.blast.run_bidirectional_blast( reference=r_file, other_genome=g_file, dbtype='prot', outdir=self.sequences_by_organism_dir) # Using the BLAST files, find the BBH log.debug('{} vs {}: Finding BBHs'.format(self.reference_gempro.id, strain_gempro.id)) bbh = ssbio.protein.sequence.utils.blast.calculate_bbh( blast_results_1=r_vs_g, blast_results_2=g_vs_r, outdir=self.sequences_by_organism_dir) bbh_files[strain_gempro.id] = bbh # Make the orthologous genes matrix log.info('Creating orthology matrix from BBHs...') ortho_matrix = ssbio.protein.sequence.utils.blast.create_orthology_matrix( r_name=self.reference_gempro.id, genome_to_bbh_files=bbh_files, pid_cutoff=pid_cutoff, bitscore_cutoff=bitscore_cutoff, evalue_cutoff=evalue_cutoff, filter_condition=filter_condition, outname='{}_{}_orthology.csv'.format(self.reference_gempro.id, 'prot'), outdir=self.data_dir) log.info( 'Saved orthology matrix at {}. See the "df_orthology_matrix" attribute.' .format(ortho_matrix)) self.df_orthology_matrix = pd.read_csv(ortho_matrix, index_col=0) # Filter the matrix to genes only in our analysis, and also check for strains with no differences or no orthologous genes self._filter_orthology_matrix( remove_strains_with_no_orthology=remove_strains_with_no_orthology, remove_strains_with_no_differences= remove_strains_with_no_differences, remove_genes_not_in_base_model=remove_genes_not_in_base_model) # def load_manual_orthology_matrix(self, df, clean_names=True, # remove_strains_with_no_orthology=True, # remove_strains_with_no_differences=False, # remove_genes_not_in_base_model=True): # """Load a manually curated orthology matrix to use in ATLAS. Genes = rows, strains = columns. # # Args: # df (DataFrame): Pandas DataFrame with genes as the rows and strains as the columns # clean_names (bool): Remove unwanted characters from gene names and strain IDs # remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found # remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model. # Default is False because since orthology is found using a PID cutoff, all genes may be present but # differences may be on the sequence level. # remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our # base model. This happens if we use a genome file for our model that has other genes in it. 
    # #
    # """
    # self._orthology_matrix_has_sequences = True
    #
    # if clean_names:
    #     new_rows = [custom_slugify(x) for x in df.index]
    #     new_cols = [custom_slugify(y) for y in df.columns]
    #     df.index = new_rows
    #     df.columns = new_cols
    #
    # self.df_orthology_matrix = df
    #
    # # Make the copies of the base model
    # for strain_id in tqdm(self.df_orthology_matrix.columns):
    #     self._copy_reference_gempro(new_id=strain_id)
    #
    # # Filter the strains and orthology matrix
    # self._filter_orthology_matrix(remove_strains_with_no_orthology=remove_strains_with_no_orthology,
    #                               remove_strains_with_no_differences=remove_strains_with_no_differences,
    #                               remove_genes_not_in_base_model=remove_genes_not_in_base_model)

    def _filter_orthology_matrix(self,
                                 remove_strains_with_no_orthology=True,
                                 remove_strains_with_no_differences=False,
                                 remove_genes_not_in_base_model=True):
        """Filter the orthology matrix by removing genes not present in the base model, and remove strains
            from the analysis that have either no orthologous genes or no differences from the base strain.

        Args:
            remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found
            remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model.
                Default is False because, since orthology is found using a PID cutoff, all genes may be present but
                differences may exist at the sequence level.
            remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our
                base model. This happens if we use a genome file for our model that has other genes in it.

        """
        if len(self.df_orthology_matrix) == 0:
            raise RuntimeError('Empty orthology matrix')

        initial_num_strains = len(self.strains)

        # Add names to the row and column axes of the orthology matrix
        self.df_orthology_matrix = self.df_orthology_matrix.rename_axis('gene').rename_axis('strain', axis='columns')

        # Gene filtering (of the orthology matrix)
        if remove_genes_not_in_base_model:
            # Check for gene IDs that are in the model and not in the orthology matrix.
            # This is probably because the CDS FASTA file for the base strain did not contain the correct ID
            # for the gene and consequently it was not included in the orthology matrix.
            # Save these and report them.
            reference_strain_gene_ids = [x.id for x in self.reference_gempro.genes]
            self.missing_in_orthology_matrix = [x for x in reference_strain_gene_ids
                                                if x not in self.df_orthology_matrix.index.tolist()]
            self.missing_in_reference_strain = [y for y in self.df_orthology_matrix.index.tolist()
                                                if y not in reference_strain_gene_ids]

            # Filter the matrix for genes within our base model only
            self.df_orthology_matrix = self.df_orthology_matrix[
                self.df_orthology_matrix.index.isin(reference_strain_gene_ids)]
            log.info('Filtered orthology matrix for genes present in base model')
            log.warning('{} genes are in your base model but not your orthology matrix, see the attribute "missing_in_orthology_matrix"'.format(len(self.missing_in_orthology_matrix)))
            log.warning('{} genes are in the orthology matrix but not your base model, see the attribute "missing_in_reference_strain"'.format(len(self.missing_in_reference_strain)))

        # Strain filtering
        for strain_gempro in self.strains.copy():
            if remove_strains_with_no_orthology:
                if strain_gempro.id not in self.df_orthology_matrix.columns:
                    self.strains.remove(strain_gempro)
                    log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
                    continue
                elif self.df_orthology_matrix[strain_gempro.id].isnull().all():
                    self.strains.remove(strain_gempro)
                    log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
                    continue

            if remove_strains_with_no_differences:
                not_in_strain = self.df_orthology_matrix[pd.isnull(
                    self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()
                if len(not_in_strain) == 0:
                    self.strains.remove(strain_gempro)
                    log.info('{}: strain has no differences from the base, removed from analysis.'.format(strain_gempro.id))
                    continue

        log.info('{} strains to be analyzed, {} strains removed'.format(
            len(self.strains), initial_num_strains - len(self.strains)))

    def _pare_down_model(self, strain_gempro, genes_to_remove):
        """Mark genes as non-functional in a GEM-PRO. If there is a COBRApy model associated with it, the
            COBRApy method delete_model_genes is utilized to delete genes.

        Args:
            strain_gempro (GEMPRO): GEMPRO object
            genes_to_remove (list): List of gene IDs to remove from the model
        """
        # Filter out genes in genes_to_remove which do not show up in the model
        strain_genes = [x.id for x in strain_gempro.genes]
        genes_to_remove.extend(self.missing_in_orthology_matrix)
        genes_to_remove = list(set(genes_to_remove).intersection(set(strain_genes)))

        if len(genes_to_remove) == 0:
            log.info('{}: no genes marked non-functional'.format(strain_gempro.id))
            return
        else:
            log.debug('{}: {} genes to be marked non-functional'.format(strain_gempro.id, len(genes_to_remove)))

        # If a COBRApy model exists, utilize the delete_model_genes method
        if strain_gempro.model:
            strain_gempro.model._trimmed = False
            strain_gempro.model._trimmed_genes = []
            strain_gempro.model._trimmed_reactions = {}

            # Delete genes!
            cobra.manipulation.delete_model_genes(strain_gempro.model, genes_to_remove)

            if strain_gempro.model._trimmed:
                log.info('{}: marked {} genes as non-functional, '
                         'deactivating {} reactions'.format(strain_gempro.id,
                                                            len(strain_gempro.model._trimmed_genes),
                                                            len(strain_gempro.model._trimmed_reactions)))
        # Otherwise, just mark the genes as non-functional
        else:
            for g in genes_to_remove:
                strain_gempro.genes.get_by_id(g).functional = False
            log.info('{}: marked {} genes as non-functional'.format(strain_gempro.id, len(genes_to_remove)))

    def _load_strain_sequences(self, strain_gempro):
        """Load strain sequences from the orthology matrix into the base model for comparisons, and into the
            strain-specific model itself.
""" if self._orthology_matrix_has_sequences: # Load directly from the orthology matrix if it contains sequences strain_sequences = self.df_orthology_matrix[ strain_gempro.id].to_dict() else: # Otherwise load from the genome file if the orthology matrix contains gene IDs # Load the genome FASTA file log.debug('{}: loading strain genome CDS file'.format( strain_gempro.genome_path)) strain_sequences = SeqIO.index(strain_gempro.genome_path, 'fasta') for strain_gene in strain_gempro.genes: if strain_gene.functional: if self._orthology_matrix_has_sequences: strain_gene_key = strain_gene.id else: # Pull the gene ID of the strain from the orthology matrix strain_gene_key = self.df_orthology_matrix.loc[ strain_gene.id, strain_gempro.id] log.debug( '{}: original gene ID to be pulled from strain fasta file' .format(strain_gene_key)) # # Load into the base strain for comparisons ref_gene = self.reference_gempro.genes.get_by_id( strain_gene.id) new_id = '{}_{}'.format(strain_gene.id, strain_gempro.id) if ref_gene.protein.sequences.has_id(new_id): log.debug( '{}: sequence already loaded into reference model'. format(new_id)) continue ref_gene.protein.load_manual_sequence( seq=strain_sequences[strain_gene_key], ident=new_id, set_as_representative=False) log.debug( '{}: loaded sequence into reference model'.format(new_id)) # Load into the strain GEM-PRO strain_gene.protein.load_manual_sequence( seq=strain_sequences[strain_gene_key], ident=new_id, set_as_representative=True) log.debug( '{}: loaded sequence into strain model'.format(new_id)) def build_strain_specific_models(self, save_models=False): """Using the orthologous genes matrix, create and modify the strain specific models based on if orthologous genes exist. Also store the sequences directly in the reference GEM-PRO protein sequence attribute for the strains. """ if len(self.df_orthology_matrix) == 0: raise RuntimeError('Empty orthology matrix') # Create an emptied copy of the reference GEM-PRO for strain_gempro in tqdm(self.strains): log.debug('{}: building strain specific model'.format( strain_gempro.id)) # For each genome, load the metabolic model or genes from the reference GEM-PRO logging.disable(logging.WARNING) if self._empty_reference_gempro.model: strain_gempro.load_cobra_model( self._empty_reference_gempro.model) elif self._empty_reference_gempro.genes: strain_gempro.genes = [ x.id for x in self._empty_reference_gempro.genes ] logging.disable(logging.NOTSET) # Get a list of genes which do not have orthology in the strain not_in_strain = self.df_orthology_matrix[pd.isnull( self.df_orthology_matrix[strain_gempro.id])][ strain_gempro.id].index.tolist() # Mark genes non-functional self._pare_down_model(strain_gempro=strain_gempro, genes_to_remove=not_in_strain) # Load sequences into the base and strain models self._load_strain_sequences(strain_gempro=strain_gempro) if save_models: cobra.io.save_json_model( model=strain_gempro.model, filename=op.join(self.model_dir, '{}.json'.format(strain_gempro.id))) strain_gempro.save_pickle( op.join(self.model_dir, '{}_gp.pckl'.format(strain_gempro.id))) log.info( 'Created {} new strain-specific models and loaded in sequences'. 
    def align_orthologous_genes_pairwise(self, gapopen=10, gapextend=0.5):
        """For each gene in the base strain, run a pairwise alignment for all orthologous gene sequences to it."""
        for ref_gene in tqdm(self.reference_gempro.genes):
            if len(ref_gene.protein.sequences) > 1:
                alignment_dir = op.join(self.sequences_by_gene_dir, ref_gene.id)
                if not op.exists(alignment_dir):
                    os.mkdir(alignment_dir)
                ref_gene.protein.pairwise_align_sequences_to_representative(gapopen=gapopen, gapextend=gapextend,
                                                                            outdir=alignment_dir, parse=True)

    def align_orthologous_genes_multiple(self):
        """For each gene in the base strain, run a multiple alignment to all orthologous strain genes"""
        pass

    def get_atlas_summary_df(self):
        """Create a single data frame which summarizes all genes per row.

        Returns:
            DataFrame: Pandas DataFrame of the results

        """
        all_info = []
        for g in self.reference_gempro.genes_with_a_representative_sequence:
            info = {}
            info['Gene_ID'] = g.id
            info['Gene_name'] = g.name

            # Protein object
            p = g.protein
            info['Protein_sequences'] = len(p.sequences)
            info['Protein_structures'] = len(p.structures)

            # SeqProp
            rseq = p.representative_sequence
            info['RepSeq_ID'] = rseq.id
            info['RepSeq_sequence_length'] = rseq.seq_len
            info['RepSeq_num_sequence_alignments'] = len([x for x in p.sequence_alignments
                                                          if x.annotations['ssbio_type'] == 'seqalign'])
            info['RepSeq_num_structure_alignments'] = len([x for x in p.sequence_alignments
                                                           if x.annotations['ssbio_type'] == 'structalign'])

            # SeqRecord annotations (properties calculated that summarize the whole sequence)
            for annotation_name, annotation in rseq.annotations.items():
                info['RepSeq_' + annotation_name] = annotation

            # SeqRecord alignment annotations
            all_num_mutations = []
            all_num_deletions = []
            all_len_deletions = []
            all_num_insertions = []
            all_len_insertions = []
            all_percent_identity = []
            all_percent_similarity = []
            for aln in p.sequence_alignments:
                # Gather the strain-specific info
                if '{}_'.format(p.id) not in aln.annotations['b_seq']:
                    continue
                info[aln.annotations['b_seq'].split('{}_'.format(p.id))[1]] = aln.annotations['percent_identity']

                # Gather the percent identities/similarities
                all_percent_identity.append(aln.annotations['percent_identity'])
                all_percent_similarity.append(aln.annotations['percent_similarity'])

                # Gather the number of residues that are mutated (filter for different mutations of same residue)
                num_mutations = len(list(set([x[1] for x in aln.annotations['mutations']])))
                all_num_mutations.append(num_mutations)

                # Gather the number of deletions as well as the length of the deletion
                if not aln.annotations['deletions']:
                    num_deletions = 0
                    len_deletions = [0]
                else:
                    num_deletions = len(aln.annotations['deletions'])
                    len_deletions = [x[1] for x in aln.annotations['deletions']]
                all_num_deletions.append(num_deletions)
                # Get the total length of the deletions for this one strain
                total_len_deletions = np.sum(len_deletions)
                all_len_deletions.append(total_len_deletions)

                # Gather the number of insertions as well as the length of the insertion
                if not aln.annotations['insertions']:
                    num_insertions = 0
                    len_insertions = [0]
                else:
                    num_insertions = len(aln.annotations['insertions'])
                    len_insertions = [x[1] for x in aln.annotations['insertions']]
                all_num_insertions.append(num_insertions)
                # Get the total length of the insertions for this one strain
                total_len_insertions = np.sum(len_insertions)
                all_len_insertions.append(total_len_insertions)

            info['ATLAS_mean_num_mutations'] = np.mean(all_num_mutations)
            info['ATLAS_mean_num_deletions'] = np.mean(all_num_deletions)
            info['ATLAS_mean_len_deletions'] = np.mean(all_len_deletions)
            info['ATLAS_mean_num_insertions'] = np.mean(all_num_insertions)
            info['ATLAS_mean_len_insertions'] = np.mean(all_len_insertions)
            info['ATLAS_mean_percent_identity'] = np.mean(all_percent_identity)
            info['ATLAS_mean_percent_similarity'] = np.mean(all_percent_similarity)

            # Other mutation analysis
            single, fingerprint = p.sequence_mutation_summary()

            # Mutations that show up in at least 1% of strains
            singles = []
            for k, v in single.items():
                k = [str(x) for x in k]
                if len(v) / len(p.sequence_alignments) >= 0.01:
                    singles.append(''.join(k))  # len(v) is the number of strains which have this mutation
            info['ATLAS_popular_mutations'] = ';'.join(singles)

            # Mutation groups that show up in at least 1% of strains
            allfingerprints = []
            for k, v in fingerprint.items():
                if len(v) / len(p.sequence_alignments) >= 0.01:
                    fingerprints = []
                    for m in k:
                        y = [str(x) for x in m]
                        fingerprints.append(''.join(y))
                    allfingerprints.append('-'.join(fingerprints))
            info['ATLAS_popular_mutation_groups'] = ';'.join(allfingerprints)

            # StructProp
            rstruct = p.representative_structure
            if rstruct:
                if rstruct.structure_file:
                    info['RepStruct_ID'] = rstruct.id
                    info['RepStruct_is_experimental'] = rstruct.is_experimental
                    info['RepStruct_description'] = rstruct.description
                    info['RepStruct_repseq_coverage'] = p.representative_chain_seq_coverage

                    # ChainProp
                    rchain = p.representative_chain
                    info['RepChain_ID'] = rchain

                    # ChainProp SeqRecord annotations
                    rchain_sr = rstruct.chains.get_by_id(rchain).seq_record
                    for annotation_name, annotation in rchain_sr.annotations.items():
                        info['RepChain_' + annotation_name] = annotation

            all_info.append(info)

        cols = [
            'Gene_ID', 'Gene_name', 'Protein_sequences', 'Protein_structures',
            'RepSeq_ID', 'RepSeq_sequence_length',
            'RepSeq_num_sequence_alignments', 'RepSeq_num_structure_alignments',
            'RepStruct_ID', 'RepChain_ID', 'RepStruct_description',
            'RepStruct_is_experimental', 'RepStruct_repseq_coverage',
            'ATLAS_mean_percent_identity', 'ATLAS_mean_percent_similarity',
            'ATLAS_mean_num_mutations', 'ATLAS_popular_mutations',
            'ATLAS_popular_mutation_groups', 'ATLAS_mean_num_deletions',
            'ATLAS_mean_num_insertions', 'ATLAS_mean_len_deletions',
            'ATLAS_mean_len_insertions', 'RepSeq_aromaticity',
            'RepSeq_instability_index', 'RepSeq_isoelectric_point',
            'RepSeq_molecular_weight', 'RepSeq_monoisotopic',
            'RepSeq_num_tm_helix-tmhmm', 'RepSeq_percent_acidic',
            'RepSeq_percent_aliphatic', 'RepSeq_percent_aromatic',
            'RepSeq_percent_B-sspro8', 'RepSeq_percent_basic',
            'RepSeq_percent_buried-accpro', 'RepSeq_percent_buried-accpro20',
            'RepSeq_percent_C-sspro', 'RepSeq_percent_C-sspro8',
            'RepSeq_percent_charged', 'RepSeq_percent_E-sspro',
            'RepSeq_percent_E-sspro8', 'RepSeq_percent_exposed-accpro',
            'RepSeq_percent_exposed-accpro20', 'RepSeq_percent_G-sspro8',
            'RepSeq_percent_H-sspro', 'RepSeq_percent_H-sspro8',
            'RepSeq_percent_helix_naive', 'RepSeq_percent_I-sspro8',
            'RepSeq_percent_non-polar', 'RepSeq_percent_polar',
            'RepSeq_percent_S-sspro8', 'RepSeq_percent_small',
            'RepSeq_percent_strand_naive', 'RepSeq_percent_T-sspro8',
            'RepSeq_percent_tiny', 'RepSeq_percent_turn_naive',
            'RepChain_percent_B-dssp', 'RepChain_percent_C-dssp',
            'RepChain_percent_E-dssp', 'RepChain_percent_G-dssp',
            'RepChain_percent_H-dssp', 'RepChain_percent_I-dssp',
            'RepChain_percent_S-dssp', 'RepChain_percent_T-dssp',
            'RepChain_SSBOND-biopython'
        ]
        cols.extend([x.id for x in self.strains])

        df_atlas_summary = pd.DataFrame(all_info, columns=cols)
        # Drop columns that don't have anything in them
        df_atlas_summary.dropna(axis=1, how='all', inplace=True)

        return df_atlas_summary

    def get_atlas_per_gene_mutation_df(self, gene_id):
        """Create a single data frame which summarizes a gene and its mutations.

        Args:
            gene_id (str): Gene ID in the base model

        Returns:
            DataFrame: Pandas DataFrame of the results

        """
        # TODO: also count: number of unique mutations (have to consider position, amino acid change)
        # TODO: keep track of strain with most mutations, least mutations
        # TODO: keep track of strains that conserve the length of the protein, others that extend or truncate it
        #       need statistical test for that too (how long is "extended"/"truncated"?)
        # TODO: number of strains with at least 1 mutation
        # TODO: number of strains with <5% mutated, 5-10%, etc.

        g = self.reference_gempro.genes.get_by_id(gene_id)

        single, fingerprint = g.protein.sequence_mutation_summary(alignment_type='seqalign')

        structure_type_suffix = 'NA'
        appender = []

        for k, strains in single.items():
            # Mutations in the strain
            to_append = {}
            orig_res = k[0]
            resnum = int(k[1])
            mutated_res = k[2]
            num_strains_mutated = len(strains)
            strain_ids = [str(x.split(g.id + '_')[1]) for x in strains]
            to_append['ref_residue'] = orig_res
            to_append['ref_resnum'] = resnum
            to_append['strain_residue'] = mutated_res
            to_append['num_strains_mutated'] = num_strains_mutated
            to_append['strains_mutated'] = ';'.join(strain_ids)
            to_append['at_disulfide_bridge'] = False

            # Residue properties
            origres_props = ssbio.protein.sequence.properties.residues.residue_biochemical_definition(orig_res)
            mutres_props = ssbio.protein.sequence.properties.residues.residue_biochemical_definition(mutated_res)
            to_append['ref_residue_prop'] = origres_props
            to_append['strain_residue_prop'] = mutres_props

            # Grantham score - score a mutation based on biochemical properties
            grantham_s, grantham_txt = ssbio.protein.sequence.properties.residues.grantham_score(orig_res, mutated_res)
            to_append['grantham_score'] = grantham_s
            to_append['grantham_annotation'] = grantham_txt

            # Get all per residue annotations - predicted from sequence and calculated from structure
            to_append.update(g.protein.get_residue_annotations(seq_resnum=resnum, use_representatives=True))

            # Check structure type
            if g.protein.representative_structure:
                if g.protein.representative_structure.is_experimental:
                    to_append['structure_type'] = 'EXP'
                else:
                    to_append['structure_type'] = 'HOM'

                # At disulfide bond?
                repchain = g.protein.representative_chain
                repchain_annotations = g.protein.representative_structure.chains.get_by_id(repchain).seq_record.annotations
                if 'SSBOND-biopython' in repchain_annotations:
                    structure_resnums = g.protein.map_seqprop_resnums_to_structprop_resnums(resnums=resnum,
                                                                                            use_representatives=True)
                    if resnum in structure_resnums:
                        ssbonds = repchain_annotations['SSBOND-biopython']
                        ssbonds_res = []
                        for x in ssbonds:
                            ssbonds_res.append(x[0])
                            ssbonds_res.append(x[1])

                        # Compare the mapped structure residue number, not the mapping dict itself
                        if structure_resnums[resnum] in ssbonds_res:
                            to_append['at_disulfide_bridge'] = True

            appender.append(to_append)

        if not appender:
            return pd.DataFrame()

        cols = [
            'ref_residue', 'ref_resnum', 'strain_residue',
            'num_strains_mutated', 'strains_mutated', 'ref_residue_prop',
            'strain_residue_prop', 'grantham_score', 'grantham_annotation',
            'at_disulfide_bridge', 'seq_SS-sspro', 'seq_SS-sspro8',
            'seq_RSA-accpro', 'seq_RSA-accpro20', 'seq_TM-tmhmm',
            'struct_SS-dssp', 'struct_RSA-dssp', 'struct_ASA-dssp',
            'struct_CA_DEPTH-msms', 'struct_RES_DEPTH-msms',
            'struct_PHI-dssp', 'struct_PSI-dssp', 'struct_resnum',
            'struct_residue'
        ]

        df_gene_summary = pd.DataFrame.from_records(appender, columns=cols)

        # Drop columns that don't have anything in them
        df_gene_summary.dropna(axis=1, how='all', inplace=True)

        df_gene_summary.sort_values(by='ref_resnum', inplace=True)
        df_gene_summary = df_gene_summary.set_index('ref_resnum')
        return df_gene_summary

    def download_mutation_images(self):
        # TODO: dunno if this works
        import ipywidgets
        import math

        views = []
        for g in self.reference_gempro.genes:
            if g.protein.representative_structure:
                view = g.protein.view_all_mutations(alignment_type='seqalign', grouped=False, structure_opacity=0.5,
                                                    opacity_range=(0.6, 1), scale_range=(.5, 5))
                view._remote_call("setSize", target='Widget', args=['300px', '300px'])
                view.download_image(filename='{}_{}_mutations.png'.format(g.id, g.name))
                views.append(view)

        hboxes = [ipywidgets.HBox(views[i * 3:i * 3 + 3])
                  for i in range(int(math.ceil(len(views) / 3.0)))]
        vbox = ipywidgets.VBox(hboxes)
        return vbox
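# --- Usage sketch (not part of the library) ---
# A minimal illustration of the intended order of the ATLAS methods defined
# above, assuming `atlas` is an already-initialized ATLAS object with strains
# and an orthology matrix loaded; the gene ID 'b0001' is purely hypothetical.
def _example_atlas_workflow(atlas):
    # Build one pared-down GEM-PRO per strain and load strain sequences
    atlas.build_strain_specific_models(save_models=False)
    # Pairwise-align every orthologous sequence to its reference gene
    atlas.align_orthologous_genes_pairwise(gapopen=10, gapextend=0.5)
    # Summarize all genes in one DataFrame, then drill into a single gene
    df_summary = atlas.get_atlas_summary_df()
    df_gene = atlas.get_atlas_per_gene_mutation_df('b0001')
    return df_summary, df_gene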
def compare_reactions(reaction1, reaction2, details=None, id1='first', id2='second'):
    """ Compare two lists of cobra.core.Reaction objects and report differences.

    To determine if two reactions are the same, the function compares the
    following attributes: (1) ID {'reaction_id'}, (2) name {'reaction_name'},
    (3) bounds {'reaction_bounds'}, (4) definition {'reaction_definition'},
    (5) gene reaction rule {'reaction_gpr'}. Include the value in {} in the
    details parameter to display the details of reactions where the values
    are different.

    Parameters
    ----------
    reaction1 : cobra.core.DictList
        First list of cobra.core.Reaction objects to analyze
    reaction2 : cobra.core.DictList
        Second list of cobra.core.Reaction objects to analyze
    details : set, optional
        When specified, print details on given types of differences
    id1 : str, optional
        ID for labeling first list of reactions
    id2 : str, optional
        ID for labeling second list of reactions
    """

    if details is None:
        details = set()
    print('REACTIONS\n' + '---------')
    print('{0} reactions in {1}'.format(len(reaction1), id1))
    print('{0} reactions in {1}\n'.format(len(reaction2), id2))

    # See if reactions from first model are in the second model.
    num_matched = 0
    reaction_only_in_one = DictList()
    different_name = DictList()
    different_bounds = DictList()
    different_definition = DictList()
    different_genes = DictList()
    for r1 in reaction1:
        try:
            r2 = reaction2.get_by_id(r1.id)
            num_matched += 1
            if r1.name != r2.name:
                different_name.append(r1)
            if r1.bounds != r2.bounds:
                different_bounds.append(r1)
            if r1.reaction != r2.reaction:
                different = False
                for met, coefficient in iteritems(r1.metabolites):
                    # A metabolite missing from the second reaction also means a
                    # different definition, so catch the KeyError here rather
                    # than letting it mark the reaction as unmatched.
                    try:
                        if not isclose(r2.get_coefficient(met.id), coefficient):
                            different = True
                    except KeyError:
                        different = True
                if different:
                    different_definition.append(r1)
            if r1.gene_reaction_rule != r2.gene_reaction_rule:
                different_genes.append(r1)
        except KeyError:
            reaction_only_in_one.append(r1)
    print('{0} reactions in both {1} and {2}'.format(num_matched, id1, id2))
    print('{0} reactions only in {1}\n'.format(len(reaction_only_in_one), id1))

    # If requested, show the details on reactions only in the first model.
    if 'reaction_id' in details and len(reaction_only_in_one) > 0:
        reaction_only_in_one.sort(key=lambda x: x.id)
        output = [[rxn.id, format_long_string(rxn.name, 20), rxn.reaction]
                  for rxn in reaction_only_in_one]
        print(tabulate(output, tablefmt='simple', headers=reaction_header) + '\n')

    # See if reactions from second model are in the first model.
    num_matched = 0
    reaction_only_in_two = DictList()
    for r2 in reaction2:
        if reaction1.has_id(r2.id):
            num_matched += 1
        else:
            reaction_only_in_two.append(r2)
    print('{0} reactions in both {1} and {2}'.format(num_matched, id1, id2))
    print('{0} reactions only in {1}\n'.format(len(reaction_only_in_two), id2))

    # If requested, show the details on reactions only in the second model.
    if 'reaction_id' in details and len(reaction_only_in_two) > 0:
        reaction_only_in_two.sort(key=lambda x: x.id)
        output = [[rxn.id, format_long_string(rxn.name, 20), rxn.reaction]
                  for rxn in reaction_only_in_two]
        print('\n' + tabulate(output, tablefmt='simple', headers=reaction_header) + '\n')

    # Display details on reaction attribute differences.
print('{0} reactions with different names'.format(len(different_name))) if 'reaction_name' in details and len(different_name) > 0: different_name.sort(key=lambda x: x.id) output = [[rxn.id, rxn.name, reaction2.get_by_id(rxn.id).name] for rxn in different_name] print('\n' + tabulate(output, tablefmt='simple', headers=difference_header) + '\n') print('{0} reactions with different bounds'.format(len(different_bounds))) if 'reaction_bounds' in details and len(different_bounds) > 0: different_bounds.sort(key=lambda x: x.id) output = [[rxn.id, rxn.bounds, reaction2.get_by_id(rxn.id).bounds] for rxn in different_bounds] print('\n' + tabulate(output, tablefmt='simple', headers=difference_header) + '\n') print('{0} reactions with different definitions'.format( len(different_definition))) if 'reaction_definition' in details and len(different_definition) > 0: different_definition.sort(key=lambda x: x.id) output = [[rxn.id, rxn.reaction, reaction2.get_by_id(rxn.id).reaction] for rxn in different_definition] print('\n' + tabulate(output, tablefmt='simple', headers=difference_header) + '\n') print('{0} reactions with different genes'.format(len(different_genes))) if 'reaction_gpr' in details and len(different_genes) > 0: different_genes.sort(key=lambda x: x.id) output = [[ rxn.id, rxn.gene_reaction_rule, reaction2.get_by_id(rxn.id).gene_reaction_rule ] for rxn in different_genes] print('\n' + tabulate(output, tablefmt='simple', headers=difference_header) + '\n') return
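# A minimal usage sketch for compare_reactions(), assuming two models loaded
# from hypothetical SBML files; the 'details' flags are the ones listed in the
# docstring above.
def _example_compare_reactions():
    import cobra.io
    model_a = cobra.io.read_sbml_model('model_a.xml')  # hypothetical path
    model_b = cobra.io.read_sbml_model('model_b.xml')  # hypothetical path
    compare_reactions(model_a.reactions, model_b.reactions,
                      details={'reaction_bounds', 'reaction_gpr'},
                      id1=model_a.id, id2=model_b.id)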
def compare_genes(gene1, gene2, details=None, id1='first', id2='second'):
    """ Compare two lists of cobra.core.Gene objects and report differences.

    To determine if two genes are the same, the function compares the
    following attributes: (1) ID {'gene_id'}, (2) name {'gene_name'}.
    Include the value in {} in the details parameter to display the details
    of genes where the values are different.

    Parameters
    ----------
    gene1 : cobra.core.DictList
        First list of cobra.core.Gene objects to analyze
    gene2 : cobra.core.DictList
        Second list of cobra.core.Gene objects to analyze
    details : set, optional
        When specified, print details on given types of differences
    id1 : str, optional
        ID for labeling first list of genes
    id2 : str, optional
        ID for labeling second list of genes
    """

    if details is None:
        details = set()
    print('\nGENES\n' + '-----')
    print('{0} genes in {1}'.format(len(gene1), id1))
    print('{0} genes in {1}\n'.format(len(gene2), id2))

    # See if genes from first list are in the second list.
    num_matched = 0
    gene_only_in_one = DictList()
    different_name = DictList()
    for g1 in gene1:
        try:
            g2 = gene2.get_by_id(g1.id)
            num_matched += 1
            if g1.name.lower() != g2.name.lower():
                different_name.append(g1)
        except KeyError:
            gene_only_in_one.append(g1)
    print('{0} genes in both {1} and {2}'.format(num_matched, id1, id2))
    print('{0} genes only in {1}\n'.format(len(gene_only_in_one), id1))
    if 'gene_id' in details and len(gene_only_in_one) > 0:
        gene_only_in_one.sort(key=lambda x: x.id)
        output = [[gene.id, format_long_string(gene.name, 90)]
                  for gene in gene_only_in_one]
        print('\n' + tabulate(output, tablefmt='simple', headers=gene_header) + '\n')

    # See if genes from second list are in the first list.
    num_matched = 0
    gene_only_in_two = DictList()
    for g2 in gene2:
        if gene1.has_id(g2.id):
            num_matched += 1
        else:
            gene_only_in_two.append(g2)
    print('{0} genes in both {1} and {2}'.format(num_matched, id1, id2))
    print('{0} genes only in {1}\n'.format(len(gene_only_in_two), id2))
    if 'gene_id' in details and len(gene_only_in_two) > 0:
        gene_only_in_two.sort(key=lambda x: x.id)
        output = [[gene.id, format_long_string(gene.name, 90)]
                  for gene in gene_only_in_two]
        print('\n' + tabulate(output, tablefmt='simple', headers=gene_header) + '\n')

    # Display details on gene attribute differences.
    print('{0} genes with different names'.format(len(different_name)))
    if 'gene_name' in details and len(different_name) > 0:
        different_name.sort(key=lambda x: x.id)
        output = [[gene.id, gene.name, gene2.get_by_id(gene.id).name]
                  for gene in different_name]
        print('\n' + tabulate(output, tablefmt='simple', headers=difference_header) + '\n')
    return
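# A minimal usage sketch for compare_genes(), assuming `model_a` and `model_b`
# are cobra.core.Model objects already in scope (hypothetical names). Note that
# gene name comparison above is case-insensitive.
def _example_compare_genes(model_a, model_b):
    # Print the summary plus details for genes whose names differ
    compare_genes(model_a.genes, model_b.genes,
                  details={'gene_name'}, id1=model_a.id, id2=model_b.id)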
def compare_metabolites(metabolite1, metabolite2, details=None, id1='first', id2='second'): """ Compare two lists of cobra.core.Metabolite objects and report differences. To determine if two metabolites are the same, the function compares the following attributes: (1) ID {'metabolite_id'}, (2) name {'metabolite_name'}, (3) formula {'metabolite_formula'}, (4) charge {'metabolite_charge'}, (5) compartment {'metabolite_compartment'}. Include the value in {} in the details parameter to display the details of metabolites where the values are different. Parameters ---------- metabolite1 : cobra.core.DictList First list of cobra.core.Metabolite objects to analyze metabolite2 : cobra.core.DictList Second list of cobra.core.Metabolite objects to analyze details : set, optional When specified, print details on given types of differences id1 : str, optional ID for labeling first list of metabolites id2 : str, optional ID for labeling second list of metabolites """ if details is None: details = set() print('\nMETABOLITES\n' + '-----------') print('{0} metabolites in {1}'.format(len(metabolite1), id1)) print('{0} metabolites in {1}\n'.format(len(metabolite2), id2)) # See if metabolites from first model are in the second model. num_matched = 0 metabolite_only_in_one = DictList() different_name = DictList() different_formula = DictList() different_charge = DictList() different_compartment = DictList() for m1 in metabolite1: try: m2 = metabolite2.get_by_id(m1.id) num_matched += 1 if m1.name != m2.name: different_name.append(m1) if m1.formula != m2.formula: different_formula.append(m1) if m1.charge != m2.charge: different_charge.append(m1) if m1.compartment != m2.compartment: different_compartment.append(m1) except KeyError: metabolite_only_in_one.append(m1) print('{0} metabolites in both {1} and {2}'.format(num_matched, id1, id2)) print('{0} metabolites only in {1}\n'.format(len(metabolite_only_in_one), id1)) if 'metabolite_id' in details and len(metabolite_only_in_one) > 0: metabolite_only_in_one.sort(key=lambda x: x.id) output = [[met.id, format_long_string(met.name, 70)] for met in metabolite_only_in_one] print('\n' + tabulate(output, tablefmt='simple', headers=metabolite_header) + '\n') # See if metabolites from second model are in the first model. num_matched = 0 metabolite_only_in_two = DictList() for m2 in metabolite2: if metabolite1.has_id(m2.id): num_matched += 1 else: metabolite_only_in_two.append(m2) print('{0} metabolites in both {1} and {2}'.format(num_matched, id1, id2)) print('{0} metabolites only in {1}\n'.format(len(metabolite_only_in_two), id2)) if 'metabolite_id' in details and len(metabolite_only_in_two) > 0: metabolite_only_in_two.sort(key=lambda x: x.id) output = [[met.id, format_long_string(met.name, 70)] for met in metabolite_only_in_two] print('\n' + tabulate(output, tablefmt='simple', headers=metabolite_header) + '\n') # Display details on metabolite attribute differences. 
print('{0} metabolites with different names'.format(len(different_name))) if 'metabolite_name' in details and len(different_name) > 0: different_name.sort(key=lambda x: x.id) output = [[met.id, met.name, metabolite2.get_by_id(met.id).name] for met in different_name] print('\n' + tabulate(output, tablefmt='simple', headers=difference_header) + '\n') print('{0} metabolites with different formulas'.format( len(different_formula))) if 'metabolite_formula' in details and len(different_formula) > 0: different_formula.sort(key=lambda x: x.id) output = [[met.id, met.formula, metabolite2.get_by_id(met.id).formula] for met in different_formula] print('\n' + tabulate(output, tablefmt='simple', headers=difference_header) + '\n') print('{0} metabolites with different charges'.format( len(different_charge))) if 'metabolite_charge' in details and len(different_charge) > 0: different_charge.sort(key=lambda x: x.id) output = [[met.id, met.charge, metabolite2.get_by_id(met.id).charge] for met in different_charge] print('\n' + tabulate(output, tablefmt='simple', headers=difference_header) + '\n') print('{0} metabolites with different compartments'.format( len(different_compartment))) if 'metabolite_compartment' in details and len(different_compartment) > 0: different_compartment.sort(key=lambda x: x.id) output = [[ met.id, met.compartment, metabolite2.get_by_id(met.id).compartment ] for met in different_compartment] print('\n' + tabulate(output, tablefmt='simple', headers=difference_header) + '\n') return
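# A hypothetical convenience driver (not part of the module) tying the three
# compare functions together for a whole-model comparison with full details;
# the detail flag names are taken from the docstrings above.
def _example_compare_models(model_a, model_b):
    all_details = {'reaction_id', 'reaction_name', 'reaction_bounds',
                   'reaction_definition', 'reaction_gpr',
                   'gene_id', 'gene_name',
                   'metabolite_id', 'metabolite_name', 'metabolite_formula',
                   'metabolite_charge', 'metabolite_compartment'}
    compare_reactions(model_a.reactions, model_b.reactions,
                      details=all_details, id1=model_a.id, id2=model_b.id)
    compare_genes(model_a.genes, model_b.genes,
                  details=all_details, id1=model_a.id, id2=model_b.id)
    compare_metabolites(model_a.metabolites, model_b.metabolites,
                        details=all_details, id1=model_a.id, id2=model_b.id)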
def _build_ensemble_from_gapfill_solutions(model, solutions, universal=None):

    ensemble = Ensemble(identifier=model.id, name=model.name)
    ensemble.base_model = model.copy()

    # generate member identifiers for each solution
    # Convert the solution to a dictlist so we can retrieve reactions by id
    solution_dict = {}
    for i, solution in enumerate(solutions):
        solution_id = model.id + '_gapfilled_' + str(i)
        solution_as_rxn_objs = [universal.reactions.get_by_id(rxn).copy()
                                for rxn in solution]
        solution_dict[solution_id] = DictList() + solution_as_rxn_objs

    # scan through other members and remove them if they are identical.
    # as long as we're looping, we'll also find reactions that are in
    # all solutions and get the list of reactions that are in any solution.

    # first, convert the solution dictionary to the same structure except with
    # reaction ids rather than reaction objects so that we can perform set
    # operations
    solutions_as_ids = {}
    for member_id in solution_dict.keys():
        solutions_as_ids[member_id] = [rxn.id for rxn in solution_dict[member_id]]

    used_members = []
    duplicate_solutions = []
    all_reactions = set()
    in_all = set()
    for member_id in solutions_as_ids.keys():
        used_members.append(member_id)
        member_solution = set(solutions_as_ids[member_id])
        if in_all:
            in_all = member_solution & in_all
        # for the first member, in_all is still empty; intersecting with an
        # empty set would always yield the empty set, so initialize it instead
        else:
            in_all = member_solution
        all_reactions = all_reactions | member_solution

        for other_member in solutions_as_ids.keys():
            if other_member not in used_members:
                other_solution = solutions_as_ids[other_member]
                if set(other_solution) == member_solution:
                    duplicate_solutions.append(other_member)
                    used_members.append(other_member)

    # perform the removal of duplicate solutions on the original solution
    # object which contains reaction objects rather than reaction ids as
    # strings
    for duplicate in duplicate_solutions:
        solution_dict.pop(duplicate, None)

    # Reactions that need features are those that were not in all the gapfill
    # solutions.
    reactions_needing_features = list(all_reactions - in_all)
    reactions_needing_features_objs = [
        universal.reactions.get_by_id(rxn).copy()
        for rxn in reactions_needing_features]

    # add reaction objects to the base model for all reactions
    all_reactions_as_objects = [universal.reactions.get_by_id(rxn).copy()
                                for rxn in all_reactions]
    ensemble.base_model.add_reactions(all_reactions_as_objects)

    # add metabolite objects to the base model for all new metabolites from
    # the new reactions
    mets = [x.metabolites for x in all_reactions_as_objects]
    all_keys = set().union(*(d.keys() for d in mets))
    ensemble.base_model.add_metabolites(all_keys)

    print('building features...')
    # generate features for the reactions that vary across solutions and add
    # them to the ensemble. assume that all reactions have the same attribute
    # values; if different attribute values are desired for reactions with the
    # same ID, these need to be added to the universal reaction bag prior to
    # gapfilling
    features = DictList()
    for reaction in reactions_needing_features_objs:
        for attribute in REACTION_ATTRIBUTES:
            identifier = reaction.id + "_" + attribute
            name = reaction.name
            feature = Feature(identifier, name)
            feature.ensemble = ensemble
            feature.base_component = (ensemble.base_model.reactions.get_by_id(reaction.id))
            feature.component_attribute = attribute
            # get the states for this feature as {member.id: value}
            states = {}
            for member_id in solution_dict.keys():
                # get the reaction and its value for the attribute
                if reaction.id in [rxn.id for rxn in solution_dict[member_id]]:
                    rxn_obj = solution_dict[member_id].get_by_id(reaction.id)
                    states[member_id] = getattr(rxn_obj, attribute)
                else:
                    states[member_id] = MISSING_ATTRIBUTE_DEFAULT[attribute]
            feature.states = states
            features += [feature]
    ensemble.features = features

    print('updating members...')
    # update members for the ensemble
    members = DictList()
    for member_id in solution_dict.keys():
        model_states = dict()
        for feature in ensemble.features:
            model_states[feature] = feature.get_model_state(member_id)
        member = Member(ensemble=ensemble,
                        identifier=member_id,
                        name=ensemble.name,
                        states=model_states)
        members += [member]
    ensemble.members = members

    return ensemble
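# A minimal usage sketch for _build_ensemble_from_gapfill_solutions(), assuming
# `draft` is a draft cobra Model, `universal` is a universal reaction bag
# (also a cobra Model), and `solutions` holds reaction-ID lists as returned by
# repeated gapfill runs; all reaction IDs below are hypothetical.
def _example_build_ensemble(draft, universal):
    solutions = [['rxn00001', 'rxn00002'],
                 ['rxn00001', 'rxn00003'],
                 ['rxn00001', 'rxn00002']]  # duplicate of the first solution
    ensemble = _build_ensemble_from_gapfill_solutions(draft, solutions,
                                                      universal=universal)
    # Duplicate solutions collapse, so two members remain; rxn00001 appears in
    # every solution, so only rxn00002 and rxn00003 generate features.
    print(len(ensemble.members), len(ensemble.features))
    return ensemble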