예제 #1
0
파일: bigg.py 프로젝트: bandjay/cobrababel
def add_bigg_metabolites(bigg_list, model):
    """ Create COBRA metabolites from BiGG metabolites and add them to a model.

    Each metabolite gets the ID ``<bigg_id>_<compartment>``. A compartment that
    is not yet in ``model.compartments`` is stored there under the BiGG
    compartment name, or 'unknown' when the BiGG data has no name for it.

    Parameters
    ----------
    bigg_list: list of dict
        List of dictionaries with BiGG metabolite data
    model: cobra.core.Model
        Model to add metabolites to

    Raises
    ------
    ValueError
        When a BiGG metabolite does not identify a compartment
    """

    # Create a Metabolite object for each BiGG metabolite.
    metabolites = DictList()
    for bigg_metabolite in bigg_list:
        # Available data is different for a metabolite from an organism model versus
        # a metabolite from the universal model.
        if 'compartment_bigg_id' in bigg_metabolite:
            compartment = bigg_metabolite['compartment_bigg_id']
        elif bigg_metabolite.get('compartments_in_models'):
            # Only the first listed compartment is used.
            compartment = bigg_metabolite['compartments_in_models'][0][
                'bigg_id']
        else:
            # Also reached when 'compartments_in_models' is present but empty,
            # which previously surfaced as a bare IndexError.
            raise ValueError(
                'BiGG metabolite {0} does not have a compartment'.format(
                    bigg_metabolite['bigg_id']))
        metabolite = Metabolite(id='{0}_{1}'.format(bigg_metabolite['bigg_id'],
                                                    compartment),
                                name=bigg_metabolite['name'],
                                compartment=compartment)

        # Formula and charge are optional; a single value takes precedence over
        # the first entry of the plural list.
        if 'formula' in bigg_metabolite:
            metabolite.formula = bigg_metabolite['formula']
        elif bigg_metabolite.get('formulae'):
            metabolite.formula = bigg_metabolite['formulae'][0]
        if 'charge' in bigg_metabolite:
            metabolite.charge = bigg_metabolite['charge']
        elif bigg_metabolite.get('charges'):
            metabolite.charge = bigg_metabolite['charges'][0]

        # Database links are optional as well; only store non-empty ones.
        database_links = bigg_metabolite.get('database_links')
        if database_links:
            metabolite.notes['aliases'] = database_links
        metabolites.append(metabolite)

        if compartment not in model.compartments:
            model.compartments[compartment] = bigg_metabolite.get(
                'compartment_name', 'unknown')

    # Add all of the metabolites to the model.
    model.add_metabolites(metabolites)
예제 #2
0
def test_independent():
    """Check that two separately constructed DictLists do not share contents."""
    populated = DictList([Object("o1"), Object("o2")])
    fresh = DictList()

    # Items given to the first list must not leak into the second ...
    assert "o1" in populated
    assert "o1" not in fresh

    # ... and appending to the second must not affect the first.
    fresh.append(Object("o3"))
    assert "o3" not in populated
    assert "o3" in fresh
예제 #3
0
def test_independent():
    """Verify that DictList instances keep independent membership."""
    first = DictList([Object("o1"), Object("o2")])
    second = DictList()

    # Membership right after construction.
    assert "o1" in first
    assert "o1" not in second

    # Membership after growing only the second list.
    second.append(Object("o3"))
    assert "o3" not in first
    assert "o3" in second
예제 #4
0
    def subunits(self):
        """DictList: One new Protein object per entry obtained by iterating ``subunit_dict``"""

        # TODO: [VizRecon]
        # TODO: will need to adapt this to allow for input of previously created Protein objects

        proteins = DictList()
        for subunit_id in self.subunit_dict:
            # Every subunit shares the complex's directory and PDB file type
            protein = Protein(ident=subunit_id,
                              description='Subunit of complex {}'.format(self.id),
                              root_dir=self.complex_dir,
                              pdb_file_type=self.pdb_file_type)
            proteins.append(protein)
        return proteins
예제 #5
0
def filter_out_spontaneous_genes(genes, custom_spont_id=None):
    """Collect the genes for which ``is_spontaneous`` is false into a new DictList.

    Args:
        genes (DictList): Genes DictList
        custom_spont_id (str): Optional custom spontaneous ID, forwarded to ``is_spontaneous`` as ``custom_id``,
            for IDs that do not match the regular expression ``[Ss](_|)0001``

    Returns:
        DictList: genes excluding ones that are spontaneous, in their original order

    """
    kept = DictList()
    for candidate in genes:
        if is_spontaneous(candidate, custom_id=custom_spont_id):
            continue
        kept.append(candidate)
    return kept
예제 #6
0
    def filter_reaction_by_subsystems(self):
        """Pick up to three reactions from every subsystem.

        Within a subsystem, reactions are ranked by how many other reactions
        their metabolites take part in (summed over all metabolites of the
        reaction), highest first.

        Returns:
            DictList: the selected reactions

        """
        def other_reaction_count(rxn):
            # For each metabolite, count the reactions it appears in apart from rxn itself
            total = 0
            for met in rxn.metabolites:
                total += len([other for other in met.reactions if other != rxn])
            return total

        # Group reactions by their subsystem attribute
        by_subsystem = {}
        for rxn in self.reactions:
            by_subsystem.setdefault(rxn.subsystem, []).append(rxn)

        selected = DictList()
        for members in by_subsystem.values():
            ranked = sorted(members, key=other_reaction_count, reverse=True)
            for rxn in ranked[:3]:
                selected.append(rxn)
        return selected
예제 #7
0
def add_bigg_reactions(bigg_list, model, ignore_pseudo_reactions=True):
    """ Create COBRA reactions from BiGG reactions and add them to a model.

    Every metabolite referenced by a reaction is looked up in
    ``model.metabolites`` by the ID ``<bigg_id>_<compartment_bigg_id>``, so the
    metabolites must be in the model before this function is called.

    Parameters
    ----------
    bigg_list : list of dict
        List of dictionaries with BiGG reaction data
    model : cobra.core.Model
        Model to add reactions to
    ignore_pseudo_reactions : bool, optional
        When True, do not include pseudo reactions
    """

    # Create a Reaction object for each BiGG reaction.
    reactions = DictList()
    for bigg_reaction in bigg_list:
        if bigg_reaction['pseudoreaction'] and ignore_pseudo_reactions:
            continue
        reaction = Reaction(id=bigg_reaction['bigg_id'],
                            name=bigg_reaction['name'])
        # Database links are optional; skip them rather than fail when absent.
        if 'database_links' in bigg_reaction:
            reaction.notes['aliases'] = bigg_reaction['database_links']
        metabolites = dict()
        for met in bigg_reaction['metabolites']:
            metabolite = model.metabolites.get_by_id('{0}_{1}'.format(
                met['bigg_id'], met['compartment_bigg_id']))
            metabolites[metabolite] = met['stoichiometry']
        reaction.add_metabolites(metabolites)
        try:
            reaction.bounds = (bigg_reaction['results'][0]['lower_bound'],
                               bigg_reaction['results'][0]['upper_bound'])
        except (KeyError, IndexError):
            # No usable bounds (missing keys or an empty 'results' list): fall
            # back to the reaction string. '&#8652' is the HTML entity for the
            # reversible-reaction arrow.
            if '&#8652' in bigg_reaction['reaction_string']:
                reaction.bounds = (-1000.0, 1000.0)
            else:
                # Bounds are left at whatever Reaction() set by default.
                warn('Unknown direction symbol in reaction string {0}'.format(
                    bigg_reaction['reaction_string']))
        reactions.append(reaction)

    # Add all of the reactions to the model.
    model.add_reactions(reactions)
예제 #8
0
class StructProp(Object):

    """Generic class to represent information for a protein structure.

    The main utilities of this class are to:

    * Provide access to the 3D coordinates using a Biopython Structure object through the method ``parse_structure``.
    * Run predictions and computations on the structure
    * Analyze specific chains using the ``mapped_chains`` attribute
    * Provide wrapper methods to ``nglview`` to view the structure in a Jupyter notebook

    Args:
        ident (str): Unique identifier for this structure
        description (str): Optional human-readable description
        chains (str, list): Chain ID or list of IDs
        mapped_chains (str, list): A chain ID or IDs to indicate what chains should be analyzed
        is_experimental (bool): Flag to indicate if structure is an experimental or computational model
        structure_path (str): Path to structure file
        file_type (str): Type of structure file - ``pdb``, ``pdb.gz``, ``mmcif``, ``cif``, ``cif.gz``,
            ``xml.gz``, ``mmtf``, ``mmtf.gz``

    """

    def __init__(self, ident, description=None, chains=None, mapped_chains=None,
                 is_experimental=False, structure_path=None, file_type=None):
        """Set up identifiers, chain containers and (optionally) the structure file location.

        Args:
            ident (str): Passed to ``Object.__init__`` as ``id``
            description (str): Passed to ``Object.__init__`` as ``description``
            chains (str, list): If given, forwarded to ``add_chain_ids``
            mapped_chains (str, list): If given, forwarded to ``add_mapped_chain_ids``
            is_experimental (bool): Stored as ``is_experimental``
            structure_path (str): If given, forwarded to ``load_structure_path`` together with ``file_type``
            file_type (str): Stored as ``file_type``

        """
        Object.__init__(self, id=ident, description=description)

        self.is_experimental = is_experimental
        """bool: Flag to note if this structure is an experimental model or a homology model"""

        # Chain information
        # chains is a DictList of ChainProp objects
        # If you run self.parse_structure(), all chains will be parsed and stored here
        # Use mapped_chains below to keep track of chains you are interested in
        # NOTE: self.chains must exist before add_chain_ids() is called, since that method appends to it
        self.chains = DictList()
        """DictList: A DictList of chains have their sequence stored in them, along with residue-specific"""
        if chains:
            self.add_chain_ids(chains)
        # mapped_chains is an ordered list of mapped chain IDs which would come from BLAST or the best_structures API
        # NOTE: likewise, self.mapped_chains must exist before add_mapped_chain_ids() appends to it
        self.mapped_chains = []
        """list: A simple list of chain IDs (strings) that will be used to subset analyses"""
        if mapped_chains:
            self.add_mapped_chain_ids(mapped_chains)

        self.parsed = False
        """bool: Simple flag to track if this structure has had its structure + chain sequences parsed"""
        # XTODO: rename to sequence_parsed or something similar

        # File information
        # Defaults are assigned first; load_structure_path() below overrides file_type, the structure
        # folder and structure_file when a structure_path was supplied
        self.file_type = file_type
        """str: Type of structure file"""
        self._structure_dir = None
        self.structure_file = None
        """str: Name of the structure file"""
        if structure_path:
            self.load_structure_path(structure_path, file_type)

        # Stays None until parse_structure(store_in_memory=True) stores the parsed object here
        self.structure = None
        """Structure: Biopython Structure object, only used if ``store_in_memory`` option of ``parse_structure`` is set to True"""

    @property
    def structure_dir(self):
        """str: Folder containing the structure file. Reading it raises OSError while no folder is set."""
        if self._structure_dir:
            return self._structure_dir
        raise OSError('No structure folder set')

    @structure_dir.setter
    def structure_dir(self, path):
        # A falsy path (None or '') is stored without any check; any other value must already exist
        is_missing = bool(path) and not op.exists(path)
        if is_missing:
            raise OSError('{}: folder does not exist'.format(path))
        self._structure_dir = path

    @property
    def structure_path(self):
        """str: ``structure_dir`` joined with ``structure_file``.

        Raises OSError when no structure file is set and ValueError when the joined path does not exist.
        """
        if self.structure_file:
            full_path = op.join(self.structure_dir, self.structure_file)
            if op.exists(full_path):
                return full_path
            raise ValueError('{}: file does not exist'.format(full_path))
        raise OSError('{}: structure file not available'.format(self.id))

    def load_structure_path(self, structure_path, file_type):
        """Load a structure file and provide pointers to its location

        Sets ``file_type``, ``structure_dir`` (the folder part of the path) and ``structure_file`` (the file name
        part of the path).

        Args:
            structure_path (str): Path to structure file
            file_type (str): Type of structure file

        Raises:
            ValueError: If ``file_type`` is empty or None

        """

        if not file_type:
            raise ValueError('File type must be specified')

        self.file_type = file_type
        # A bare file name has an empty dirname. Storing '' as the folder makes every later read of
        # structure_dir raise 'No structure folder set', so fall back to the current directory instead.
        self.structure_dir = op.dirname(structure_path) or op.curdir
        self.structure_file = op.basename(structure_path)

    def parse_structure(self, store_in_memory=False):
        """Load the structure file through ``StructureIO`` and register the chains of its first model.

        Chain IDs of the first model are added to ``chains``, their sequences are gathered with
        ``get_structure_seqs``, and the same IDs are added to ``mapped_chains`` if that list is still empty.
        The ``parsed`` flag is set to True afterwards.

        Args:
            store_in_memory (bool): If the loaded object should also be kept in the attribute ``structure``.

        Returns:
            The ``StructureIO`` object that was loaded, or None if no structure file is set

        """
        # TODO: perhaps add option to parse into ProDy object?
        if not self.structure_file:
            log.error('{}: no structure file, unable to parse'.format(self.id))
            return None

        loaded = StructureIO(self.structure_path, self.file_type)

        # Every chain of the first model becomes a ChainProp in self.chains
        chain_ids = [child.id for child in loaded.first_model.child_list]
        self.add_chain_ids(chain_ids)
        self.get_structure_seqs(loaded.first_model)

        # Chains are only copied into self.mapped_chains when the user has not specified any
        if not self.mapped_chains:
            self.add_mapped_chain_ids(chain_ids)

        self.parsed = True

        if store_in_memory:
            self.structure = loaded

        return loaded

    def clean_structure(self, out_suffix='_clean', outdir=None, force_rerun=False,
                        remove_atom_alt=True, keep_atom_alt_id='A',remove_atom_hydrogen=True,  add_atom_occ=True,
                        remove_res_hetero=True, keep_chemicals=None, keep_res_only=None,
                        add_chain_id_if_empty='X', keep_chains=None):
        """Run ``clean_pdb`` on this structure's file and return whatever path it reports.

        All arguments are forwarded unchanged to ``ssbio.protein.structure.utils.cleanpdb.clean_pdb``.

        Args:
            out_suffix (str): Suffix to append to original filename
            outdir (str): Path to output directory
            force_rerun (bool): If structure should be re-cleaned if a clean file exists already
            remove_atom_alt (bool): Remove alternate positions
            keep_atom_alt_id (str): If removing alternate positions, which alternate ID to keep
            remove_atom_hydrogen (bool): Remove hydrogen atoms
            add_atom_occ (bool): Add atom occupancy fields if not present
            remove_res_hetero (bool): Remove all HETATMs
            keep_chemicals (str, list): If removing HETATMs, keep specified chemical names
            keep_res_only (str, list): Keep ONLY specified resnames, deletes everything else!
            add_chain_id_if_empty (str): Add a chain ID if not present
            keep_chains (str, list): Keep only these chains

        Returns:
            str: Path to cleaned PDB file, or None if no structure file is set

        """

        if not self.structure_file:
            log.error('{}: no structure file, unable to clean'.format(self.id))
            return None

        clean_pdb = ssbio.protein.structure.utils.cleanpdb.clean_pdb
        return clean_pdb(self.structure_path,
                         out_suffix=out_suffix,
                         outdir=outdir,
                         force_rerun=force_rerun,
                         remove_atom_alt=remove_atom_alt,
                         remove_atom_hydrogen=remove_atom_hydrogen,
                         keep_atom_alt_id=keep_atom_alt_id,
                         add_atom_occ=add_atom_occ,
                         remove_res_hetero=remove_res_hetero,
                         keep_chemicals=keep_chemicals,
                         keep_res_only=keep_res_only,
                         add_chain_id_if_empty=add_chain_id_if_empty,
                         keep_chains=keep_chains)

    def add_mapped_chain_ids(self, mapped_chains):
        """Append chain IDs to the ``mapped_chains`` list, skipping IDs that are already in it

        Args:
            mapped_chains (str, list): Chain ID or list of IDs

        """
        for chain_id in ssbio.utils.force_list(mapped_chains):
            if chain_id in self.mapped_chains:
                log.debug('{}: chain already in list of mapped chains, not adding'.format(chain_id))
                continue
            self.mapped_chains.append(chain_id)
            log.debug('{}: added to list of mapped chains'.format(chain_id))

    def add_chain_ids(self, chains):
        """Create a ``ChainProp`` in the ``chains`` attribute for every chain ID not stored there yet

        Args:
            chains (str, list): Chain ID or list of IDs

        """
        for chain_id in ssbio.utils.force_list(chains):
            if not self.chains.has_id(chain_id):
                # New chains record this structure's ID as their parent
                self.chains.append(ChainProp(ident=chain_id, pdb_parent=self.id))
                log.debug('{}: added to chains list'.format(chain_id))
            else:
                log.debug('{}: chain already present'.format(chain_id))

    def get_structure_seqs(self, model):
        """Gather chain sequences and store in their corresponding ``ChainProp`` objects in the ``chains`` attribute.

        Chains that already hold a ``seq_record`` are left untouched, so anything previously stored on that record
        is kept. If every chain of the model already has one, nothing is computed at all.

        Args:
            model (Model): Biopython Model object of the structure you would like to parse

        """

        # Don't overwrite existing ChainProp sequence records
        dont_overwrite = []
        chains = list(model.get_chains())
        for x in chains:
            if self.chains.has_id(x.id):
                if self.chains.get_by_id(x.id).seq_record:
                    dont_overwrite.append(x.id)
        if len(dont_overwrite) == len(chains):
            log.debug('Not writing structure sequences, already stored')
            return

        # Returns the structures sequences with Xs added
        structure_seqs = ssbio.protein.structure.properties.residues.get_structure_seqrecords(model)
        log.debug('{}: gathered chain sequences'.format(self.id))

        # Associate with ChainProps
        for seq_record in structure_seqs:
            # Previously dont_overwrite was collected but never checked here, so a partially populated
            # structure had all of its existing records replaced.
            if seq_record.id in dont_overwrite:
                log.debug('{}: chain sequence already stored, not overwriting'.format(seq_record.id))
                continue
            log.debug('{}: adding chain sequence to ChainProp'.format(seq_record.id))
            my_chain = self.chains.get_by_id(seq_record.id)
            my_chain.seq_record = seq_record

    def reset_chain_seq_records(self):
        """Call ``reset_seq_record()`` on every chain in the ``chains`` attribute."""
        for chain_prop in self.chains:
            chain_prop.reset_seq_record()

    def get_dict_with_chain(self, chain, only_keys=None, chain_keys=None, exclude_attributes=None, df_format=False):
        """get_dict method which incorporates attributes found in a specific chain. Does not overwrite any attributes
        in the original StructProp unless ``chain_keys`` explicitly names an attribute that is also returned for
        the StructProp.

        Args:
            chain (str): ID of the chain, looked up in the ``chains`` attribute
            only_keys (str, list): StructProp attributes to return; every attribute of this object if not given
            chain_keys (str, list): Chain attributes to add; if not given, all chain attributes whose names are
                not already in the StructProp result
            exclude_attributes (str, list): StructProp attributes to leave out. ``mapped_chains`` and ``chains``
                are always excluded.
            df_format (bool): Forwarded to both ``get_dict`` calls

        Returns:
            dict: attributes of StructProp + the chain specified

        """

        # Choose attributes to return, return everything in the object if a list is not specified.
        # Work on copies: the lists below are modified and must not leak changes back to the caller's arguments.
        if not only_keys:
            keys = list(self.__dict__.keys())
        else:
            keys = list(ssbio.utils.force_list(only_keys))

        # Remove keys you don't want returned
        if exclude_attributes:
            exclude_attributes = list(ssbio.utils.force_list(exclude_attributes))
            for x in exclude_attributes:
                if x in keys:
                    keys.remove(x)
        else:
            exclude_attributes = []

        exclude_attributes.extend(['mapped_chains', 'chains'])

        final_dict = dict(Object.get_dict(self, only_attributes=keys, exclude_attributes=exclude_attributes,
                                          df_format=df_format))

        chain_prop = self.chains.get_by_id(chain)
        # Filter out keys that show up in StructProp
        if not chain_keys:
            chain_keys = [x for x in chain_prop.get_dict().keys() if x not in final_dict]

        chain_dict = chain_prop.get_dict(only_attributes=chain_keys, df_format=df_format)
        final_dict.update(chain_dict)

        return final_dict

    def find_disulfide_bridges(self, threshold=3.0):
        """Run Biopython's search_ss_bonds to find potential disulfide bridges for each chain and store in ChainProp.

        Will add a list of tuple pairs into the annotations field (key ``SSBOND-biopython``), looks like this::

            [ ((' ', 79, ' '), (' ', 110, ' ')),
              ((' ', 174, ' '), (' ', 180, ' ')),
              ((' ', 369, ' '), (' ', 377, ' '))]

        Where each pair is a pair of cysteine residues close together in space.

        Args:
            threshold (float): Forwarded to ``search_ss_bonds`` (presumably a distance cutoff - confirm units there)

        """

        # Reuse the structure kept in memory if there is one, otherwise parse the file now
        if self.structure:
            parsed = self.structure
        else:
            parsed = self.parse_structure()

        if not parsed:
            log.error('{}: unable to open structure to find S-S bridges'.format(self.id))
            return

        disulfide_bridges = ssbio.protein.structure.properties.residues.search_ss_bonds(parsed.first_model,
                                                                                        threshold=threshold)
        if not disulfide_bridges:
            log.debug('{}: no disulfide bridges found'.format(self.id))
            # Nothing to store; also avoids calling .items() on a None result
            return

        for chain, bridges in disulfide_bridges.items():
            self.chains.get_by_id(chain).seq_record.annotations['SSBOND-biopython'] = bridges
            log.debug('{}: found {} disulfide bridges'.format(chain, len(bridges)))
            log.debug('{}: stored disulfide bridges in the chain\'s seq_record annotations'.format(chain))

    def get_dssp_annotations(self, outdir, force_rerun=False):
        """Run DSSP on this structure and store the DSSP annotations in the corresponding ChainProp SeqRecords

        Per-residue values are stored in the ChainProp's ``letter_annotations`` at the following keys:

            * ``SS-dssp``
            * ``RSA-dssp``
            * ``ASA-dssp``
            * ``PHI-dssp``
            * ``PSI-dssp``

        The per-chain entry of the secondary structure summary is merged into the SeqRecord's ``annotations``.

        Args:
            outdir (str): Path to where DSSP dataframe will be stored.
            force_rerun (bool): If DSSP results should be recalculated

        TODO:
            * Also parse global properties, like total accessible surface area. Don't think Biopython parses those?

        """
        # Reuse the structure kept in memory if there is one, otherwise parse the file now
        parsed = self.structure if self.structure else self.parse_structure()

        if not parsed:
            log.error('{}: unable to open structure to run DSSP'.format(self.id))
            return

        log.debug('{}: running DSSP'.format(self.id))
        dssp_module = ssbio.protein.structure.properties.dssp
        dssp_results = dssp_module.get_dssp_df(model=parsed.first_model,
                                               pdb_file=self.structure_path,
                                               outdir=outdir,
                                               force_rerun=force_rerun)

        if dssp_results.empty:
            log.error('{}: unable to run DSSP'.format(self.id))
            return

        chains = dssp_results.chain.unique()
        dssp_summary = dssp_module.secondary_structure_summary(dssp_results)
        match_seq = ssbio.protein.structure.properties.residues.match_structure_sequence

        # (DataFrame column, letter_annotations key, filler for positions without a value)
        per_residue = [('ss', 'SS-dssp', '-'),
                       ('exposure_rsa', 'RSA-dssp', float('Inf')),
                       ('exposure_asa', 'ASA-dssp', float('Inf')),
                       ('phi', 'PHI-dssp', float('Inf')),
                       ('psi', 'PSI-dssp', float('Inf'))]

        for chain in chains:
            chain_rows = dssp_results[dssp_results.chain == chain]
            raw_values = [(key, chain_rows[column].tolist(), filler) for column, key, filler in per_residue]

            chain_prop = self.chains.get_by_id(chain)
            chain_seq = chain_prop.seq_record

            # Making sure the X's are filled in. All lists are matched before anything is stored.
            matched = [(key, match_seq(orig_seq=chain_seq, new_seq=values, fill_with=filler))
                       for key, values, filler in raw_values]

            chain_prop.seq_record.annotations.update(dssp_summary[chain])

            for key, values in matched:
                chain_prop.seq_record.letter_annotations[key] = values
            log.debug('{}: stored DSSP annotations in chain seq_record letter_annotations'.format(chain))

    def get_msms_annotations(self, outdir, force_rerun=False):
        """Run MSMS on this structure and store the residue depths/ca depths in the corresponding ChainProp SeqRecords

        Values are stored in each chain's ``letter_annotations`` under ``RES_DEPTH-msms`` and ``CA_DEPTH-msms``.

        Args:
            outdir (str): Forwarded to ``get_msms_df``
            force_rerun (bool): Forwarded to ``get_msms_df``

        """
        # No file type check is done here: per the original author's note, this can now run on Biopython
        # Model objects exclusively, so the former "pdb only" restriction was dropped.

        # Reuse the structure kept in memory if there is one, otherwise parse the file now
        parsed = self.structure if self.structure else self.parse_structure()

        if not parsed:
            log.error('{}: unable to open structure to run MSMS'.format(self.id))
            return

        log.debug('{}: running MSMS'.format(self.id))
        # PDB ID is currently set to the structure file so the output name is the same with _msms.df appended to it
        msms_results = ssbio.protein.structure.properties.msms.get_msms_df(model=parsed.first_model,
                                                                           pdb_id=self.structure_path,
                                                                           outdir=outdir,
                                                                           force_rerun=force_rerun)
        if msms_results.empty:
            log.error('{}: unable to run MSMS'.format(self.id))
            return

        match_seq = ssbio.protein.structure.properties.residues.match_structure_sequence

        for chain in msms_results.chain.unique():
            chain_rows = msms_results[msms_results.chain == chain]
            raw_res_depths = chain_rows['res_depth'].tolist()
            raw_ca_depths = chain_rows['ca_depth'].tolist()

            chain_prop = self.chains.get_by_id(chain)
            chain_seq = chain_prop.seq_record

            # Making sure the X's are filled in
            res_depths = match_seq(orig_seq=chain_seq, new_seq=raw_res_depths, fill_with=float('Inf'))
            ca_depths = match_seq(orig_seq=chain_seq, new_seq=raw_ca_depths, fill_with=float('Inf'))

            annotations = chain_prop.seq_record.letter_annotations
            annotations['RES_DEPTH-msms'] = res_depths
            annotations['CA_DEPTH-msms'] = ca_depths
            log.debug('{}: stored residue depths in chain seq_record letter_annotations'.format(chain))

    def get_freesasa_annotations(self, outdir, include_hetatms=False, force_rerun=False):
        """Run ``freesasa`` on this structure and store the calculated properties in the corresponding ChainProps

        Results are written to each chain's ``letter_annotations``. Key names end in ``_het`` or ``_nohet``
        depending on ``include_hetatms``. Only the ``pdb`` file type is accepted.

        Args:
            outdir (str): Forwarded to ``run_freesasa``
            include_hetatms (bool): Forwarded to ``run_freesasa``; also selects the output file name and key suffix
            force_rerun (bool): Forwarded to ``run_freesasa``

        """
        if self.file_type != 'pdb':
            log.error('{}: unable to run freesasa with "{}" file type. Please change file type to "pdb"'.format(self.id,
                                                                                                                self.file_type))
            return

        # Parse the structure to store chain sequences
        parsed = self.structure if self.structure else self.parse_structure()

        if not parsed:
            log.error('{}: unable to open structure to run freesasa'.format(self.id))
            return

        # Set outfile name
        log.debug('{}: running freesasa'.format(self.id))
        outfile_template = '{}.freesasa_het.rsa' if include_hetatms else '{}.freesasa_nohet.rsa'
        outfile = outfile_template.format(self.id)

        # Run freesasa
        result = fs.run_freesasa(infile=self.structure_path,
                                 outfile=outfile,
                                 include_hetatms=include_hetatms,
                                 outdir=outdir,
                                 force_rerun=force_rerun)

        # Parse results: regroup into chain ID -> property name -> list of values.
        # The first element of every parsed key is used as the chain ID.
        result_parsed = fs.parse_rsa_data(result)
        prop_dict = defaultdict(lambda: defaultdict(list))
        for residue_key, calculations in result_parsed.items():
            chain_id = residue_key[0]
            for prop, calc in calculations.items():
                prop_dict[chain_id][prop].append(calc)

        # Reorganize and store results
        all_props = ['all_atoms_abs', 'all_atoms_rel', 'side_chain_abs', 'side_chain_rel', 'main_chain_abs',
                     'main_chain_rel', 'non_polar_abs', 'non_polar_rel', 'all_polar_abs', 'all_polar_rel']
        base_names = {'all_atoms_abs' : 'ASA_ALL-freesasa',
                      'all_atoms_rel' : 'RSA_ALL-freesasa',
                      'all_polar_abs' : 'ASA_POLAR-freesasa',
                      'all_polar_rel' : 'RSA_POLAR-freesasa',
                      'main_chain_abs': 'ASA_BACKBONE-freesasa',
                      'main_chain_rel': 'RSA_BACKBONE-freesasa',
                      'non_polar_abs' : 'ASA_NONPOLAR-freesasa',
                      'non_polar_rel' : 'RSA_NONPOLAR-freesasa',
                      'side_chain_abs': 'ASA_RESIDUE-freesasa',
                      'side_chain_rel': 'RSA_RESIDUE-freesasa'}

        ## Rename dictionary keys based on if HETATMs were included
        suffix = '_het' if include_hetatms else '_nohet'
        annotation_names = {prop: name + suffix for prop, name in base_names.items()}

        match_seq = ssbio.protein.structure.properties.residues.match_structure_sequence
        for chain in self.chains:
            for prop in all_props:
                matched_values = match_seq(orig_seq=chain.seq_record,
                                           new_seq=prop_dict[chain.id][prop],
                                           fill_with=float('Inf'),
                                           ignore_excess=True)
                chain.seq_record.letter_annotations[annotation_names[prop]] = matched_values
            log.debug('{}: stored freesasa calculations in chain seq_record letter_annotations'.format(chain))

    def view_structure(self, only_chains=None, opacity=1.0, recolor=False, gui=False):
        """Use NGLviewer to display a structure in a Jupyter notebook

        Args:
            only_chains (str, list): Chain ID or IDs to display
            opacity (float): Opacity of the structure
            recolor (bool): If structure should be cleaned and recolored to silver
            gui (bool): If the NGLview GUI should show up (not applied to ``mmtf`` / ``mmtf.gz`` files, which are
                loaded through ``NGLWidget.add_component``)

        Returns:
            NGLviewer object

        Raises:
            EnvironmentError: If not running in a Jupyter notebook environment
            ValueError: If no structure file is loaded

        """
        # TODO: show_structure_file does not work for MMTF files - need to check for that and load accordingly

        if ssbio.utils.is_ipynb():
            import nglview as nv
        else:
            raise EnvironmentError('Unable to display structure - not running in a Jupyter notebook environment')

        if not self.structure_file:
            raise ValueError("Structure file not loaded")

        only_chains = ssbio.utils.force_list(only_chains)
        # Build "( :A or :B )". The previous code appended ':X or' per chain and then called
        # str.strip(' or '), which strips the *characters* ' ', 'o' and 'r' from both ends: chain IDs made of
        # those letters were eaten, and consecutive chains were glued together as ':A or:B'.
        to_show_chains = '( {} )'.format(' or '.join(':{}'.format(c) for c in only_chains))

        if self.file_type == 'mmtf' or self.file_type == 'mmtf.gz':
            view = nv.NGLWidget()
            view.add_component(self.structure_path)
        else:
            view = nv.show_structure_file(self.structure_path, gui=gui)

        # Default representations are only replaced when recoloring or when restricting to specific chains
        if recolor or only_chains:
            view.clear_representations()
            if only_chains:
                selection = '{} and (not hydrogen)'.format(to_show_chains)
            else:
                selection = 'protein'
            view.add_cartoon(selection=selection, color='silver', opacity=opacity)

        return view

    def add_residues_highlight_to_nglview(self, view, structure_resnums, chain=None, res_color='red'):
        """Add a residue number or numbers to an NGLWidget view object.

        Args:
            view (NGLWidget): NGLWidget view object
            structure_resnums (int, list): Residue number(s) to highlight, structure numbering
            chain (str, list): Chain ID or IDs of which residues are a part of. If not provided, all chains in the
                mapped_chains attribute will be used. If that is also empty, an exception is raised.
            res_color (str): Color to highlight residues with

        Raises:
            ValueError: If no chain is given or mapped, or if structure_resnums is neither an int nor a list

        """
        if not chain:
            chain = self.mapped_chains
            if not chain:
                raise ValueError('Please input chain ID to display residue on')

        # Wrap a bare chain ID so that a multi-character ID is not iterated character by character
        if not isinstance(chain, (list, tuple, set)):
            chain = [chain]

        if isinstance(structure_resnums, list):
            # Duplicates are dropped; sorting keeps the selection string deterministic
            structure_resnums = sorted(set(structure_resnums))
        elif isinstance(structure_resnums, int):
            structure_resnums = ssbio.utils.force_list(structure_resnums)
        else:
            raise ValueError('Input must either be a residue number or a list of residue numbers')

        # Join with ' or ' rather than appending and stripping afterwards: str.strip(' or ') removes any
        # run of the characters ' ', 'o' and 'r', which mangles chain IDs such as 'o' or 'r'
        to_show_chains = '( {} )'.format(' or '.join(':{}'.format(c) for c in chain))
        to_show_res = '( {} )'.format(' or '.join('{}'.format(m) for m in structure_resnums))

        selection = '{} and not hydrogen and {}'.format(to_show_chains, to_show_res)
        log.info('Selection: {}'.format(selection))

        view.add_ball_and_stick(selection=selection, color=res_color)

    def add_scaled_residues_highlight_to_nglview(self, view, structure_resnums, chain=None, color='red',
                                                 unique_colors=False, opacity_range=(0.5,1), scale_range=(.7, 10)):
        """Add a list of residue numbers (which may contain repeating residues) to a view, or add a dictionary of
            residue numbers to counts. Size and opacity of added residues are scaled by counts.

        Args:
            view (NGLWidget): NGLWidget view object
            structure_resnums (int, list, dict): Residue number(s) to highlight, or a dictionary of residue number to
                frequency count
            chain (str, list): Chain ID or IDs of which residues are a part of. If not provided, all chains in the
                mapped_chains attribute will be used. If that is also empty, an exception is raised.
            color (str): Color to highlight residues with
            unique_colors (bool): If each mutation should be colored uniquely (will override color argument)
            opacity_range (tuple): Min/max opacity values (residues that have higher frequency counts will be opaque)
            scale_range (tuple): Min/max size values (residues that have higher frequency counts will be bigger)

        Raises:
            ValueError: If no chain is given or mapped, or if structure_resnums is not an int, list or dict

        """
        # TODO: likely to move these functions to a separate nglview/utils folder since they are not coupled to the structure
        # TODO: add color by letter_annotations!
        if not chain:
            chain = self.mapped_chains
            if not chain:
                raise ValueError('Please input chain ID to display residue on')
        else:
            chain = ssbio.utils.force_list(chain)

        if isinstance(structure_resnums, dict):
            # Counts are rescaled into the requested opacity and size ranges
            opacity_dict = ssbio.utils.scale_calculator(opacity_range[0], structure_resnums, rescale=opacity_range)
            scale_dict = ssbio.utils.scale_calculator(scale_range[0], structure_resnums, rescale=scale_range)
        else:
            # Without counts every residue gets the maximum opacity and size
            opacity_dict = {x: max(opacity_range) for x in ssbio.utils.force_list(structure_resnums)}
            scale_dict = {x: max(scale_range) for x in ssbio.utils.force_list(structure_resnums)}

        if isinstance(structure_resnums, list):
            structure_resnums = list(set(structure_resnums))
        elif isinstance(structure_resnums, dict):
            structure_resnums = list(structure_resnums.keys())
        elif isinstance(structure_resnums, int):
            structure_resnums = ssbio.utils.force_list(structure_resnums)
        else:
            raise ValueError('Input must either be a list of residue numbers or a dictionary of residue numbers '
                             'and their frequency.')

        colors = sns.color_palette("hls", len(structure_resnums)).as_hex()

        # Join with ' or ' rather than appending and stripping afterwards: str.strip(' or ') removes any
        # run of the characters ' ', 'o' and 'r', which mangles chain IDs such as 'o' or 'r', and the
        # old ':{} or' pattern produced ':A or:B'
        to_show_chains = '( {} )'.format(' or '.join(':{}'.format(c) for c in chain))

        for i, x in enumerate(structure_resnums):
            if isinstance(x, tuple):
                # A tuple entry groups several residues into one highlighted selection
                to_show_res = '( {} )'.format(' or '.join('{}'.format(mut) for mut in x))
            else:
                to_show_res = x

            selection = '{} and not hydrogen and {}'.format(to_show_chains, to_show_res)
            log.info('Selection: {}'.format(selection))

            view.add_ball_and_stick(selection=selection,
                                    color=colors[i] if unique_colors else color,
                                    opacity=opacity_dict[x], scale=scale_dict[x])

    def __json_decode__(self, **attrs):
        """Set instance attributes from decoded keyword arguments.

        The value stored under ``chains`` is wrapped in a DictList; every other value is assigned as given.
        """
        for attr_name, value in attrs.items():
            restored = DictList(value) if attr_name == 'chains' else value
            setattr(self, attr_name, restored)
예제 #9
0
def compare_reactions(reaction1,
                      reaction2,
                      details=None,
                      id1='first',
                      id2='second'):
    """ Compare two lists of cobra.core.Reaction objects and report differences.

    To determine if two reactions are the same, the function compares the following
    attributes: (1) ID {'reaction_id'}, (2) name {'reaction_name'}, (3) bounds
    {'reaction_bounds'}, (4) definition {'reaction_definition'}, (5) gene reaction
    rule {'reaction_gpr'}. Include the value in {} in the details parameter to
    display the details of reactions where the values are different.

    Two definitions are considered different when the reaction strings differ and
    either the sets of metabolite IDs differ or a shared metabolite has a
    coefficient that is not close to its counterpart.

    Parameters
    ----------
    reaction1 : cobra.core.DictList
        First list of cobra.core.Reaction objects to analyze
    reaction2 : cobra.core.DictList
        Second list of cobra.core.Reaction objects to analyze
    details : set, optional
        When specified, print details on given types of differences
    id1 : str, optional
        ID for labeling first list of reactions
    id2 : str, optional
        ID for labeling second list of reactions
    """

    if details is None:
        details = set()

    print('REACTIONS\n' + '---------')
    print('{0} reactions in {1}'.format(len(reaction1), id1))
    print('{0} reactions in {1}\n'.format(len(reaction2), id2))

    # See if reactions from first model are in the second model.
    num_matched = 0
    reaction_only_in_one = DictList()
    different_name = DictList()
    different_bounds = DictList()
    different_definition = DictList()
    different_genes = DictList()
    for r1 in reaction1:
        # Only the ID lookup is guarded: a KeyError raised while comparing attributes
        # must not reclassify a matched reaction as missing from the second list.
        try:
            r2 = reaction2.get_by_id(r1.id)
        except KeyError:
            reaction_only_in_one.append(r1)
            continue
        num_matched += 1
        if r1.name != r2.name:
            different_name.append(r1)
        if r1.bounds != r2.bounds:
            different_bounds.append(r1)
        if r1.reaction != r2.reaction:
            # The reaction strings can differ in formatting only, so compare the
            # stoichiometry by metabolite ID in both directions.
            coefficients1 = {met.id: value
                             for met, value in iteritems(r1.metabolites)}
            coefficients2 = {met.id: value
                             for met, value in iteritems(r2.metabolites)}
            if set(coefficients1) != set(coefficients2) or any(
                    not isclose(coefficients2[met_id], value)
                    for met_id, value in iteritems(coefficients1)):
                different_definition.append(r1)
        if r1.gene_reaction_rule != r2.gene_reaction_rule:
            different_genes.append(r1)
    print('{0} reactions in both {1} and {2}'.format(num_matched, id1, id2))
    print('{0} reactions only in {1}\n'.format(len(reaction_only_in_one), id1))

    # If requested, show the details on reactions only in the first model.
    if 'reaction_id' in details and len(reaction_only_in_one) > 0:
        reaction_only_in_one.sort(key=lambda x: x.id)
        output = [[rxn.id,
                   format_long_string(rxn.name, 20), rxn.reaction]
                  for rxn in reaction_only_in_one]
        print(
            tabulate(output, tablefmt='simple', headers=reaction_header) +
            '\n')

    # See if reactions from second model are in the first model.
    num_matched = 0
    reaction_only_in_two = DictList()
    for r2 in reaction2:
        if reaction1.has_id(r2.id):
            num_matched += 1
        else:
            reaction_only_in_two.append(r2)
    print('{0} reactions in both {1} and {2}'.format(num_matched, id1, id2))
    print('{0} reactions only in {1}\n'.format(len(reaction_only_in_two), id2))

    # If requested, show the details on reactions only in the second model.
    if 'reaction_id' in details and len(reaction_only_in_two) > 0:
        reaction_only_in_two.sort(key=lambda x: x.id)
        output = [[rxn.id,
                   format_long_string(rxn.name, 20), rxn.reaction]
                  for rxn in reaction_only_in_two]
        print('\n' +
              tabulate(output, tablefmt='simple', headers=reaction_header) +
              '\n')

    # Display details on reaction attribute differences.
    print('{0} reactions with different names'.format(len(different_name)))
    if 'reaction_name' in details and len(different_name) > 0:
        different_name.sort(key=lambda x: x.id)
        output = [[rxn.id, rxn.name,
                   reaction2.get_by_id(rxn.id).name] for rxn in different_name]
        print('\n' +
              tabulate(output, tablefmt='simple', headers=difference_header) +
              '\n')
    print('{0} reactions with different bounds'.format(len(different_bounds)))
    if 'reaction_bounds' in details and len(different_bounds) > 0:
        different_bounds.sort(key=lambda x: x.id)
        output = [[rxn.id, rxn.bounds,
                   reaction2.get_by_id(rxn.id).bounds]
                  for rxn in different_bounds]
        print('\n' +
              tabulate(output, tablefmt='simple', headers=difference_header) +
              '\n')
    print('{0} reactions with different definitions'.format(
        len(different_definition)))
    if 'reaction_definition' in details and len(different_definition) > 0:
        different_definition.sort(key=lambda x: x.id)
        output = [[rxn.id, rxn.reaction,
                   reaction2.get_by_id(rxn.id).reaction]
                  for rxn in different_definition]
        print('\n' +
              tabulate(output, tablefmt='simple', headers=difference_header) +
              '\n')
    print('{0} reactions with different genes'.format(len(different_genes)))
    if 'reaction_gpr' in details and len(different_genes) > 0:
        different_genes.sort(key=lambda x: x.id)
        output = [[
            rxn.id, rxn.gene_reaction_rule,
            reaction2.get_by_id(rxn.id).gene_reaction_rule
        ] for rxn in different_genes]
        print('\n' +
              tabulate(output, tablefmt='simple', headers=difference_header) +
              '\n')
예제 #10
0
def compare_genes(gene1, gene2, details=None, id1='first', id2='second'):
    """ Report how two lists of cobra.core.Gene objects differ.

    Genes are matched by ID {'gene_id'} and matched genes are then compared by
    name {'gene_name'}, ignoring case. Put the value shown in {} into the details
    parameter to also print a table of the genes affected by that kind of
    difference.

    Parameters
    ----------
    gene1 : cobra.core.DictList
        First list of cobra.core.Gene objects to analyze
    gene2 : cobra.core.DictList
        Second list of cobra.core.Gene objects to analyze
    details : set, optional
        When specified, print details on given types of differences
    id1 : str, optional
        ID for labeling first list of genes
    id2 : str, optional
        ID for labeling second list of genes
    """

    def show_table(rows, headers):
        # Every table is set off by a blank line above and below.
        print('\n' + tabulate(rows, tablefmt='simple', headers=headers) + '\n')

    requested = set() if details is None else details

    print('\nGENES\n' + '------')
    print('{0} genes in {1}'.format(len(gene1), id1))
    print('{0} genes in {1}\n'.format(len(gene2), id2))

    # First pass: walk the first list and look each gene up in the second list.
    shared = 0
    first_only = DictList()
    renamed = DictList()
    for gene in gene1:
        try:
            counterpart = gene2.get_by_id(gene.id)
        except KeyError:
            first_only.append(gene)
            continue
        shared += 1
        if gene.name.lower() != counterpart.name.lower():
            renamed.append(gene)
    print('{0} genes in both {1} and {2}'.format(shared, id1, id2))
    print('{0} genes only in {1}\n'.format(len(first_only), id1))
    if 'gene_id' in requested and len(first_only) > 0:
        first_only.sort(key=lambda gene: gene.id)
        show_table([[gene.id, format_long_string(gene.name, 90)]
                    for gene in first_only], gene_header)

    # Second pass: walk the second list and check membership in the first list.
    shared = 0
    second_only = DictList()
    for gene in gene2:
        if not gene1.has_id(gene.id):
            second_only.append(gene)
        else:
            shared += 1
    print('{0} genes in both {1} and {2}'.format(shared, id1, id2))
    print('{0} genes only in {1}\n'.format(len(second_only), id2))
    if 'gene_id' in requested and len(second_only) > 0:
        second_only.sort(key=lambda gene: gene.id)
        show_table([[gene.id, format_long_string(gene.name, 90)]
                    for gene in second_only], gene_header)

    # Finally, summarize the matched genes whose names differ.
    print('{0} genes with different names'.format(len(renamed)))
    if 'gene_name' in requested and len(renamed) > 0:
        renamed.sort(key=lambda gene: gene.id)
        show_table([[gene.id, gene.name, gene2.get_by_id(gene.id).name]
                    for gene in renamed], difference_header)

    return
예제 #11
0
def compare_metabolites(metabolite1,
                        metabolite2,
                        details=None,
                        id1='first',
                        id2='second'):
    """ Report how two lists of cobra.core.Metabolite objects differ.

    Metabolites are matched by ID {'metabolite_id'}; matched metabolites are then
    compared by name {'metabolite_name'}, formula {'metabolite_formula'}, charge
    {'metabolite_charge'}, and compartment {'metabolite_compartment'}. Put the
    value shown in {} into the details parameter to also print a table of the
    metabolites affected by that kind of difference.

    Parameters
    ----------
    metabolite1 : cobra.core.DictList
        First list of cobra.core.Metabolite objects to analyze
    metabolite2 : cobra.core.DictList
        Second list of cobra.core.Metabolite objects to analyze
    details : set, optional
        When specified, print details on given types of differences
    id1 : str, optional
        ID for labeling first list of metabolites
    id2 : str, optional
        ID for labeling second list of metabolites
    """

    def show_table(rows, headers):
        # Every table is set off by a blank line above and below.
        print('\n' + tabulate(rows, tablefmt='simple', headers=headers) + '\n')

    requested = set() if details is None else details

    print('\nMETABOLITES\n' + '-----------')
    print('{0} metabolites in {1}'.format(len(metabolite1), id1))
    print('{0} metabolites in {1}\n'.format(len(metabolite2), id2))

    # First pass: walk the first list and look each metabolite up in the second list.
    shared = 0
    first_only = DictList()
    name_diffs = DictList()
    formula_diffs = DictList()
    charge_diffs = DictList()
    compartment_diffs = DictList()
    for met in metabolite1:
        try:
            counterpart = metabolite2.get_by_id(met.id)
        except KeyError:
            first_only.append(met)
            continue
        shared += 1
        if met.name != counterpart.name:
            name_diffs.append(met)
        if met.formula != counterpart.formula:
            formula_diffs.append(met)
        if met.charge != counterpart.charge:
            charge_diffs.append(met)
        if met.compartment != counterpart.compartment:
            compartment_diffs.append(met)
    print('{0} metabolites in both {1} and {2}'.format(shared, id1, id2))
    print('{0} metabolites only in {1}\n'.format(len(first_only), id1))
    if 'metabolite_id' in requested and len(first_only) > 0:
        first_only.sort(key=lambda met: met.id)
        show_table([[met.id, format_long_string(met.name, 70)]
                    for met in first_only], metabolite_header)

    # Second pass: walk the second list and check membership in the first list.
    shared = 0
    second_only = DictList()
    for met in metabolite2:
        if not metabolite1.has_id(met.id):
            second_only.append(met)
        else:
            shared += 1
    print('{0} metabolites in both {1} and {2}'.format(shared, id1, id2))
    print('{0} metabolites only in {1}\n'.format(len(second_only), id2))
    if 'metabolite_id' in requested and len(second_only) > 0:
        second_only.sort(key=lambda met: met.id)
        show_table([[met.id, format_long_string(met.name, 70)]
                    for met in second_only], metabolite_header)

    # Finally, summarize each kind of attribute difference among matched metabolites.
    print('{0} metabolites with different names'.format(len(name_diffs)))
    if 'metabolite_name' in requested and len(name_diffs) > 0:
        name_diffs.sort(key=lambda met: met.id)
        show_table([[met.id, met.name, metabolite2.get_by_id(met.id).name]
                    for met in name_diffs], difference_header)

    print('{0} metabolites with different formulas'.format(
        len(formula_diffs)))
    if 'metabolite_formula' in requested and len(formula_diffs) > 0:
        formula_diffs.sort(key=lambda met: met.id)
        show_table([[met.id, met.formula,
                     metabolite2.get_by_id(met.id).formula]
                    for met in formula_diffs], difference_header)

    print('{0} metabolites with different charges'.format(
        len(charge_diffs)))
    if 'metabolite_charge' in requested and len(charge_diffs) > 0:
        charge_diffs.sort(key=lambda met: met.id)
        show_table([[met.id, met.charge,
                     metabolite2.get_by_id(met.id).charge]
                    for met in charge_diffs], difference_header)

    print('{0} metabolites with different compartments'.format(
        len(compartment_diffs)))
    if 'metabolite_compartment' in requested and len(compartment_diffs) > 0:
        compartment_diffs.sort(key=lambda met: met.id)
        show_table([[met.id, met.compartment,
                     metabolite2.get_by_id(met.id).compartment]
                    for met in compartment_diffs], difference_header)

    return
예제 #12
0
def dict_list():
    """Return an Object with ID "test1" together with a DictList that contains only that object."""
    member = Object("test1")
    container = DictList()
    container.append(member)
    return member, container
예제 #13
0
def dict_list():
    """Create a one-element DictList and return it along with its only member, an Object with ID "test1"."""
    only_item = Object("test1")
    holder = DictList()
    holder.append(only_item)
    return only_item, holder
예제 #14
0
class ATLAS(Object):
    """Class to represent an ATLAS workflow to carry out multi-strain comparisons

    Main steps are:

    #. Strain-specific model construction based on orthologous genes & systems modeling
    #. Phylogenetic analysis to pick out important genes
    #. GEM-PRO of the "base strain"
    #. Structure property calculation & integrated structural systems analysis

    Each step may generate a report and also request additional files if something is missing

    """
    def __init__(self,
                 atlas_name,
                 root_dir,
                 reference_gempro,
                 reference_genome_path=None,
                 description=None):
        """Prepare a GEM-PRO model for ATLAS analysis

        Args:
            atlas_name (str): Name of your ATLAS project
            root_dir (str): Path to where the folder named after ``atlas_name`` will be created.
            reference_gempro (GEMPRO): GEM-PRO model to use as the reference genome
            reference_genome_path (str): Path to reference genome FASTA file. If not given, the genome path
                already set on ``reference_gempro`` is kept; if that is also missing, the representative
                sequences of ``reference_gempro`` are written to a file and used instead.
            description (str): Optional string to describe your project

        Raises:
            ValueError: If ``root_dir`` is empty or does not exist, or if ``reference_gempro`` has neither a
                model nor any genes

        """
        Object.__init__(self, id=atlas_name, description=description)

        # Create directories
        self._root_dir = None
        self.root_dir = root_dir

        self.strains = DictList()
        self.df_orthology_matrix = pd.DataFrame()
        # Mark if the orthology matrix has gene IDs (thus we need to retrieve seqs from the genome file) or if
        # it is in the orthology matrix itself
        self._orthology_matrix_has_sequences = False

        # Load the GEM-PRO (could be a model, could just be a list of genes)
        self.reference_gempro = reference_gempro
        if reference_genome_path:
            # An explicitly provided genome file always takes precedence
            self.reference_gempro.genome_path = reference_genome_path
            # TODO: must also check if reference_genome_path gene IDs can be matched to the reference_gempro
        elif not self.reference_gempro.genome_path:
            # No genome file anywhere - write all sequences and use that.
            # An existing genome_path on the GEM-PRO is deliberately left untouched when no path is passed in,
            # instead of being overwritten with None.
            self.reference_gempro.genome_path = self.reference_gempro.write_representative_sequences_file(
                outname=self.reference_gempro.id)

        # Also create an attribute holding an empty copy of the reference GEM-PRO
        self._empty_reference_gempro = None
        if self.reference_gempro.model:
            # If there is a SBML model associated with the GEMPRO, copy that model
            self._empty_reference_gempro = GEMPRO(
                gem_name='Copied reference GEM-PRO',
                gem=self.reference_gempro.model.copy())
            # Reset the GenePro attributes
            for x in self._empty_reference_gempro.genes:
                x.reset_protein()
        else:
            # Otherwise, just copy the list of genes over and rename the IDs
            strain_genes = [x.id for x in self.reference_gempro.genes]
            if len(strain_genes) == 0:
                raise ValueError(
                    'GEM-PRO has no genes, unable to run multi-strain analysis'
                )
            self._empty_reference_gempro = GEMPRO(
                gem_name='Copied reference GEM-PRO', genes_list=strain_genes)

    @property
    def root_dir(self):
        """str: Directory in which the ATLAS project folder, named after the ``id`` attribute, is located"""
        return self._root_dir

    @root_dir.setter
    def root_dir(self, path):
        # Reject missing or non-existent locations before touching any state
        if not path:
            raise ValueError('No path specified')
        if not op.exists(path):
            raise ValueError('{}: folder does not exist'.format(path))

        if not self._root_dir:
            log.info('Creating project directory in folder {}'.format(path))
        else:
            # Moving an existing project: the project folder must already be present at the new location
            log.info('Changing root directory of project "{}" from {} to {}'.format(self.id, self.root_dir, path))
            if not op.exists(op.join(path, self.id)):
                raise IOError('Project "{}" does not exist in folder {}'.format(self.id, path))

        self._root_dir = path

        # All project directories derive from the root directory that was just assigned
        project_dirs = [self.base_dir, self.model_dir, self.data_dir, self.sequences_dir,
                        self.sequences_by_gene_dir, self.sequences_by_organism_dir]
        for project_dir in project_dirs:
            ssbio.utils.make_dir(project_dir)

        log.info('{}: project location'.format(self.base_dir))

    @property
    def base_dir(self):
        """str: ATLAS project folder, or None while no root directory is set"""
        if not self.root_dir:
            return None
        return op.join(self.root_dir, self.id)

    @property
    def model_dir(self):
        """str: Directory where strain-specific GEMs are stored, or None while no project folder is set"""
        if not self.base_dir:
            return None
        return op.join(self.base_dir, 'model')

    @property
    def data_dir(self):
        """str: Directory where all data (dataframes and more) will be stored, or None while no project folder is set"""
        if not self.base_dir:
            return None
        return op.join(self.base_dir, 'data')

    @property
    def sequences_dir(self):
        """str: Base directory for genome protein sequences and alignments, or None while no project folder is set"""
        if not self.base_dir:
            return None
        return op.join(self.base_dir, 'sequences')

    @property
    def sequences_by_gene_dir(self):
        """str: Directory where all gene specific information and pairwise alignments are stored, or None while
        no sequences directory is available"""
        if not self.sequences_dir:
            return None
        return op.join(self.sequences_dir, 'by_gene')

    @property
    def sequences_by_organism_dir(self):
        """str: Directory where all strain specific genome and BLAST files are stored, or None while no
        sequences directory is available"""
        if not self.sequences_dir:
            return None
        return op.join(self.sequences_dir, 'by_organism')

    # def _copy_reference_gempro(self, new_id):
    #     """Copy the base strain GEM-PRO into a new GEM-PRO with a specified ID.
    #
    #     Appends the model to the strains attribute.
    #
    #     Args:
    #         new_id (str): New ID to be assigned to the copied model
    #
    #     Returns:
    #         GEMPRO: copied GEM-PRO to represent the new strain
    #
    #     """
    #     logging.disable(logging.WARNING)
    #     if self.reference_gempro.model:
    #         # If there is a SBML model associated with the GEMPRO, copy that model
    #         copied_model = GEMPRO(gem_name=new_id, gem=self._model_to_copy.model.copy())
    #         copied_model.model.id = new_id
    #     else:
    #         # Otherwise, just copy the list of genes over and rename the IDs
    #         strain_genes = [x.id for x in self._model_to_copy.genes]
    #         copied_model = GEMPRO(gem_name=new_id, genes_list=strain_genes)
    #     # Re-enable logging
    #     logging.disable(logging.NOTSET)
    #
    #     self.strains.append(copied_model)
    #     log.debug('{}: new model copied from base model'.format(new_id))
    #
    #     return self.strains.get_by_id(new_id)

    def load_strain(self, strain_id, strain_genome_file):
        """Load a strain as a new GEM-PRO by its ID and associated genome file. Stored in the ``strains`` attribute.

        Args:
            strain_id (str): Strain ID
            strain_genome_file (str): Path to strain genome file

        Returns:
            GEMPRO: The newly loaded strain, as retrieved from the ``strains`` attribute by its ID

        """
        # Silence messages up to WARNING level while the strain GEM-PRO is created
        logging.disable(logging.WARNING)
        try:
            strain_gp = GEMPRO(gem_name=strain_id, genome_path=strain_genome_file)
        finally:
            # Always re-enable logging, even when creating the GEM-PRO fails - otherwise logging would stay
            # disabled for the whole process
            logging.disable(logging.NOTSET)

        self.strains.append(strain_gp)
        return self.strains.get_by_id(strain_id)

    def download_patric_genomes(self, ids, force_rerun=False):
        """Fetch protein coding sequences from PATRIC for the given genome IDs and load each one as a strain.

        Args:
            ids (str, list): PATRIC ID or list of PATRIC IDs
            force_rerun (bool): If genome files should be downloaded again even if they exist

        """
        patric_ids = ssbio.utils.force_list(ids)

        loaded = 0
        log.info('Downloading sequences from PATRIC...')
        for patric_id in tqdm(patric_ids):
            genome_file = ssbio.databases.patric.download_coding_sequences(patric_id=patric_id,
                                                                           seqtype='protein',
                                                                           outdir=self.sequences_by_organism_dir,
                                                                           force_rerun=force_rerun)
            # A falsy result means nothing was downloaded for this ID; warn and move on
            if not genome_file:
                log.warning('{}: unable to download sequence'.format(patric_id))
                continue

            self.load_strain(patric_id, genome_file)
            loaded += 1
            log.debug('{}: downloaded sequence'.format(patric_id))

        log.info('Created {} new strain GEM-PROs, accessible at "strains" attribute'.format(loaded))

    def get_orthology_matrix(self,
                             pid_cutoff=None,
                             bitscore_cutoff=None,
                             evalue_cutoff=None,
                             filter_condition='OR',
                             remove_strains_with_no_orthology=True,
                             remove_strains_with_no_differences=False,
                             remove_genes_not_in_base_model=True):
        """Create the orthology matrix by finding best bidirectional BLAST hits. Genes = rows, strains = columns

        For every strain, runs run_bidirectional_blast against the reference genome and calculate_bbh on the two
        BLAST result files (protein sequences). All best bidirectional hits (BBH) are then combined with
        create_orthology_matrix, which writes a CSV file into ``self.data_dir``. That file is loaded into the
        ``df_orthology_matrix`` attribute and filtered in place with ``_filter_orthology_matrix``.

        Args:
            pid_cutoff (float): Minimum percent identity between BLAST hits to filter for in the range [0, 100]
            bitscore_cutoff (float): Minimum bitscore allowed between BLAST hits
            evalue_cutoff (float): Maximum E-value allowed between BLAST hits
            filter_condition (str): 'OR' or 'AND', how to combine cutoff filters. 'OR' gives more results since it
                is less stringent, e.g. you will be filtering for hits with
                (>80% PID or >30 bitscore or <0.0001 evalue).
            remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found
            remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model.
                Default is False because since orthology is found using a PID cutoff, all genes may be present but
                differences may be on the sequence level.
            remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our
                base model. This happens if we use a genome file for our model that has other genes in it.

        Returns:
            DataFrame: The filtered orthology matrix, i.e. the same object that is stored in the
            ``df_orthology_matrix`` attribute.

        Raises:
            RuntimeError: Raised by ``_filter_orthology_matrix`` if the loaded orthology matrix has no rows.

        """
        # TODO: document and test other cutoffs

        # Get the path to the reference genome
        r_file = self.reference_gempro.genome_path

        # Maps strain ID -> path of the BBH result produced for that strain
        bbh_files = {}

        log.info(
            'Running bidirectional BLAST and finding best bidirectional hits (BBH)...'
        )
        for strain_gempro in tqdm(self.strains):
            g_file = strain_gempro.genome_path

            # Run bidirectional BLAST
            log.debug('{} vs {}: Running bidirectional BLAST'.format(
                self.reference_gempro.id, strain_gempro.id))
            r_vs_g, g_vs_r = ssbio.protein.sequence.utils.blast.run_bidirectional_blast(
                reference=r_file,
                other_genome=g_file,
                dbtype='prot',
                outdir=self.sequences_by_organism_dir)

            # Using the BLAST files, find the BBH
            log.debug('{} vs {}: Finding BBHs'.format(self.reference_gempro.id,
                                                      strain_gempro.id))
            bbh = ssbio.protein.sequence.utils.blast.calculate_bbh(
                blast_results_1=r_vs_g,
                blast_results_2=g_vs_r,
                outdir=self.sequences_by_organism_dir)
            bbh_files[strain_gempro.id] = bbh

        # Make the orthologous genes matrix
        log.info('Creating orthology matrix from BBHs...')
        ortho_matrix = ssbio.protein.sequence.utils.blast.create_orthology_matrix(
            r_name=self.reference_gempro.id,
            genome_to_bbh_files=bbh_files,
            pid_cutoff=pid_cutoff,
            bitscore_cutoff=bitscore_cutoff,
            evalue_cutoff=evalue_cutoff,
            filter_condition=filter_condition,
            outname='{}_{}_orthology.csv'.format(self.reference_gempro.id,
                                                 'prot'),
            outdir=self.data_dir)

        log.info(
            'Saved orthology matrix at {}. See the "df_orthology_matrix" attribute.'
            .format(ortho_matrix))
        self.df_orthology_matrix = pd.read_csv(ortho_matrix, index_col=0)

        # Filter the matrix to genes only in our analysis, and also check for strains with no differences or no orthologous genes
        self._filter_orthology_matrix(
            remove_strains_with_no_orthology=remove_strains_with_no_orthology,
            remove_strains_with_no_differences=
            remove_strains_with_no_differences,
            remove_genes_not_in_base_model=remove_genes_not_in_base_model)

        # Honor the documented contract: hand the filtered matrix back to the caller.
        return self.df_orthology_matrix

    # def load_manual_orthology_matrix(self, df, clean_names=True,
    #                                  remove_strains_with_no_orthology=True,
    #                                  remove_strains_with_no_differences=False,
    #                                  remove_genes_not_in_base_model=True):
    #     """Load a manually curated orthology matrix to use in ATLAS. Genes = rows, strains = columns.
    #
    #     Args:
    #         df (DataFrame): Pandas DataFrame with genes as the rows and strains as the columns
    #         clean_names (bool): Remove unwanted characters from gene names and strain IDs
    #         remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found
    #         remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model.
    #             Default is False because since orthology is found using a PID cutoff, all genes may be present but
    #             differences may be on the sequence level.
    #         remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our
    #             base model. This happens if we use a genome file for our model that has other genes in it.
    #
    #     """
    #     self._orthology_matrix_has_sequences = True
    #
    #     if clean_names:
    #         new_rows = [custom_slugify(x) for x in df.index]
    #         new_cols = [custom_slugify(y) for y in df.columns]
    #         df.index = new_rows
    #         df.columns = new_cols
    #
    #     self.df_orthology_matrix = df
    #
    #     # Make the copies of the base model
    #     for strain_id in tqdm(self.df_orthology_matrix.columns):
    #         self._copy_reference_gempro(new_id=strain_id)
    #
    #     # Filter the strains and orthology matrix
    #     self._filter_orthology_matrix(remove_strains_with_no_orthology=remove_strains_with_no_orthology,
    #                                        remove_strains_with_no_differences=remove_strains_with_no_differences,
    #                                        remove_genes_not_in_base_model=remove_genes_not_in_base_model)

    def _filter_orthology_matrix(self,
                                 remove_strains_with_no_orthology=True,
                                 remove_strains_with_no_differences=False,
                                 remove_genes_not_in_base_model=True):
        """Filters the orthology matrix by removing genes not in our base model, and also
            removes strains from the analysis which have: 0 orthologous genes or no difference from the base strain.

        Args:
            remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found
            remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model.
                Default is False because since orthology is found using a PID cutoff, all genes may be present but
                differences may be on the sequence level.
            remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our
                base model. This happens if we use a genome file for our model that has other genes in it.

        """

        if len(self.df_orthology_matrix) == 0:
            raise RuntimeError('Empty orthology matrix')

        initial_num_strains = len(self.strains)

        # Adding names to the row and column of the orthology matrix
        self.df_orthology_matrix = self.df_orthology_matrix.rename_axis(
            'gene').rename_axis("strain", axis="columns")

        # Gene filtering (of the orthology matrix)
        if remove_genes_not_in_base_model:
            # Check for gene IDs that are in the model and not in the orthology matrix
            # This is probably because: the CDS FASTA file for the base strain did not contain the correct ID
            # for the gene and consequently was not included in the orthology matrix
            # Save these and report them
            reference_strain_gene_ids = [
                x.id for x in self.reference_gempro.genes
            ]
            self.missing_in_orthology_matrix = [
                x for x in reference_strain_gene_ids
                if x not in self.df_orthology_matrix.index.tolist()
            ]
            self.missing_in_reference_strain = [
                y for y in self.df_orthology_matrix.index.tolist()
                if y not in reference_strain_gene_ids
            ]

            # Filter the matrix for genes within our base model only
            self.df_orthology_matrix = self.df_orthology_matrix[
                self.df_orthology_matrix.index.isin(reference_strain_gene_ids)]
            log.info(
                'Filtered orthology matrix for genes present in base model')
            log.warning(
                '{} genes are in your base model but not your orthology matrix, see the attribute "missing_in_orthology_matrix"'
                .format(len(self.missing_in_orthology_matrix)))
            log.warning(
                '{} genes are in the orthology matrix but not your base model, see the attribute "missing_in_reference_strain"'
                .format(len(self.missing_in_reference_strain)))

        # Strain filtering
        for strain_gempro in self.strains.copy():
            if remove_strains_with_no_orthology:
                if strain_gempro.id not in self.df_orthology_matrix.columns:
                    self.strains.remove(strain_gempro)
                    log.info(
                        '{}: no orthologous genes found for this strain, removed from analysis.'
                        .format(strain_gempro.id))
                    continue
                elif self.df_orthology_matrix[strain_gempro.id].isnull().all():
                    self.strains.remove(strain_gempro)
                    log.info(
                        '{}: no orthologous genes found for this strain, removed from analysis.'
                        .format(strain_gempro.id))
                    continue

            if remove_strains_with_no_differences:
                not_in_strain = self.df_orthology_matrix[pd.isnull(
                    self.df_orthology_matrix[strain_gempro.id])][
                        strain_gempro.id].index.tolist()
                if len(not_in_strain) == 0:
                    self.strains.remove(strain_gempro)
                    log.info(
                        '{}: strain has no differences from the base, removed from analysis.'
                    )
                    continue

        log.info('{} strains to be analyzed, {} strains removed'.format(
            len(self.strains), initial_num_strains - len(self.strains)))

    def _pare_down_model(self, strain_gempro, genes_to_remove):
        """Mark genes as non-functional in a GEM-PRO. If there is a COBRApy model associated with it, the
            COBRApy method delete_model_genes is utilized to delete genes.

        Args:
            strain_gempro (GEMPRO): GEMPRO object
            genes_to_remove (list): List of gene IDs to remove from the model

        """
        # Filter out genes in genes_to_remove which do not show up in the model
        strain_genes = [x.id for x in strain_gempro.genes]
        genes_to_remove.extend(self.missing_in_orthology_matrix)
        genes_to_remove = list(
            set(genes_to_remove).intersection(set(strain_genes)))

        if len(genes_to_remove) == 0:
            log.info('{}: no genes marked non-functional'.format(
                strain_gempro.id))
            return
        else:
            log.debug('{}: {} genes to be marked non-functional'.format(
                strain_gempro.id, len(genes_to_remove)))

        # If a COBRApy model exists, utilize the delete_model_genes method
        if strain_gempro.model:
            strain_gempro.model._trimmed = False
            strain_gempro.model._trimmed_genes = []
            strain_gempro.model._trimmed_reactions = {}

            # Delete genes!
            cobra.manipulation.delete_model_genes(strain_gempro.model,
                                                  genes_to_remove)

            if strain_gempro.model._trimmed:
                log.info('{}: marked {} genes as non-functional, '
                         'deactivating {} reactions'.format(
                             strain_gempro.id,
                             len(strain_gempro.model._trimmed_genes),
                             len(strain_gempro.model._trimmed_reactions)))
        # Otherwise, just mark the genes as non-functional
        else:
            for g in genes_to_remove:
                strain_gempro.genes.get_by_id(g).functional = False
            log.info('{}: marked {} genes as non-functional'.format(
                strain_gempro.id, len(genes_to_remove)))

    def _load_strain_sequences(self, strain_gempro):
        """Load strain sequences from the orthology matrix into the base model for comparisons, and into the
        strain-specific model itself.

        """
        if self._orthology_matrix_has_sequences:  # Load directly from the orthology matrix if it contains sequences
            strain_sequences = self.df_orthology_matrix[
                strain_gempro.id].to_dict()
        else:  # Otherwise load from the genome file if the orthology matrix contains gene IDs
            # Load the genome FASTA file
            log.debug('{}: loading strain genome CDS file'.format(
                strain_gempro.genome_path))
            strain_sequences = SeqIO.index(strain_gempro.genome_path, 'fasta')

        for strain_gene in strain_gempro.genes:
            if strain_gene.functional:
                if self._orthology_matrix_has_sequences:
                    strain_gene_key = strain_gene.id
                else:
                    # Pull the gene ID of the strain from the orthology matrix
                    strain_gene_key = self.df_orthology_matrix.loc[
                        strain_gene.id, strain_gempro.id]
                    log.debug(
                        '{}: original gene ID to be pulled from strain fasta file'
                        .format(strain_gene_key))

                # # Load into the base strain for comparisons
                ref_gene = self.reference_gempro.genes.get_by_id(
                    strain_gene.id)
                new_id = '{}_{}'.format(strain_gene.id, strain_gempro.id)
                if ref_gene.protein.sequences.has_id(new_id):
                    log.debug(
                        '{}: sequence already loaded into reference model'.
                        format(new_id))
                    continue
                ref_gene.protein.load_manual_sequence(
                    seq=strain_sequences[strain_gene_key],
                    ident=new_id,
                    set_as_representative=False)
                log.debug(
                    '{}: loaded sequence into reference model'.format(new_id))

                # Load into the strain GEM-PRO
                strain_gene.protein.load_manual_sequence(
                    seq=strain_sequences[strain_gene_key],
                    ident=new_id,
                    set_as_representative=True)
                log.debug(
                    '{}: loaded sequence into strain model'.format(new_id))

    def build_strain_specific_models(self, save_models=False):
        """Using the orthologous genes matrix, create and modify the strain specific models based on if orthologous
            genes exist.

        Also store the sequences directly in the reference GEM-PRO protein sequence attribute for the strains.

        For each strain: the model (or, failing that, the gene list) of ``self._empty_reference_gempro`` is loaded
        into the strain GEM-PRO, genes without an entry in the strain's orthology matrix column are marked
        non-functional, and the strain sequences are loaded.

        Args:
            save_models (bool): If True, write each strain's COBRA model as JSON (when the strain has one) and
                pickle the strain GEM-PRO into ``self.model_dir``

        Raises:
            RuntimeError: If the orthology matrix has no rows.

        """

        if len(self.df_orthology_matrix) == 0:
            raise RuntimeError('Empty orthology matrix')

        for strain_gempro in tqdm(self.strains):
            log.debug('{}: building strain specific model'.format(
                strain_gempro.id))

            # For each genome, load the metabolic model or genes from the reference GEM-PRO.
            # Logging at WARNING and below is silenced while loading; the try/finally guarantees that it is
            # switched back on even if loading raises, since logging.disable acts on the whole process.
            logging.disable(logging.WARNING)
            try:
                if self._empty_reference_gempro.model:
                    strain_gempro.load_cobra_model(
                        self._empty_reference_gempro.model)
                elif self._empty_reference_gempro.genes:
                    strain_gempro.genes = [
                        x.id for x in self._empty_reference_gempro.genes
                    ]
            finally:
                logging.disable(logging.NOTSET)

            # Get a list of genes which do not have orthology in the strain
            strain_column = self.df_orthology_matrix[strain_gempro.id]
            not_in_strain = strain_column[pd.isnull(
                strain_column)].index.tolist()

            # Mark genes non-functional
            self._pare_down_model(strain_gempro=strain_gempro,
                                  genes_to_remove=not_in_strain)

            # Load sequences into the base and strain models
            self._load_strain_sequences(strain_gempro=strain_gempro)

            if save_models:
                # A GEM-PRO built from a plain gene list has no COBRA model to serialize
                if strain_gempro.model:
                    cobra.io.save_json_model(
                        model=strain_gempro.model,
                        filename=op.join(self.model_dir,
                                         '{}.json'.format(strain_gempro.id)))
                strain_gempro.save_pickle(
                    op.join(self.model_dir,
                            '{}_gp.pckl'.format(strain_gempro.id)))

        log.info(
            'Created {} new strain-specific models and loaded in sequences'.
            format(len(self.strains)))

    def align_orthologous_genes_pairwise(self, gapopen=10, gapextend=0.5):
        """Pairwise-align the sequences stored for each reference gene to its representative sequence.

        Genes whose protein holds at most one sequence are skipped. Alignment output for a gene goes into a
        directory named after the gene ID inside ``self.sequences_by_gene_dir``, created if it does not exist.

        Args:
            gapopen (int): Gap open penalty forwarded to the alignment call
            gapextend (float): Gap extension penalty forwarded to the alignment call

        """
        for base_gene in tqdm(self.reference_gempro.genes):
            # Nothing to align against when one or zero sequences are stored
            if len(base_gene.protein.sequences) <= 1:
                continue

            gene_outdir = op.join(self.sequences_by_gene_dir, base_gene.id)
            if not op.exists(gene_outdir):
                os.mkdir(gene_outdir)

            base_gene.protein.pairwise_align_sequences_to_representative(
                gapopen=gapopen,
                gapextend=gapextend,
                outdir=gene_outdir,
                parse=True)

    def align_orthologous_genes_multiple(self):
        """Placeholder for a multiple alignment of each base-strain gene to all orthologous strain genes.

        Not implemented: the method currently does nothing and returns None.
        """
        return None

    def get_atlas_summary_df(self, popular_mutation_cutoff=0.01):
        """Create a single data frame which summarizes all genes per row.

        One row is produced for every gene in ``self.reference_gempro.genes_with_a_representative_sequence``.
        Each row contains: gene/protein counts, representative sequence annotations (``RepSeq_*``), averages over
        the strain alignments (``ATLAS_*``), one column per strain holding the percent identity of that strain's
        alignment, and, when a representative structure with a structure file exists, structure and chain
        annotations (``RepStruct_*``, ``RepChain_*``). Columns that are empty for every gene are dropped.

        Args:
            popular_mutation_cutoff (float): Minimum fraction of a protein's sequence alignments in which a
                mutation (or mutation group) has to occur to be listed in the ``ATLAS_popular_mutations`` and
                ``ATLAS_popular_mutation_groups`` columns. The default of 0.01 corresponds to 1%.

        Returns:
            DataFrame: Pandas DataFrame of the results

        """

        def _indel_totals(indels):
            """Return (number of events, summed length) for a list of insertion or deletion annotations."""
            if not indels:
                return 0, 0
            return len(indels), np.sum([x[1] for x in indels])

        def _mean_or_nan(values):
            """Return the mean of values, or NaN for an empty list without numpy's empty-slice warning."""
            return np.mean(values) if values else np.nan

        all_info = []
        for g in self.reference_gempro.genes_with_a_representative_sequence:
            info = {}
            info['Gene_ID'] = g.id
            info['Gene_name'] = g.name

            # Protein object
            p = g.protein
            info['Protein_sequences'] = len(p.sequences)
            info['Protein_structures'] = len(p.structures)

            # SeqProp
            rseq = p.representative_sequence
            info['RepSeq_ID'] = rseq.id
            info['RepSeq_sequence_length'] = rseq.seq_len
            info['RepSeq_num_sequence_alignments'] = len([
                x for x in p.sequence_alignments
                if x.annotations['ssbio_type'] == 'seqalign'
            ])
            info['RepSeq_num_structure_alignments'] = len([
                x for x in p.sequence_alignments
                if x.annotations['ssbio_type'] == 'structalign'
            ])

            # SeqRecord annotations (properties calculated that summarize the whole sequence)
            for annotation_name, annotation in rseq.annotations.items():
                info['RepSeq_' + annotation_name] = annotation

            # SeqRecord alignment annotations, one entry per strain alignment
            all_num_mutations = []
            all_num_deletions = []
            all_len_deletions = []
            all_num_insertions = []
            all_len_insertions = []
            all_percent_identity = []
            all_percent_similarity = []
            strain_prefix = '{}_'.format(p.id)
            for aln in p.sequence_alignments:
                annotations = aln.annotations

                # Only alignments whose second sequence is named "<protein id>_<strain id>" are strain-specific
                if strain_prefix not in annotations['b_seq']:
                    continue
                strain_id = annotations['b_seq'].split(strain_prefix)[1]
                info[strain_id] = annotations['percent_identity']

                # Gather the percent identities/similarities
                all_percent_identity.append(annotations['percent_identity'])
                all_percent_similarity.append(
                    annotations['percent_similarity'])

                # Gather the number of residues that are mutated (filter for different mutations of same residue)
                all_num_mutations.append(
                    len(set(x[1] for x in annotations['mutations'])))

                # Number of deletions and their total length for this one strain
                num_deletions, total_len_deletions = _indel_totals(
                    annotations['deletions'])
                all_num_deletions.append(num_deletions)
                all_len_deletions.append(total_len_deletions)

                # Number of insertions and their total length for this one strain
                num_insertions, total_len_insertions = _indel_totals(
                    annotations['insertions'])
                all_num_insertions.append(num_insertions)
                all_len_insertions.append(total_len_insertions)

            info['ATLAS_mean_num_mutations'] = _mean_or_nan(all_num_mutations)
            info['ATLAS_mean_num_deletions'] = _mean_or_nan(all_num_deletions)
            info['ATLAS_mean_len_deletions'] = _mean_or_nan(all_len_deletions)
            info['ATLAS_mean_num_insertions'] = _mean_or_nan(
                all_num_insertions)
            info['ATLAS_mean_len_insertions'] = _mean_or_nan(
                all_len_insertions)
            info['ATLAS_mean_percent_identity'] = _mean_or_nan(
                all_percent_identity)
            info['ATLAS_mean_percent_similarity'] = _mean_or_nan(
                all_percent_similarity)

            # Other mutation analysis
            single, fingerprint = p.sequence_mutation_summary()

            # float() keeps the ratio a true division on every Python version; a protein without
            # alignments has no popular mutations (and must not divide by zero).
            num_alignments = float(len(p.sequence_alignments))

            # Mutations that show up in at least `popular_mutation_cutoff` of the alignments
            singles = []
            for k, v in single.items():
                # len(v) is the number of strains which have this mutation
                if num_alignments and len(v) / num_alignments >= popular_mutation_cutoff:
                    singles.append(''.join(str(x) for x in k))
            info['ATLAS_popular_mutations'] = ';'.join(singles)

            # Mutation groups that show up in at least `popular_mutation_cutoff` of the alignments
            allfingerprints = []
            for k, v in fingerprint.items():
                if num_alignments and len(v) / num_alignments >= popular_mutation_cutoff:
                    allfingerprints.append('-'.join(
                        ''.join(str(x) for x in m) for m in k))
            info['ATLAS_popular_mutation_groups'] = ';'.join(allfingerprints)

            # StructProp
            rstruct = p.representative_structure
            if rstruct and rstruct.structure_file:
                info['RepStruct_ID'] = rstruct.id
                info['RepStruct_is_experimental'] = rstruct.is_experimental
                info['RepStruct_description'] = rstruct.description
                info[
                    'RepStruct_repseq_coverage'] = p.representative_chain_seq_coverage

                # ChainProp
                rchain = p.representative_chain
                info['RepChain_ID'] = rchain

                # ChainProp SeqRecord annotations
                rchain_sr = rstruct.chains.get_by_id(rchain).seq_record
                for annotation_name, annotation in rchain_sr.annotations.items(
                ):
                    info['RepChain_' + annotation_name] = annotation

            all_info.append(info)

        cols = [
            'Gene_ID', 'Gene_name', 'Protein_sequences', 'Protein_structures',
            'RepSeq_ID', 'RepSeq_sequence_length',
            'RepSeq_num_sequence_alignments',
            'RepSeq_num_structure_alignments', 'RepStruct_ID', 'RepChain_ID',
            'RepStruct_description', 'RepStruct_is_experimental',
            'RepStruct_repseq_coverage', 'ATLAS_mean_percent_identity',
            'ATLAS_mean_percent_similarity', 'ATLAS_mean_num_mutations',
            'ATLAS_popular_mutations', 'ATLAS_popular_mutation_groups',
            'ATLAS_mean_num_deletions', 'ATLAS_mean_num_insertions',
            'ATLAS_mean_len_deletions', 'ATLAS_mean_len_insertions',
            'RepSeq_aromaticity', 'RepSeq_instability_index',
            'RepSeq_isoelectric_point', 'RepSeq_molecular_weight',
            'RepSeq_monoisotopic', 'RepSeq_num_tm_helix-tmhmm',
            'RepSeq_percent_acidic', 'RepSeq_percent_aliphatic',
            'RepSeq_percent_aromatic', 'RepSeq_percent_B-sspro8',
            'RepSeq_percent_basic', 'RepSeq_percent_buried-accpro',
            'RepSeq_percent_buried-accpro20', 'RepSeq_percent_C-sspro',
            'RepSeq_percent_C-sspro8', 'RepSeq_percent_charged',
            'RepSeq_percent_E-sspro', 'RepSeq_percent_E-sspro8',
            'RepSeq_percent_exposed-accpro', 'RepSeq_percent_exposed-accpro20',
            'RepSeq_percent_G-sspro8', 'RepSeq_percent_H-sspro',
            'RepSeq_percent_H-sspro8', 'RepSeq_percent_helix_naive',
            'RepSeq_percent_I-sspro8', 'RepSeq_percent_non-polar',
            'RepSeq_percent_polar', 'RepSeq_percent_S-sspro8',
            'RepSeq_percent_small', 'RepSeq_percent_strand_naive',
            'RepSeq_percent_T-sspro8', 'RepSeq_percent_tiny',
            'RepSeq_percent_turn_naive', 'RepChain_percent_B-dssp',
            'RepChain_percent_C-dssp', 'RepChain_percent_E-dssp',
            'RepChain_percent_G-dssp', 'RepChain_percent_H-dssp',
            'RepChain_percent_I-dssp', 'RepChain_percent_S-dssp',
            'RepChain_percent_T-dssp', 'RepChain_SSBOND-biopython'
        ]
        # One percent-identity column per strain
        cols.extend([x.id for x in self.strains])

        df_atlas_summary = pd.DataFrame(all_info, columns=cols)
        # Drop columns that don't have anything in them
        df_atlas_summary.dropna(axis=1, how='all', inplace=True)

        return df_atlas_summary

    def get_atlas_per_gene_mutation_df(self, gene_id):
        """Create a single data frame which summarizes a gene and its mutations.

        One row is produced per single-residue mutation reported by the protein's ``sequence_mutation_summary``
        for 'seqalign' alignments. Each row holds the reference and strain residue, the strains carrying the
        mutation, biochemical residue definitions, the Grantham score, the per-residue annotations of the
        representative sequence/structure, and whether the residue takes part in a disulfide bridge.

        Args:
            gene_id (str): Gene ID in the base model

        Returns:
            DataFrame: Pandas DataFrame of the results, indexed by ``ref_resnum`` and sorted by it. An empty
            DataFrame is returned when the gene has no mutations.

        """
        # TODO: also count: number of unique mutations (have to consider position, amino acid change)
        # TODO: keep track of strain with most mutations, least mutations
        # TODO: keep track of strains that conserve the length of the protein, others that extend or truncate it
        # need statistical test for that too (how long is "extended"/"truncated"?)
        # TODO: number of strains with at least 1 mutations
        # TODO: number of strains with <5% mutated, 5-10%, etc

        g = self.reference_gempro.genes.get_by_id(gene_id)

        # Only the per-residue ("single") mutations are used here; the grouped fingerprints are ignored
        single, _ = g.protein.sequence_mutation_summary(
            alignment_type='seqalign')

        appender = []

        for k, strains in single.items():
            # Mutations in the strain; k is (reference residue, residue number, strain residue)
            to_append = {}
            orig_res = k[0]
            resnum = int(k[1])
            mutated_res = k[2]
            num_strains_mutated = len(strains)
            # Sequence IDs look like "<gene id>_<strain id>"; keep only the strain part
            strain_ids = [str(x.split(g.id + '_')[1]) for x in strains]
            to_append['ref_residue'] = orig_res
            to_append['ref_resnum'] = resnum
            to_append['strain_residue'] = mutated_res
            to_append['num_strains_mutated'] = num_strains_mutated
            to_append['strains_mutated'] = ';'.join(strain_ids)
            to_append['at_disulfide_bridge'] = False

            # Residue properties
            origres_props = ssbio.protein.sequence.properties.residues.residue_biochemical_definition(
                orig_res)
            mutres_props = ssbio.protein.sequence.properties.residues.residue_biochemical_definition(
                mutated_res)
            to_append['ref_residue_prop'] = origres_props
            to_append['strain_residue_prop'] = mutres_props

            # Grantham score - score a mutation based on biochemical properties
            grantham_s, grantham_txt = ssbio.protein.sequence.properties.residues.grantham_score(
                orig_res, mutated_res)
            to_append['grantham_score'] = grantham_s
            to_append['grantham_annotation'] = grantham_txt

            # Get all per residue annotations - predicted from sequence and calculated from structure
            to_append.update(
                g.protein.get_residue_annotations(seq_resnum=resnum,
                                                  use_representatives=True))

            # Check structure type
            if g.protein.representative_structure:
                # NOTE: 'structure_type' is recorded here but is not one of the exported columns below
                if g.protein.representative_structure.is_experimental:
                    to_append['structure_type'] = 'EXP'
                else:
                    to_append['structure_type'] = 'HOM'

                # At disulfide bond?
                repchain = g.protein.representative_chain
                repchain_annotations = g.protein.representative_structure.chains.get_by_id(
                    repchain).seq_record.annotations
                if 'SSBOND-biopython' in repchain_annotations:
                    structure_resnum = g.protein.map_seqprop_resnums_to_structprop_resnums(
                        resnums=resnum, use_representatives=True)
                    if resnum in structure_resnum:
                        ssbonds = repchain_annotations['SSBOND-biopython']
                        # Flatten the bonded pairs into one list of residues
                        ssbonds_res = []
                        for x in ssbonds:
                            ssbonds_res.append(x[0])
                            ssbonds_res.append(x[1])

                        # Compare the mapped structure residue, not the whole mapping, with the bonded
                        # residues (comparing the mapping itself could never match).
                        # NOTE(review): assumes the mapping is a dict {sequence resnum: structure resnum}
                        # -- confirm against map_seqprop_resnums_to_structprop_resnums.
                        if structure_resnum[resnum] in ssbonds_res:
                            to_append['at_disulfide_bridge'] = True

            appender.append(to_append)

        if not appender:
            return pd.DataFrame()

        # Every entry needs its own comma: adjacent string literals are silently concatenated,
        # which previously fused 'struct_residue' with a stray 'strains_mutated' and dropped the column.
        cols = [
            'ref_residue', 'ref_resnum', 'strain_residue',
            'num_strains_mutated', 'strains_mutated', 'ref_residue_prop',
            'strain_residue_prop', 'grantham_score', 'grantham_annotation',
            'at_disulfide_bridge', 'seq_SS-sspro', 'seq_SS-sspro8',
            'seq_RSA-accpro', 'seq_RSA-accpro20', 'seq_TM-tmhmm',
            'struct_SS-dssp', 'struct_RSA-dssp', 'struct_ASA-dssp',
            'struct_CA_DEPTH-msms', 'struct_RES_DEPTH-msms', 'struct_PHI-dssp',
            'struct_PSI-dssp', 'struct_resnum', 'struct_residue',
        ]

        df_gene_summary = pd.DataFrame.from_records(appender, columns=cols)

        # Drop columns that don't have anything in them
        df_gene_summary.dropna(axis=1, how='all', inplace=True)

        df_gene_summary.sort_values(by='ref_resnum', inplace=True)
        df_gene_summary = df_gene_summary.set_index('ref_resnum')
        return df_gene_summary

    def download_mutation_images(self):
        """Request a mutation image download for every reference gene that has a representative structure.

        For each such gene a mutation view is created, resized to 300px x 300px, asked to download itself as
        ``<gene id>_<gene name>_mutations.png``, and collected. The collected views are laid out three per row.

        Returns:
            ipywidgets.VBox: Vertical box of horizontal boxes, each holding up to three views

        """
        # TODO: dunno if this works
        import ipywidgets

        per_row = 3
        collected_views = []
        for gene in self.reference_gempro.genes:
            if not gene.protein.representative_structure:
                continue

            mutation_view = gene.protein.view_all_mutations(
                alignment_type='seqalign',
                grouped=False,
                structure_opacity=0.5,
                opacity_range=(0.6, 1),
                scale_range=(.5, 5))
            mutation_view._remote_call("setSize",
                                       target='Widget',
                                       args=['300px', '300px'])
            image_name = '{}_{}_mutations.png'.format(gene.id, gene.name)
            mutation_view.download_image(filename=image_name)
            collected_views.append(mutation_view)

        # Chunk the views into consecutive rows of three
        rows = []
        for start in range(0, len(collected_views), per_row):
            rows.append(
                ipywidgets.HBox(collected_views[start:start + per_row]))

        return ipywidgets.VBox(rows)