import os
import re
import numpy as np
from sklearn.metrics import roc_curve

from protfun.models import get_hidden_activations, get_best_params
from protfun.utils import save_pickle, load_pickle
from protfun.visualizer.molview import MoleculeView
from protfun.visualizer.progressview import ProgressView
from protfun.visualizer.roc_view import ROCView, micro_macro_roc
from protfun.utils.log import get_logger

log = get_logger("experiment_visualizer")


def create_history_plots(config, model_name, checkpoint=None, until=None):
    """
    Creates training history diagrams for a desired model that has already been trained.

    :param config: a config dictionary, containing the contents of the config.yaml for the trained
        model. You can load it from file with protfun.config.get_config(file_path)
    :param model_name: name (model id) of the model to create diagrams for. Corresponds to the name
        of the model directory under <data_dir>/models
    :param checkpoint: (optional) mini-batch at which a vertical line is drawn to visualize
        when the model was check-pointed
    :param until: (optional) restrict the number of mini-batches shown in the progress diagram
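
    Usage (a minimal sketch; the config path, model name and checkpoint are placeholders)::
        >>> from protfun.config import get_config
        >>> config = get_config("data/models/my_model/config.yaml")
        >>> create_history_plots(config, model_name="my_model", checkpoint=5000)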
    """
    model_dir = os.path.join(config["data"]["dir"], "models", model_name)

    history_files = [
        f for f in os.listdir(model_dir) if f.startswith("train_history_ep")
    ]

Example #2

import os
from protfun.utils.log import get_logger

log = get_logger("protein_fetcher")


class EnzymeFetcher(object):
    """
    EnzymeFetcher queries PDB IDs for enzymes from the EC2PDB data set, extracting them
    from the EC2PDB website based on the desired EC categories.
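
    Usage (a minimal sketch; the EC categories and directory are placeholders)::
        >>> fetcher = EnzymeFetcher(categories=["3.4.21", "3.4.24"],
        >>>                         excluded_categories=["3.4.21.1"],
        >>>                         enzyme_dir="data/enzymes")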
    """
    def __init__(self,
                 categories,
                 excluded_categories=(),  # immutable default avoids the mutable-default pitfall
                 enzyme_dir=None):
        """
        :param categories: which enzyme categories to download
        :param excluded_categories: which enzyme categories to exclude
        :param enzyme_dir: where to download the enzymes
        """
        self.enzyme_dir = enzyme_dir
        self.excluded_categories = excluded_categories
        self.leaf_categories = list()

        log.info("Evaluating the total categorical hierarchy...")
        for cat in set(categories) - set(excluded_categories):
            self._find_leaf_categories(cat)

        self.fetched_prot_codes = dict()

    def _find_leaf_categories(self, cat):

Example #3

import os
import ntpath
import re
from glob import glob
import itertools

from protfun.utils.log import get_logger

log = get_logger("validations")


class EnzymeValidator(object):
    """
    EnzymeValidator validates the correctness and completeness of the essential data
    management steps, e.g. downloading and splitting. This should help with finding bugs.
    """
    def __init__(self, enz_classes=None, dirs=None):
        self.enzyme_classes = enz_classes
        self.dirs = dirs

    def check_naming(self, classes):
        """
        Checks whether the listed EC classes comply with the naming convention, e.g. 1.1.1.1.

        :param classes: a list of the EC classes
        :return: True if all classes comply with the convention, False otherwise
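
        Usage (a minimal sketch)::
            >>> validator = EnzymeValidator()
            >>> validator.check_naming(["1.1.1.1", "3.4.21"])
            True
            >>> validator.check_naming(["1.1.x"])
            False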
        """
        return all(not re.search(r'[^0-9.]', cls) for cls in classes)

Example #4

import numpy as np
import pickle
import os
import matplotlib

matplotlib.use('Agg')
import seaborn as sns

from protfun.utils.log import get_logger

log = get_logger("progress_view")

sns.set_style("whitegrid")
colors = ['#1b9e77', '#d95f02', '#7570b3', '#e7298a']
sns.set_palette(colors)

text = {
    'titles': {
        'loss': 'Loss progression during training',
        'accuracy': 'Accuracy progression during training',
        'per_class_accs': 'Accuracy progression per class during training'
    },
    'y_labels': {
        'loss': 'Loss',
        'accuracy': 'Accuracy',
        'per_class_accs': 'Accuracy'
    }
}


class ProgressView(object):

Example #5

import shutil
import abc
import numpy as np
import os

import protfun.data_management.preprocess as prep
from protfun.data_management.label_factory import LabelFactory
from protfun.data_management.validation import EnzymeValidator
from protfun.utils import save_pickle, load_pickle, construct_hierarchical_tree
from protfun.utils.log import get_logger

log = get_logger("data_manager")


class DataManager(object):
    """
    DataManager is a parent class for EnzymeDataManager; it stores all data directories and
    implements a *naive* split strategy described below.
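
    Usage (a sketch, assuming EnzymeDataManager forwards these constructor arguments)::
        >>> manager = EnzymeDataManager(data_dir="data", force_download=False,
        >>>                             percentage_test=10, percentage_val=20)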
    """
    __metaclass__ = abc.ABCMeta

    def __init__(self, data_dir,
                 force_download=False, force_process=False, force_split=False,
                 percentage_test=10, percentage_val=20):
        """
        :param data_dir: the path to the root data directory
        :param force_download: forces the downloading of the enzymes
        :param force_process: forces the pre-processing steps
        :param force_split: forces the splitting of the data into training, validation and test sets
        :param percentage_test: the portion in % of the test data
        :param percentage_val: the portion in % of the validation data
        """

Example #6

import numpy as np
import os

from protfun.utils.log import get_logger

log = get_logger("molview")


class MoleculeView(object):
    """
    MoleculeView visualizes the generated 3D input maps of electron density and electrostatic potential
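
    Usage (a minimal sketch with a random dummy grid; the PDB code is a placeholder)::
        >>> grid = np.random.rand(64, 64, 64)
        >>> viewer = MoleculeView(data_dir="data",
        >>>                       data={"density": grid, "potential": grid},
        >>>                       info={"name": "1A2B"})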
    """
    def __init__(self, data_dir, data=None, info=None):
        """

     Parameters:
        - data :
        - info : a dictionary with keys "id", "name" (and more).
        :param data_dir: a directory where the figures should be stored
        :param data: a dictionary with keys "density" and "potential" containing 3d numpy arrays
        with the molecule's electron density and electron potential distribution.
        :param info: additional info to be printed in the title/legend such as the molecule PDB code
        """
        self.data_dir = data_dir
        self.figures_dir = os.path.join(self.data_dir, "figures")
        if not os.path.exists(self.figures_dir):
            os.makedirs(self.figures_dir)

        if info is not None:
            self.molecule_name = info["name"]

Example #7

import lasagne
import numpy as np
import theano
import theano.tensor.nlinalg
import theano.tensor as T

from protfun.visualizer.molview import MoleculeView
from protfun.utils.log import get_logger

log = get_logger("molmap_layer")

floatX = theano.config.floatX
intX = np.int32


class MoleculeMapLayer(lasagne.layers.MergeLayer):
    """
    This is a Lasagne layer that calculates 3D grid maps of molecules (electron density
    estimated from van der Waals radii) using Theano, i.e. on the GPU.

    Usage::
        >>> from lasagne.layers import InputLayer
        >>> minibatch_size = 8
        >>> dummy_coords_input = InputLayer(shape=(minibatch_size, None, None))
        >>> dummy_vdwradii_input = InputLayer(shape=(minibatch_size, None))
        >>> dummy_natoms_input = InputLayer(shape=(minibatch_size,))
        >>> molmap_layer = MoleculeMapLayer(
        >>>    incomings=[dummy_coords_input, dummy_vdwradii_input, dummy_natoms_input],
        >>>    minibatch_size=minibatch_size, rotate=True)

    """

Example #8

import cPickle
import os

from protfun.utils.log import get_logger

log = get_logger("data_utils")


def save_pickle(file_path, data):
    """
    Saves a pickle with the provided data.

    Usage::
        >>> # single save
        >>> train_prot_codes = dict()
        >>> save_pickle("data/train_prot_codes.pickle", train_prot_codes)

        >>> # multi save
        >>> train_prot_codes, test_prot_codes = dict(), dict()
        >>> save_pickle(["data/train_prot_codes.pickle", "data/test_prot_codes.pickle"],
        >>>             [train_prot_codes, test_prot_codes])

    :param file_path: path (or paths) of the file(s) to be saved
    :param data: data object (or list of data objects) that will be saved
    :raises: ValueError if the number of paths and data objects do not match
    """
    if isinstance(data, list) and isinstance(file_path, list):
        if len(data) == len(file_path):
            for path, dat in zip(file_path, data):
                with open(path, 'wb') as f:
                    cPickle.dump(dat, f)
        else:
            raise ValueError("Number of file paths does not match the number of data objects.")
    else:
        # single save: one path, one data object
        with open(file_path, 'wb') as f:
            cPickle.dump(data, f)

Example #9

import numpy as np
import theano
import theano.tensor as T
import lasagne

from protfun.layers.grid_rotate_layer import GridRotationLayer
from protfun.utils.log import get_logger

log = get_logger("joint_class_model")
floatX = theano.config.floatX
intX = np.int32


class JointClassModel(object):
    """
    Abstract class, not meant to be instantiated.

    JointClassModel is the standard generic multi-class classifier model, which uses a single
    softmax in its output layer. Each sample can thus be a member of only one class.
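
    A sketch of such an output layer in Lasagne (`last_hidden` is a hypothetical incoming layer)::
        >>> from lasagne.layers import DenseLayer
        >>> from lasagne.nonlinearities import softmax
        >>> output = DenseLayer(incoming=last_hidden, num_units=n_classes,
        >>>                     nonlinearity=softmax)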
    """

    def __init__(self, name, n_classes, learning_rate):
        """
        :param name: name of the model, used by external mechanisms for saving training history etc.
        :param n_classes: total number of different classes for the classification.
        :param learning_rate: initial learning rate
        """
        self.name = name
        self.n_classes = n_classes
        self.learning_rate = learning_rate

Example #10

import abc
import numpy as np
import theano
from os import path

from protfun.utils import construct_hierarchical_tree
from protfun.utils.log import get_logger

log = get_logger("data_feed")
floatX = theano.config.floatX
intX = np.int32


class DataFeeder(object):
    """
     DataFeeder is an abstract class (not meant to be instantiated).
     All data feeders implement iterate_{train, test, val}_data() and
     get_{train, test, val}_data() methods. The iterate methods are
     mini-batch generators (you can use for loops on them to get
     mini-batches), whereas the get_ methods return the whole data sets.

     Thus, the data feeders are meant to be used during training / testing
     of models to provide the data that must be fed into them.

     Usage::
        >>> dummy_feeder = DataFeeder(...)
        >>> for train_minibatch in dummy_feeder.iterate_train_data():
        >>>     # do something to the minibatch, e.g. feed forward into
        >>>     # your model

     """

Example #11

import numpy as np
import os
import lasagne
import cPickle

from protfun.utils.log import get_logger

log = get_logger("model_monitor")


class ModelMonitor(object):
    """
    Monitors the model during training and testing. Logs the error and accuracy values and can
    create checkpoints of the model parameters (triggered in the ModelTrainer whenever the
    mean validation error improves).

    Optionally dumps the model status on KeyboardInterrupt.
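
    Usage (a minimal sketch; `output_layer` is a hypothetical Lasagne output layer)::
        >>> monitor = ModelMonitor(outputs=[output_layer],
        >>>                        data_dir="data", name="my_model")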
    """

    def __init__(self, outputs, data_dir, name):
        """
        :param outputs: lasagne output layers of the neural network of the monitored model.
            Used to checkpoint the model parameters during training.
        :param data_dir: data directory under which the monitor will create a folder for the
            currently monitored model (or use an existing one, if already present). The path is:
            data_dir/models/<model_name>
        :param name: name of the currently monitored model
        """
        self.network_outputs = outputs
        self.name = name
        self.path_to_model_dir = os.path.join(data_dir, "models", self.name)

Example #12

import numpy as np
import theano
import lasagne
from theano import tensor as T

from protfun.utils.log import get_logger

log = get_logger("grid_rotate_layer")
floatX = theano.config.floatX


class GridRotationLayer(lasagne.layers.Layer):
    """
    GridRotationLayer is a dynamic 3D augmentation layer that can be used in the beginning of
    any neural network. It performs random rotations and (small) translations in 3D space on the
    fly.

    Usage::
        >>> from lasagne.layers import InputLayer
        >>> minibatch_size = 8
        >>> n_channels = 2
        >>> side = 32
        >>> input_grid = T.tensor5('grids')  # symbolic 5D input (batch, channel, x, y, z)
        >>> input_layer = InputLayer(shape=(minibatch_size, n_channels, side, side, side),
        >>>                          input_var=input_grid)
        >>> # apply the rotation layer
        >>> rotation_layer = GridRotationLayer(incoming=input_layer, grid_side=side,
        >>>                                    n_channels=n_channels, interpolation='linear')
    """
    min_dist_from_border = 5

    def __init__(self,

Example #13

import re
import numpy as np

from protfun.utils import save_pickle
from protfun.config import save_config
from protfun.data_management.data_feed import EnzymesGridFeeder
from protfun.data_management.data_manager import EnzymeDataManager
from protfun.models import GridsDisjointClassifier
from protfun.models.model_monitor import ModelMonitor
from protfun.networks import get_network
from protfun.utils.np_utils import pp_array
from protfun.visualizer.netview import NetworkView
from protfun.visualizer.progressview import ProgressView
from protfun.utils.log import get_logger

log = get_logger("model_trainer")


class ModelTrainer(object):
    """
    ModelTrainer is responsible for the training & testing of a model. It supervises the training
    procedure, saves information about the training into files and can also validate & test
    a trained model in the end.

    Both the model to be trained and the data feeder are passed to the constructor. During each
    training iteration, the trainer fetches mini-batches from the data feeder and forwards them
    to the model under training.

    Usage::
        >>> model = GridsDisjointClassifier(...)
        >>> feeder = EnzymesGridFeeder(...)
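        >>> # hypothetical constructor call; the exact keyword names may differ
        >>> trainer = ModelTrainer(model=model, data_feeder=feeder)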
    """

Example #14

import abc
import StringIO
import numpy as np
import theano
import lasagne
import cPickle
import itertools

import prody as pd
import rdkit.Chem as Chem
import rdkit.Chem.rdPartialCharges as rdPC
import rdkit.Chem.rdMolTransforms as rdMT
import rdkit.Chem.rdmolops as rdMO

from protfun.layers import MoleculeMapLayer
from protfun.utils.log import get_logger

log = get_logger("preprocessor")
floatX = theano.config.floatX
intX = np.int32
# number of sidechain channels (20 amino, all, nonhydro, hydro, backbone)
CNS = 24


class DataProcessor(object):
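    """
    Abstract parent class for data pre-processors; concrete subclasses implement process()
    to transform the raw data in from_dir and store the result in target_dir.
    """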
    __metaclass__ = abc.ABCMeta

    def __init__(self, from_dir, target_dir):
        self.from_dir = from_dir
        self.target_dir = target_dir

    @abc.abstractmethod
    def process(self):