Example #1
0
def process_each_frequency_keras(model_dirname, stft, frequency):
    '''
    Run the saved model pipeline for one frequency and store the result in stft.

    Setter method on stft: the slice ``stft[:, :, frequency]`` is normalized
    row by row, passed through the pipeline loaded from
    ``<model_dirname>/k_<frequency>/MODEL_SAVE_FNAME``, rescaled with the same
    row norms and written back into ``stft``. Nothing is returned.

    :param model_dirname: directory holding one ``k_<frequency>``
        sub-directory per frequency.
    :param stft: array indexed as ``stft[:, :, frequency]``; it is modified.
    :param frequency: index of the frequency to process.
    '''
    # 1. Load the fitted model pipeline saved for this frequency
    model_save_fpath = os_path_join(model_dirname, 'k_' + str(frequency),
                                    MODEL_SAVE_FNAME)
    loaded_model_pipeline = joblib_load(model_save_fpath)

    # 2. Get X_test
    LOGGER.debug('r3.process_each_frequency_keras: stft.shape = {}'.format(
        stft.shape))
    aperture_data = stft[:, :, frequency]  # or stft_frequency

    # 2.1. normalize every row by its infinity norm (ord=np_inf, i.e. the
    # largest magnitude found in the row)
    aperture_data_norm = np_linalg_norm(aperture_data, ord=np_inf, axis=1)
    # A row made only of zeros has norm 0 and dividing by it would turn the
    # whole row into NaN. Such rows are divided by 1 instead, so they stay
    # zero; the rescaling in step 4 still uses the true norm (0), which maps
    # their prediction back to zero.
    safe_norm = aperture_data_norm.copy()
    safe_norm[safe_norm == 0] = 1
    aperture_data /= safe_norm[:, np_newaxis]

    X_test = aperture_data

    # 3. Predict
    y_hat = loaded_model_pipeline.predict(X_test)

    # 4. Postprocess on y_hat
    aperture_data_new = y_hat

    # rescale the data and store new data in stft
    stft[:, :,
         frequency] = aperture_data_new * aperture_data_norm[:, np_newaxis]
Example #2
0
    def __init__(self,
                 stepName,
                 isHomoComplex,
                 savedModelsPath=None,
                 averageLRscores=False):
        '''
      Configure the object and load the saved model that belongs to stepName.

      The models directory is extended with the sub-directory "h**o" or "hetero"
      (chosen from isHomoComplex) and every file found there whose name ends with
      stepName is loaded; when several files match, the last one listed is kept.

      :param stepName: str. Suffix that the model file name must end with
      :param isHomoComplex: boolean. Selects the "h**o" (True) or "hetero" (False) sub-directory
      :param savedModelsPath: str. A path to the directory where models have been saved. If None,
                                   the value already present after Configuration.__init__ is used
      :param averageLRscores: stored as self.averageLRscores
      :raises AssertionError: if no file of the directory ends with stepName
    '''
        Configuration.__init__(self)

        self.stepName = stepName
        self.isHomoComplex = isHomoComplex
        self.averageLRscores = averageLRscores
        if savedModelsPath is not None:
            self.savedModelsPath = savedModelsPath

        self.model = None
        print(stepName)

        complexKind = "h**o" if self.isHomoComplex else "hetero"
        self.savedModelsPath = os.path.join(self.savedModelsPath, complexKind)

        for candidate in os.listdir(self.savedModelsPath):
            if not candidate.endswith(stepName):
                continue
            print("Loading model %s %s" % (complexKind, candidate))
            candidatePath = os.path.join(self.savedModelsPath, candidate)
            self.model = joblib_load(candidatePath)

        assert self.model is not None, "Error, there is no valid model in %s for step %s" % (
            self.savedModelsPath, self.stepName)
Example #3
0
    def load(self, dirname):
        """Restore the tree, the per-node models and the per-node encoders.

        Expected layout below ``dirname``:

        * ``tree`` - read with ``load_tree``;
        * ``models/models_fnames.yaml`` - mapping node id -> model file name,
          every file being read with ``load_model``;
        * ``encoders/encoders_fnames.yaml`` - mapping node id -> encoder file
          name, every file being read with ``joblib_load``.

        On success ``self._fitted`` is set to True.

        :param dirname: directory the object was saved to.
        """
        (self.tree, self.node_to_class, self.node_to_classes,
         self.class_maps) = load_tree(join(dirname, 'tree'))

        # Models: the YAML manifest maps each node id to a file name.
        self.models = {}
        models_dirname = join(dirname, 'models')
        with open(join(models_dirname, 'models_fnames.yaml'),
                  'r',
                  encoding='utf-8') as file:
            models_dct = yaml_load(file)
        for node_id, fname in models_dct.items():
            model_path = join(models_dirname, fname)
            self.models[node_id] = load_model(model_path)

        # Encoders: same manifest scheme, files are joblib dumps.
        self.encoders = {}
        encoders_dirname = join(dirname, 'encoders')
        with open(join(encoders_dirname, 'encoders_fnames.yaml'),
                  'r',
                  encoding='utf-8') as file:
            encoders_dct = yaml_load(file)
        for node_id, fname in encoders_dct.items():
            encoder_path = join(encoders_dirname, fname)
            self.encoders[node_id] = joblib_load(encoder_path)

        self._fitted = True
Example #4
0
    def loadModel(cls, model, dependencies_filename, joblib=False):
        """Load a serialized model together with its JSON dependencies.

        :param model: path of the serialized model file.
        :param dependencies_filename: path of the JSON file describing the
            dependencies. An already opened file object (anything with a
            ``read`` method) is accepted as well.
        :param joblib: if True the model is read with ``joblib_load``,
            otherwise with ``load`` on the file opened in binary mode.
        :return: tuple ``(loadedmodel, dependencies)``.
        """
        # json.load needs a file object, so a path has to be opened first.
        if hasattr(dependencies_filename, 'read'):
            dependencies = json.load(dependencies_filename)
        else:
            with open(dependencies_filename, 'r') as dependencies_file:
                dependencies = json.load(dependencies_file)

        if joblib:
            # deserialize by using library joblib
            loadedmodel = joblib_load(model)
        else:
            # deserialize standard Python objects; the context manager makes
            # sure the file handle is closed.
            # NOTE(review): `load` is presumably pickle.load - only use it on
            # trusted files.
            with open(model, 'rb') as model_file:
                loadedmodel = load(model_file)

        return loadedmodel, dependencies
Example #5
0
def getDataForTestFromPrefix(testPrefix, testPath):
  '''
    Load the data file located by findFullTestPPIName(testPrefix, testPath) and return
    all data needed to perform predictions and testing.

    If SAMPLE_TRAIN_EXAMPLES is set, testPrefix[:4].islower() holds and the complex has at
    least MAX_SAMPLING_PAIRS rows whose label is not positive, those rows are randomly
    down-sampled to MAX_SAMPLING_PAIRS. The random seed is derived from the prefix only, so
    that all complexes sharing a prefix are sampled in the same way.

    :param testPrefix:  str. The prefix of the filename to be loaded. E.g. "1A2K"
    :param testPath: str. The path where data files are contained
    :return isSeqStruct, (data_d, data_t, labels, ids)
          isSeqStruct: boolean. True if the loaded object is a ComplexSeqStructCodified
          data_d, data_t: the two values returned by ppiComplex.getData(); either may be None
          labels: the value of ppiComplex.getLabels() (restricted to the sampled rows, if sampled)
          ids: the value of ppiComplex.getIds() (restricted to the sampled rows and re-indexed
               with reset_index(), if sampled)
  '''
  import zlib

  ppiComplex = joblib_load( findFullTestPPIName(testPrefix, testPath))
  isSeqStruct = isinstance(ppiComplex, ComplexSeqStructCodified)
  data_d, data_t = ppiComplex.getData()

  labels= ppiComplex.getLabels()
  ids=  ppiComplex.getIds()
  if SAMPLE_TRAIN_EXAMPLES and testPrefix[:4].islower():

    gc.collect()
    condition= labels>0
    posLabelsIdx= np.where(condition)[0]
    negIdexIdx= np.where(~ condition)[0]

    nToSample= min(MAX_SAMPLING_PAIRS, len(negIdexIdx) )

    if nToSample==MAX_SAMPLING_PAIRS:
      print("Random sampling for %s"%(testPrefix))
      # Ensure that all complexes with the same prefix are equally sampled (required for
      # results average). CRC32 is used instead of hash(): it is identical in every process
      # and always lies in [0, 2**32 - 1], the range accepted by np.random.seed.
      seedKey = testPrefix.split("@")[0].split("#s")[0]
      if not isinstance(seedKey, bytes):
        seedKey = seedKey.encode("utf-8")
      random_state = zlib.crc32(seedKey) & 0xffffffff
      random.seed(random_state)
      np.random.seed(random_state)
      try:
        negIdexIdx= np.random.choice(negIdexIdx, size= nToSample, replace=False)
        selectIdxs= np.concatenate([posLabelsIdx, negIdexIdx])
        selectIdxs= np.array( [  int(idx) for idx in selectIdxs if not np.isnan(idx) and not np.isnan(labels[idx]) ],  dtype=np.int32)
        assert len(selectIdxs)>0, "Error, empty selectIdx"+str(testPrefix)+"\n"+str(ids.head())
        data_d= data_d[selectIdxs,:] if data_d is not None else None
        data_t= data_t[selectIdxs,:] if data_t is not None else None
        labels= labels[selectIdxs]
        ids= ids.iloc[selectIdxs, :].reset_index()
        gc.collect()
      finally:
        # Give the global generators a fresh seed again, even if sampling failed.
        random.seed(None)
        np.random.seed(None)

  return isSeqStruct, (data_d, data_t, labels, ids)
Example #6
0
    def loadPrefixFilesIterator(prefix, filesPath):
        '''
      Lazily load the data files of filesPath whose name, up to its first ".",
      is equal to prefix. Files are visited in sorted name order.
      Works as an iterator

      :param prefix:  str. The text expected before the first "." of the filename. E.g. "1A2K"
      :param filesPath: str. The path where data files are contained
      @yields the object returned by joblib_load for each matching file

    '''
        matchingNames = (name for name in sorted(os.listdir(filesPath))
                         if name.partition(".")[0] == prefix)
        for name in matchingNames:
            fullName = os.path.join(filesPath, name)
            yield joblib_load(fullName)
Example #7
0
    def loadPrefixFile(prefix, filesPath):
        '''
      Load the single data file of filesPath whose name, up to its first ".",
      is equal to prefix.

      :param prefix:  str. The text expected before the first "." of the filename. E.g. "1A2K"
      :param filesPath: str. The path where data files are contained
      :return the object returned by joblib_load for the matching file
      :raises AssertionError: if the number of matching files is not exactly one

    '''
        complexChunks = [
            joblib_load(os.path.join(filesPath, fname))
            for fname in sorted(os.listdir(filesPath))
            if fname.partition(".")[0] == prefix
        ]
        assert len(complexChunks) == 1
        (onlyChunk,) = complexChunks
        return onlyChunk
Example #8
0
def getDataForTestFromPrefix(testPrefix, testPath):
    '''
    Load the first file of testPath (in sorted name order) whose name starts with
    testPrefix and return all data needed to perform predictions and testing.

    :param testPrefix:  str. The prefix of the filename to be loaded. E.g. "1A2K"
    :param testPath: str. The path where data files are contained
    :return (data_d, data_t, labels, ids), or None when no file name starts with testPrefix
          data_d, data_t: the two values returned by ppiComplex.getData()
          labels: the value returned by ppiComplex.getLabels()
          ids: the value returned by ppiComplex.getIds()
  '''
    firstMatch = next((name for name in sorted(os.listdir(testPath))
                       if name.startswith(testPrefix)), None)
    if firstMatch is None:
        return None

    ppiComplex = joblib_load(os.path.join(testPath, firstMatch))
    data_d, data_t = ppiComplex.getData()
    labels = ppiComplex.getLabels()
    ids = ppiComplex.getIds()
    return (data_d, data_t, labels, ids)
0
def deserialize(filename, format=DEFAULT_FORMAT):
    """Read back an object stored in ``filename``.

    ``format`` is a bit mask tested with ``&``. ``JOBLIB_FORMAT`` is checked
    first and handled entirely by ``joblib_load``. Otherwise ``BZIP2_FORMAT``
    selects ``bz2.BZ2File`` instead of ``open`` and the content is decoded as
    ``PICKLE_FORMAT`` or, failing that, as ``YAML_FORMAT``.

    :param filename: path of the file to read.
    :param format: combination of the ``*_FORMAT`` flags.
    :return: the deserialized object.
    :raises RuntimeError: if the file does not exist, or if the library needed
        by the requested format is not available.
    :raises ValueError: if none of the known format flags is set.
    """
    if not os.path.exists(filename):
        raise RuntimeError('File %s does not exist' % filename)
    if format & JOBLIB_FORMAT:
        if not has_joblib:
            raise RuntimeError(
                    'Missing library. Format (JOBLIB_FORMAT) not available.')
        return joblib_load(filename)
    open_fn = bz2.BZ2File if format & BZIP2_FORMAT else open
    with open_fn(filename, 'rb') as f:
        if format & PICKLE_FORMAT:
            # SECURITY: unpickling can execute arbitrary code; only call this
            # on files coming from a trusted source.
            return pickle.load(f)
        elif format & YAML_FORMAT:
            if not has_yaml:
                raise RuntimeError(
                        'Missing library. Format (YAML_FORMAT) not available.')
            # The Loader is named explicitly because recent PyYAML releases
            # no longer accept yaml.load() without one. yaml.Loader is the
            # full loader, so arbitrary Python objects can still be restored.
            # SECURITY: as with pickle, only use it on trusted files.
            return yaml.load(f, Loader=yaml.Loader)
        else:
            raise ValueError('Unknown format value.')
Example #10
0
    def __init__(self, stepName, savedModelsPath=None):
        '''
      Configure the object and load the saved model that belongs to stepName.

      Every file of self.savedModelsPath whose name ends with stepName is loaded;
      when several files match, the last one listed is kept.

      @param stepName: str. Suffix that the model file name must end with
      @param savedModelsPath: str. A path to the directory where models have been saved. If None,
                                   the value already present after Configuration.__init__ is used
      @raise AssertionError: if no file of the directory ends with stepName
    '''
        Configuration.__init__(self)

        self.stepName = stepName
        if savedModelsPath is not None:
            self.savedModelsPath = savedModelsPath

        self.model = None
        print(stepName)

        for candidate in os.listdir(self.savedModelsPath):
            if not candidate.endswith(stepName):
                continue
            print("Loading model %s" % (candidate))
            candidatePath = os.path.join(self.savedModelsPath, candidate)
            self.model = joblib_load(candidatePath)

        assert self.model is not None, "Error, there is no valid model in %s for step %s" % (
            self.savedModelsPath, self.stepName)
Example #11
0
def trainAndTestOneFold(trainData, testPrefixes, trainSubsetN, testPath, outputPath, verbose=False, ncpu=1):
  '''
    Trains and tests one fold

     The classifier is loaded from a cache file in Configuration().tmp if that file exists,
     otherwise it is trained and saved there. Predictions are computed in parallel for the
     prefixes still to be evaluated, previously stored results are loaded, every result is
     evaluated, and finally the cache file is handed to tryToRemove.

     :param trainData: a numpy array for training with first column labels and the others are features
     :param testPrefixes: str[]. A list that contains prefixes for all complexes to be tested
     :param trainSubsetN: int Tuple. The numerical ids of the training split.
     :param testPath: str. Path to a dir where testing data files are stored
     :param outputPath: str. Path to a dir where predictions will be stored. None if results will not be saved
     :param verbose: boolean. Whether or not print to stdout info
     :param ncpu: int. Number of cpu's to use in parallel
     :return (finalResults, modelo)
          finalResults: the second element of every (testPrefix, resultObj) pair that was
                        collected, or [] if nothing was collected
          modelo: the classifier that was loaded or trained; None if that step was skipped
  '''

  # Split the prefixes into those whose result file already exists and those to be predicted.
  testPrefixesNotEvaluated = []
  originalTestPrefixToNewPrefix, __ = getOriginalToActualPrefixs(testPrefixes)
  alreadyComputedPrefixes_and_outnames= []
  for testPrefix in originalTestPrefixToNewPrefix:
    if outputPath is not None:
      outName = getResultsOutname(outputPath, testPrefix, trainSubsetN)
      # NOTE(review): because of "verbose and ...", an existing result file is only reused when
      # verbose is True; with verbose=False the prefix is predicted again. Confirm this is intended.
      if verbose and os.path.isfile(outName):
        print("Complex already computed: %s" % (outName))
        alreadyComputedPrefixes_and_outnames.append(  (testPrefix, outName) )
      else:
        testPrefixesNotEvaluated.append((testPrefix, outName))
    else:
      testPrefixesNotEvaluated.append((testPrefix, None))

  modelo = None

  from Config import Configuration
  conf = Configuration()
  # Cache file name: md5 of the sorted test prefixes followed by the training split ids.
  # NOTE(review): hashlib.md5 is given a str here; on Python 3 it requires bytes.
  modelFname= os.path.join(conf.tmp, hashlib.md5("".join(sorted(testPrefixes))).hexdigest()+str(trainSubsetN)+"bipspi2.pckl")

  resultsForEvaluation_list=[]
  if len(testPrefixesNotEvaluated) > 0 or len(testPrefixes) == 0:
    if verbose:
      print("Testing:", [ x[0] for x in testPrefixesNotEvaluated])
      verboseLevel = 1
    else:
      verboseLevel = 0

    if os.path.exists(modelFname):
      print("Loading classifier")
      modelo= joblib_load(modelFname)
    else:
      print("Training classifier")
      # First column of trainData holds the labels, the remaining columns the features.
      modelo = trainMethod(trainData[:, 1:], trainData[:, 0], verboseLevel=verboseLevel, ncpu=ncpu)
      joblib_save(modelo, modelFname)
    # Drop the local reference to the training data before the parallel prediction step.
    del trainData
    gc.collect()
    if verbose: print("Classifier fitted.")
    
    # Limit the number of workers by ncpu, by free memory / expected size and by the amount of work.
    # NOTE(review): expectedSize is used as a divisor; presumably it is never 0 - verify.
    expectedSize= estimateRequiredMemoryPerComplex(testPrefixesNotEvaluated, testPath)
    freeMem= checkFreeMemory()
    nJobs= int(max(1, min(ncpu, freeMem/expectedSize, len(testPrefixesNotEvaluated))))
    print("Free memory for predictOnePrefix: %s GB. Njobs: %s (%s expected size)"%(freeMem, nJobs, expectedSize))

    resultsForEvaluation_list= Parallel(n_jobs=nJobs)(delayed(predictOnePrefix)(originalTestPrefixToNewPrefix[testPrefix],
                                                                      modelo, outName, testPath)
                                      for testPrefix, outName in testPrefixesNotEvaluated )
    gc.collect()

  # Load the results that were already stored on disk (this runs even when the list is empty).
  expectedSize= estimateRequiredMemoryPerComplex(alreadyComputedPrefixes_and_outnames, testPath)
  freeMem= checkFreeMemory()
  nJobs= int(max(1, min(ncpu, freeMem/expectedSize, len(alreadyComputedPrefixes_and_outnames))))     
  resultsForEvaluation_list+= Parallel(n_jobs=nJobs)(delayed(loadExistingResults)( testPrefix, outName,)
                                    for testPrefix, outName in alreadyComputedPrefixes_and_outnames )
    
  if len(resultsForEvaluation_list)>0:
    freeMem = checkFreeMemory()
    totMem= getTotalMemory()
    usedMem= totMem-freeMem
    # Memory per job is estimated as the used memory divided by (1 + number of results).
    nJobs = int(max(1, min(ncpu, freeMem / (usedMem/(1+len(resultsForEvaluation_list))))))
    print("Free memory for evaluateOneResultObj: %s GB. Njobs: %s" % (freeMem, nJobs))
    Parallel(n_jobs=nJobs)(delayed(evaluateOneResultObj)(testPrefix, resultObj, False)
                           for testPrefix, resultObj in resultsForEvaluation_list)
    # Keep only the result objects (second element of every pair).
    # NOTE(review): indexing the value of zip() works on Python 2 only; on Python 3 zip() is an iterator.
    finalResults= zip(*resultsForEvaluation_list)[1]
  else:
    finalResults=[]
  del resultsForEvaluation_list
  tryToRemove(modelFname)
  return finalResults, modelo
Example #12
0
def load_model_pipeline():
    """Load and return the object stored in "model/model_v1.joblib"."""
    pipeline = joblib_load("model/model_v1.joblib")
    return pipeline