def testSeqidLabel(self):
    """Check ``refineMSA`` when both ``label`` and ``seqid`` are given.

    The expected array is the label-only refinement, masked by
    ``uniqueSequences`` with the labeled sequence's entry forced to True.
    """
    cutoff = 0.50
    target = "FSHB_BOVIN"

    by_label = refineMSA(FASTA, label=target)
    keep = uniqueSequences(by_label, cutoff)
    # Force the labeled sequence's row to be kept regardless of the mask.
    keep[FASTA.getIndex(target)] = True

    result = refineMSA(FASTA, label=target, seqid=cutoff)
    observed = result._getArray()
    expected = by_label._getArray()[keep]
    assert_array_equal(observed, expected)
def testSeqid(self):
    """Check that ``seqid`` refinement keeps the rows selected by ``uniqueSequences``.

    The expected array is ``FASTA``'s character array indexed with the mask
    returned by ``uniqueSequences`` for the same identity cutoff.
    """
    seqid = 0.50
    unique = uniqueSequences(FASTA, seqid)
    refined = refineMSA(FASTA, seqid=seqid)
    assert_array_equal(refined._getArray(), FASTA._getArray()[unique])
def testSeqid(self):
    """Check ``refineMSA`` with only ``seqid`` against ``uniqueSequences``.

    Rows of ``FASTA`` selected by the ``uniqueSequences`` mask must equal the
    array of the refined alignment.
    """
    seqid = 0.50
    unique = uniqueSequences(FASTA, seqid)
    refined = refineMSA(FASTA, seqid=seqid)
    expected = FASTA._getArray()[unique]
    assert_array_equal(refined._getArray(), expected)
def testLabel(self):
    """Check ``refineMSA`` with only ``label``.

    The expected array keeps the columns of ``FASTA`` that are nonzero in the
    labeled sequence's row of ``FASTA_ALPHA``.
    """
    target = 'FSHB_BOVIN'
    row = FASTA.getIndex(target)
    observed = refineMSA(FASTA, label=target)._getArray()

    columns = FASTA_ALPHA[row].nonzero()[0]
    expected = FASTA._getArray().take(columns, 1)
    assert_array_equal(observed, expected)
def testLabel(self):
    """Check that refining by ``label`` selects the labeled row's nonzero columns.

    Columns are taken from ``FASTA`` wherever the labeled sequence's row of
    ``FASTA_ALPHA`` is nonzero.
    """
    name = "FSHB_BOVIN"
    position = FASTA.getIndex(name)
    got = refineMSA(FASTA, label=name)._getArray()

    kept_columns = FASTA_ALPHA[position].nonzero()[0]
    want = FASTA._getArray().take(kept_columns, 1)
    assert_array_equal(got, want)
def testRowCol(self):
    """Check ``refineMSA`` with ``rowocc`` and ``colocc`` given together.

    The expected array is built by filtering rows first and then filtering
    columns of the row-filtered array.
    """
    row_cut = 0.9
    col_cut = 1.0
    observed = refineMSA(FASTA, rowocc=row_cut, colocc=col_cut)._getArray()

    # NOTE(review): 112.0 is presumably the number of alignment columns - confirm.
    row_mask = (FASTA_ALPHA.sum(1) / 112.0) >= row_cut
    expected = FASTA._getArray()[row_mask]

    # Column occupancy is measured on the rows that survived the first step.
    alpha_per_column = char.isalpha(expected).sum(0, dtype=float)
    col_mask = (alpha_per_column / expected.shape[0]) >= col_cut
    expected = expected.take(col_mask.nonzero()[0], 1)

    assert_array_equal(observed, expected)
def testRowCol(self):
    """Check combined row- and column-occupancy refinement.

    Rows are selected from ``FASTA`` first; column occupancy is then computed
    on the remaining rows only.
    """
    min_row_occ, min_col_occ = 0.9, 1.0
    got = refineMSA(FASTA, rowocc=min_row_occ, colocc=min_col_occ)._getArray()

    # NOTE(review): 112. looks like the alignment length - verify against FASTA.
    keep_rows = FASTA_ALPHA.sum(1) / 112. >= min_row_occ
    want = FASTA._getArray()[keep_rows]

    letters = char.isalpha(want)
    keep_cols = letters.sum(0, dtype=float) / want.shape[0] >= min_col_occ
    column_indices = keep_cols.nonzero()[0]
    want = want.take(column_indices, 1)

    assert_array_equal(got, want)
def evol_refine(msa, **kwargs):
    """Refine the MSA stored in file *msa* and write the result to a file.

    :arg msa: path of the MSA file, passed to ``parseMSA``
    :arg kwargs: forwarded unchanged to both ``refineMSA`` and ``writeMSA``;
        the optional ``outname`` entry sets the output filename

    When ``outname`` is not given, the output name is derived from *msa* by
    inserting ``_refined`` before the extension.  For gzipped input the suffix
    goes before the format extension as well, e.g. ``x.fasta.gz`` becomes
    ``x_refined.fasta.gz``.
    """
    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get('outname')
    if outname is None:
        root, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Split the already-stripped root (not *msa* again) so that the
            # format extension stays directly in front of ".gz".
            root, format_ext = splitext(root)
            ext = format_ext + ext
        outname = root + '_refined' + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info('Refined MSA is written in file: ' + outname)
def evol_refine(msa, **kwargs):
    """Refine the MSA stored in file *msa* and write the result to a file.

    :arg msa: path of the MSA file, passed to ``parseMSA``
    :arg kwargs: forwarded unchanged to both ``refineMSA`` and ``writeMSA``;
        the optional ``outname`` entry sets the output filename

    When ``outname`` is not given, the output name is derived from *msa* by
    inserting ``_refined`` before the extension.  For gzipped input the suffix
    goes before the format extension as well, e.g. ``x.fasta.gz`` becomes
    ``x_refined.fasta.gz``.
    """
    # The bare ``import prody`` was unused; the from-import loads the package.
    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get('outname')
    if outname is None:
        stem, ext = splitext(msa)
        if ext.lower() == '.gz':
            # Split the already-stripped stem (not *msa* again) so that the
            # format extension stays directly in front of ".gz".
            stem, inner_ext = splitext(stem)
            ext = inner_ext + ext
        outname = stem + '_refined' + ext

    refined = refineMSA(parseMSA(msa), **kwargs)
    writeMSA(outname, refined, **kwargs)
    LOGGER.info('Refined MSA is written in file: ' + outname)
def evol_refine(msa, **kwargs):
    """Refine the MSA stored in file *msa* and write the result to a file.

    :arg msa: path of the MSA file, passed to ``parseMSA``
    :arg kwargs: forwarded unchanged to both ``refineMSA`` and ``writeMSA``;
        the optional ``outname`` entry sets the output filename

    When ``outname`` is not given, the output name is derived from *msa* by
    inserting ``_refined`` before the extension.  For gzipped input the suffix
    goes before the format extension as well, e.g. ``x.fasta.gz`` becomes
    ``x_refined.fasta.gz``.
    """
    # The bare ``import prody`` was unused; the from-import loads the package.
    from prody import parseMSA, refineMSA, writeMSA, LOGGER
    from os.path import splitext

    outname = kwargs.get("outname")
    if outname is None:
        base, ext = splitext(msa)
        if ext.lower() == ".gz":
            # Split the already-stripped base (not *msa* again) so that the
            # format extension stays directly in front of ".gz".
            base, format_ext = splitext(base)
            ext = format_ext + ext
        outname = base + "_refined" + ext

    writeMSA(outname, refineMSA(parseMSA(msa), **kwargs), **kwargs)
    LOGGER.info("Refined MSA is written in file: " + outname)
def testAll(self):
    """Check ``refineMSA`` with ``label``, ``seqid``, ``rowocc`` and ``colocc``.

    The expected array is built step by step: columns selected through the
    labeled row of ``FASTA_ALPHA``, then the ``uniqueSequences`` mask, then
    the row-occupancy filter, then the column-occupancy filter.
    """
    target = "FSHB_BOVIN"
    row_cut = 0.9
    col_cut = 0.9
    identity_cut = 0.98

    observed = refineMSA(FASTA, label=target, seqid=identity_cut,
                         rowocc=row_cut, colocc=col_cut)

    label_row = FASTA.getIndex(target)
    label_columns = FASTA_ALPHA[label_row].nonzero()[0]
    expected = FASTA._getArray().take(label_columns, 1)

    expected = expected[uniqueSequences(expected, identity_cut)]

    row_occupancy = calcMSAOccupancy(expected, "row")
    expected = expected[row_occupancy >= row_cut]

    col_occupancy = calcMSAOccupancy(expected)
    occupied_columns = (col_occupancy >= col_cut).nonzero()[0]
    expected = expected.take(occupied_columns, 1)

    assert_array_equal(observed._getArray(), expected)
def testAll(self):
    """Check ``refineMSA`` when every refinement option is supplied at once.

    The reference result applies, in order: column selection through the
    labeled row of ``FASTA_ALPHA``, the ``uniqueSequences`` mask, the row
    occupancy cutoff and finally the column occupancy cutoff.
    """
    name = 'FSHB_BOVIN'
    min_row_occ, min_col_occ, max_identity = 0.9, 0.9, 0.98

    got = refineMSA(FASTA, label=name, seqid=max_identity,
                    rowocc=min_row_occ, colocc=min_col_occ)

    columns = FASTA_ALPHA[FASTA.getIndex(name)].nonzero()[0]
    want = FASTA._getArray().take(columns, 1)
    want = want[uniqueSequences(want, max_identity)]
    want = want[calcMSAOccupancy(want, 'row') >= min_row_occ]

    columns = (calcMSAOccupancy(want) >= min_col_occ).nonzero()[0]
    want = want.take(columns, 1)

    assert_array_equal(got._getArray(), want)
def calcEvolProperties(self, resid='all', refresh=False, folder=None, max_cols=None, max_seqs=25000, **kwargs):
    """Compute evolutionary properties from Pfam multiple sequence alignments.

    For every selected Pfam family the entry ``self.Pfam[family]`` is updated
    in place with the keys ``'mapping'``, ``'ref_MSA'``, ``'entropy'`` (from
    ``calcShannonEntropy``) and ``'MutInfo'`` (from ``buildMutinfoMatrix``).
    ``'DirInfo'`` is reset to NaN and is not computed here.

    :arg resid: ``'all'`` to process every family in ``self.Pfam``, otherwise
        a residue number; only families with a location segment whose
        ``start``/``end`` range contains it are processed
    :arg refresh: passed to ``self._searchPfam``; when True, families whose
        ``'mapping'`` is already set are recomputed instead of skipped
    :arg folder: defaults to the ``'rhapsody_local_folder'`` setting; the
        value is not used further in this method
    :arg max_cols: not used in this method
    :arg max_seqs: row-occupancy refinement is repeated with an increasing
        cutoff until the MSA has at most this many sequences
    :arg kwargs: forwarded to both ``parseMSA`` and the final ``refineMSA``
    :returns: dict mapping each selected family to its ``self.Pfam`` entry
    :raises RuntimeError: if *resid* is not ``'all'`` and no family contains it

    Failures while fetching, parsing or mapping an MSA are logged and stored
    as a string in ``'mapping'``; failures while refining or computing the
    properties are only logged.
    """
    assert type(refresh) is bool
    # recover Pfam mapping (if not found already)
    self._searchPfam(refresh=refresh)
    if resid == 'all':
        # snapshot of the keys, independent of later changes to self.Pfam
        PF_list = list(self.Pfam)
    else:
        # get list of Pfam domains containing resid
        PF_list = [
            k for k in self.Pfam
            if any(int(segment['start']) <= resid <= int(segment['end'])
                   for segment in self.Pfam[k]['locations'])
        ]
        if not PF_list:
            raise RuntimeError(
                'No Pfam domain for resid {}.'.format(resid))
        if len(PF_list) > 1:
            LOGGER.warn('Residue {} is found in multiple '.format(resid) +
                        '({}) Pfam domains.'.format(len(PF_list)))
    if folder is None:
        # NOTE(review): 'folder' is resolved but not read afterwards here.
        folder = SETTINGS.get('rhapsody_local_folder', './')
    # iterate over Pfam families
    for PF in PF_list:
        d = self.Pfam[PF]
        # skip if properties are pre-computed
        if not refresh and d.get('mapping') is not None:
            continue
        d['mapping'] = None
        d['ref_MSA'] = None
        d['entropy'] = np.nan
        d['MutInfo'] = np.nan
        d['DirInfo'] = np.nan
        try:
            LOGGER.info('Processing {}...'.format(PF))
            # fetch & parse MSA without saving downloaded MSA
            f = fetchPfamMSA(PF)
            try:
                msa = parseMSA(f, **kwargs)
            finally:
                # remove the downloaded file even when parsing fails
                os.remove(f)
            # slice MSA to match all segments of the Uniprot sequence
            sliced_msa, indexes = self._sliceMSA(msa)
            # get mapping between Uniprot sequence and Pfam domain
            d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
        except Exception as e:
            LOGGER.warn('{}: {}'.format(PF, e))
            # keep the error text so this family is skipped on later calls
            d['mapping'] = str(e)
            continue
        try:
            # refine MSA ('seqid' param. is set as in PolyPhen-2)
            rowocc = 0.6
            while True:
                sliced_msa = refineMSA(sliced_msa, rowocc=rowocc)
                rowocc += 0.02
                if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                    break
            ref_msa = refineMSA(sliced_msa, seqid=0.94, **kwargs)
            d['ref_MSA'] = ref_msa
            # compute evolutionary properties
            d['entropy'] = calcShannonEntropy(ref_msa)
            d['MutInfo'] = buildMutinfoMatrix(ref_msa)
        except Exception as e:
            # best effort: properties left unset keep their reset values
            LOGGER.warn('{}: {}'.format(PF, e))
    return {k: self.Pfam[k] for k in PF_list}
def testRowocc(self):
    """Check ``refineMSA`` with only ``rowocc`` against a manual row filter."""
    observed = refineMSA(FASTA, rowocc=0.9)._getArray()

    # NOTE(review): 112.0 is presumably the number of alignment columns - confirm.
    row_mask = FASTA_ALPHA.sum(1) / 112.0 >= 0.9
    expected = FASTA._getArray()[row_mask, :]

    assert_array_equal(observed, expected)
def testColocc(self):
    """Check ``refineMSA`` with only ``colocc`` against a manual column filter.

    A column is expected to be kept when its ``FASTA_ALPHA`` column sum
    divided by ``NUMSEQ`` is at least 0.9.
    """
    observed = refineMSA(FASTA, colocc=0.9)._getArray()

    column_mask = FASTA_ALPHA.sum(0) / NUMSEQ >= 0.9
    expected = FASTA._getArray()[:, column_mask]

    assert_array_equal(observed, expected)
def testRowocc(self):
    """Check that ``rowocc`` refinement matches a hand-built row selection."""
    got = refineMSA(FASTA, rowocc=0.9)._getArray()

    # NOTE(review): 112. looks like the alignment length - verify against FASTA.
    keep_rows = FASTA_ALPHA.sum(1) / 112. >= 0.9
    want = FASTA._getArray()[keep_rows, :]

    assert_array_equal(got, want)