def block2pssm(self, block_data, seq):
    """Build a ``PSSM`` from per-position score rows.

    Parameters
    ----------
    block_data
        Collection of score rows, one per position. Each row is indexed
        with ``self.order[letter]`` to look up the score of a letter.
    seq
        Indexable sequence read in step with ``block_data``; ``seq[i]`` is
        stored next to the scores of row ``i``.

    Returns
    -------
    PSSM
        Constructed from a list of ``(seq[i], {letter: score})`` tuples,
        one tuple per row of ``block_data``.
    """
    columns = []
    for position, row in enumerate(block_data):
        # Re-key the row from its positional layout to a letter -> score map.
        scores = {letter: row[self.order[letter]] for letter in self.alphabet}
        columns.append((seq[position], scores))
    return PSSM(columns)
def from_sequences(
    cls,
    sequences,
    name="unnamed",
    pseudocounts="jaspar",
    threshold=None,
    relative_threshold=None,
):
    """Build a PSSM pattern from a collection of same-length sequences.

    Parameters
    ----------
    sequences
      A list of same-length sequences. Each one is wrapped in ``Seq``
      before the motif is created.

    name
      Name assigned to the resulting PSSM (will appear in reports etc.)

    pseudocounts
      Forwarded to ``cls.apply_pseudocounts``. Either a dict
      ``{"A": 0.01, "T": ...}``, or ``"jaspar"`` for automatic pseudocounts
      from the Biopython.motifs.jaspar module (recommended), or ``None``
      for no pseudocounts at all (not recommended!)

    threshold
      Locations of the sequence with a PSSM score above this value will be
      considered matches. For convenience, a ``relative_threshold`` can be
      given instead.

    relative_threshold
      Value between 0 and 1 from which the threshold will be
      auto-computed. 0 means "match everything", 1 means "only match the
      one (or several) sequence(s) with the absolute highest possible
      score".

    Returns
    -------
    MotifPssmPattern
      Pattern wrapping the named PSSM and the given thresholds. Note that
      the result is always a ``MotifPssmPattern``; ``cls`` is only used to
      reach ``apply_pseudocounts``.
    """
    motif = motifs.create([Seq(sequence) for sequence in sequences])
    # Pseudocounts are applied to the motif before its PSSM is read.
    cls.apply_pseudocounts(motif, pseudocounts)

    scoring_matrix = PSSM(motif.pssm)
    scoring_matrix.name = name

    return MotifPssmPattern(
        pssm=scoring_matrix,
        threshold=threshold,
        relative_threshold=relative_threshold,
    )
def parse_pssm(fname):
    """Read a whitespace-delimited PSSM text file into a ``PSSM``.

    A line is treated as a matrix row only when its first token begins
    with a digit; blank lines and every other line are skipped. For a row,
    the second token is kept as-is and tokens 3 through 22 are converted to
    ``int`` and paired, in order, with the entries of the module-level
    ``AA`` sequence.

    Parameters
    ----------
    fname
        Path of the file to read.

    Returns
    -------
    PSSM
        Constructed from a list of ``(second_token, {aa: score})`` tuples
        in file order.
    """
    rows = []
    with open(fname) as handle:
        for raw_line in handle:
            fields = raw_line.rstrip('\r\n').split()
            if not fields:
                continue
            # Matrix rows start with a number; headers and footers do not.
            if re.match(r'\d+', fields[0]) is None:
                continue
            residue = fields[1]
            scores = {aa: int(value) for aa, value in zip(AA, fields[2:22])}
            rows.append((residue, scores))
    return PSSM(rows)