def __init__(self, common_positions, numbering_schemes, segments,
             difference_matrix, protein_set, cutoff=40):
    """Store the signature inputs and derive the cutoff-filtered signature.

    Args:
        common_positions: Kept as ``self.common_gn``; ``find_relevant_gns``
            indexes it as ``[scheme_key][segment]``.
        numbering_schemes: Kept as ``self.schemes``; element ``[0]`` of
            every item is used as the key into ``self.relevant_gn``.
        segments: Kept as ``self.segments`` and iterated by
            ``find_relevant_gns``.
        difference_matrix: Kept as ``self.diff_matrix``, indexed by segment.
        protein_set: Kept as ``self.protein_set``; not used in this method.
        cutoff: Threshold stored as ``self.cutoff`` (default 40).
    """
    # Raw inputs.
    self.cutoff = cutoff
    self.common_gn = common_positions
    self.schemes = numbering_schemes
    self.segments = segments
    self.diff_matrix = difference_matrix

    # Containers populated by find_relevant_gns() and by later scoring.
    self.signature_matrix_filtered = OrderedDict()
    self.signature_consensus = OrderedDict()
    self.protein_set = protein_set
    self.relevant_gn = OrderedDict(
        (scheme[0], OrderedDict()) for scheme in self.schemes)
    self.relevant_segments = OrderedDict()
    self.scored_proteins = []
    self.protein_report = OrderedDict()
    self.protein_signatures = OrderedDict()

    # Needs every attribute set above, so it must run after them.
    self.find_relevant_gns()

    # Reverse lookup: residue code -> set of feature (group) row indices
    # whose comma-separated definition lists that residue.
    feature_lookup = {residue: set() for residue in AMINO_ACIDS}
    self.residue_to_feat = feature_lookup
    for feature_idx, members in enumerate(AMINO_ACID_GROUPS.values()):
        for residue in members.split(','):
            feature_lookup[residue].add(feature_idx)
def _convert_feature_stats(self, fstats, aln):
    """Rebuild ``aln.feature_stats`` from per-segment feature matrices.

    The result is a list with one entry per key of ``AMINO_ACID_GROUPS``;
    each entry holds one list per segment in ``self.common_segments``, and
    each of those holds a ``[str(value), bucket]`` pair per column of
    ``fstats[segment][row]``. ``bucket`` is ``str(int(value / 10))`` for
    non-zero values and the integer ``-1`` for zero.

    Args:
        fstats: Mapping from segment to a row-indexable matrix.
        aln: Object whose ``feature_stats`` attribute is overwritten.
    """

    def as_cell(value):
        # Zero gets the integer sentinel -1 instead of a string bucket.
        if value != 0:
            return [str(value), str(int(value / 10))]
        return [str(value), -1]

    aln.feature_stats = [
        [
            [as_cell(value) for value in fstats[segment][row]]
            for segment in self.common_segments
        ]
        for row in range(len(AMINO_ACID_GROUPS))
    ]
def __init__(self):
    """Create the two alignments and the empty result containers."""

    def marks_default(group_key):
        # True for keys that start with '-' or equal '_', except the key
        # '--' and any key starting with '-_'.
        # NOTE(review): presumably these are the gap-like features - confirm.
        if group_key == '--' or group_key.startswith('-_'):
            return False
        return group_key.startswith('-') or group_key == '_'

    self.aln_pos = Alignment()
    self.aln_neg = Alignment()

    self.features_normalized_pos = OrderedDict()
    self.features_normalized_neg = OrderedDict()
    self.features_frequency_difference = OrderedDict()
    self.features_frequency_diff_display = []

    self.freq_cutoff = 30
    self.common_gn = OrderedDict()

    self.feature_preference = prepare_aa_group_preference()
    # Feature row index -> length of that group's definition value.
    self.group_lengths = {
        row: len(members)
        for row, members in enumerate(AMINO_ACID_GROUPS.values())
    }
    # One entry per feature row: 100 where marks_default() holds, else 0.
    self.default_column = np.array([
        100 if marks_default(group_key) else 0
        for group_key in AMINO_ACID_GROUPS
    ])
def score_protein(self, pcf):
    """Score one protein conformation against the filtered signature.

    For every relevant position the feature with the largest absolute
    value in ``self.signature_matrix_filtered[segment]`` is selected and
    compared with the residue found at that generic number.

    Scoring rules:
      * residue carries the feature: the value is added to the score;
      * residue does not carry the feature: the score is left untouched
        (mismatches are deliberately not subtracted);
      * residue lookup fails (none, or more than one match): the value is
        subtracted and the residue is reported as ``'_'``.

    Args:
        pcf: Value passed as ``protein_conformation`` to the ``Residue``
            query.

    Returns:
        ``(score / 100, consensus_match)`` where ``consensus_match`` maps
        each relevant segment to a list of
        ``[feature_code, feature_name, value, colour, amino_acid, position]``.
        ``colour`` is ``"green"`` for a match, ``"red"`` for a mismatch or
        failed lookup, and ``"white"`` whenever the value is not positive.
    """
    prot_score = 0.0
    consensus_match = OrderedDict([(x, []) for x in self.relevant_segments])

    for segment in self.relevant_segments:
        # Built once per segment rather than once per position.
        group_codes = list(AMINO_ACID_GROUPS.keys())
        group_names = list(AMINO_ACID_GROUP_NAMES.values())

        segment_matrix = self.signature_matrix_filtered[segment]
        positions = self.relevant_gn[self.schemes[0][0]][segment].keys()
        signature_map = np.absolute(segment_matrix).argmax(axis=0)
        resi = Residue.objects.filter(
            protein_segment__slug=segment,
            protein_conformation=pcf,
            generic_number__label__in=positions,
        )

        matches = []
        for idx, pos in enumerate(positions):
            feat = signature_map[idx]
            val = segment_matrix[feat][idx]
            try:
                res = resi.get(generic_number__label=pos)
                amino_acid = res.amino_acid
                if feat in self.residue_to_feat[amino_acid]:
                    prot_score += val
                    colour = "green"
                else:
                    # Mismatch: intentionally not subtracted from the score.
                    colour = "red"
            except (exceptions.ObjectDoesNotExist,
                    exceptions.MultipleObjectsReturned):
                # No unambiguous residue at this position.
                prot_score -= val
                amino_acid = '_'
                colour = "red"

            # Non-positive values are always shown neutrally.
            if not val > 0:
                colour = "white"
            matches.append([
                group_codes[feat], group_names[feat], val, colour,
                amino_acid, pos
            ])
        consensus_match[segment] = matches

    return (prot_score / 100, consensus_match)
def find_relevant_gns(self):
    """Select the positions whose chosen feature value reaches the cutoff.

    For each segment the per-column feature is taken from ``argmax`` over
    ``self.diff_matrix[segment]`` and then passed through
    ``self._assign_preferred_features``. Columns whose selected value is
    ``>= self.cutoff`` are kept.

    Side effects:
        * ``self.relevant_gn[scheme][segment]`` gains the kept positions;
        * ``self.signature_matrix_filtered`` holds the kept columns per
          segment (segments without any kept column are left out);
        * ``self.relevant_segments`` maps each kept segment to the key
          view of its positions in the first numbering scheme;
        * ``self.signature_consensus`` holds, per kept column,
          ``[feature_code, feature_name, value, int(value / 20) + 5]``.
    """
    filtered = OrderedDict()

    for segment in self.segments:
        chosen_rows = self.diff_matrix[segment].argmax(axis=0)
        # Let the helper re-pick features (fewer amino acids preferred).
        chosen_rows = self._assign_preferred_features(
            chosen_rows, segment, self.diff_matrix)
        segment_matrix = self.diff_matrix[segment]

        kept_columns = []
        for col, row in enumerate(chosen_rows):
            if segment_matrix[row][col] >= self.cutoff:
                kept_columns.append(segment_matrix[:, col])
                for scheme in self.schemes:
                    scheme_key = scheme[0]
                    label, number = list(
                        self.common_gn[scheme_key][segment].items())[col]
                    per_segment = self.relevant_gn[scheme_key].setdefault(
                        segment, OrderedDict())
                    per_segment[label] = number

        stacked = np.array(kept_columns).T
        # An empty selection yields shape (0,); such segments are skipped.
        if stacked.shape != (0, ):
            filtered[segment] = stacked

    self.signature_matrix_filtered = filtered
    self.relevant_segments = OrderedDict(
        (segment, self.relevant_gn[self.schemes[0][0]][segment].keys())
        for segment in filtered)

    signature = OrderedDict((segment, []) for segment in filtered)
    for segment in self.relevant_segments:
        group_codes = list(AMINO_ACID_GROUPS.keys())
        group_names = list(AMINO_ACID_GROUP_NAMES.values())

        chosen_rows = self.signature_matrix_filtered[segment].argmax(axis=0)
        chosen_rows = self._assign_preferred_features(
            chosen_rows, segment, self.signature_matrix_filtered)
        values = np.array(self.signature_matrix_filtered[segment])

        signature[segment].extend(
            [
                group_codes[row],
                group_names[row],
                values[row][col],
                int(values[row][col] / 20) + 5,
            ]
            for col, row in enumerate(chosen_rows)
        )
    self.signature_consensus = signature
def find_relevant_gns(self):
    """Select the positions whose strongest feature exceeds the cutoff.

    For each segment the feature with the largest absolute value is picked
    per column of ``self.diff_matrix[segment]``; columns whose absolute
    value is ``> self.cutoff`` are kept.

    Side effects:
        * ``self.relevant_gn[scheme][segment]`` gains the kept positions;
        * ``self.signature_matrix_filtered`` holds the kept columns per
          segment (segments without any kept column are left out);
        * ``self.relevant_segments`` maps each kept segment to the key
          view of its positions in the first numbering scheme;
        * ``self.signature_consensus`` holds, per kept column,
          ``[feature_code, feature_name, value, int(value / 20) + 5]``.
    """
    matrix_consensus = OrderedDict()

    for segment in self.segments:
        segment_matrix = self.diff_matrix[segment]
        signature_map = np.absolute(segment_matrix).argmax(axis=0)

        segment_consensus = []
        for col, pos in enumerate(signature_map):
            if abs(segment_matrix[pos][col]) > self.cutoff:
                segment_consensus.append(segment_matrix[:, col])
                for scheme in self.schemes:
                    gnum = list(
                        self.common_gn[scheme[0]][segment].items())[col]
                    # Create the per-segment mapping on first use.
                    self.relevant_gn[scheme[0]].setdefault(
                        segment, OrderedDict())[gnum[0]] = gnum[1]

        segment_consensus = np.array(segment_consensus).T
        # Test emptiness through the shape. Comparing the array with []
        # broadcasts: a single kept column gives an empty (hence falsy)
        # result and the segment would be dropped, while other shapes
        # raise on current NumPy releases.
        if segment_consensus.shape != (0, ):
            matrix_consensus[segment] = segment_consensus

    self.signature_matrix_filtered = matrix_consensus
    self.relevant_segments = OrderedDict([
        (x[0], self.relevant_gn[self.schemes[0][0]][x[0]].keys())
        for x in self.signature_matrix_filtered.items()
    ])

    signature = OrderedDict([(x[0], []) for x in matrix_consensus.items()])
    for segment in self.relevant_segments:
        # Built once per segment rather than once per column.
        group_codes = list(AMINO_ACID_GROUPS.keys())
        group_names = list(AMINO_ACID_GROUP_NAMES.values())

        tmp = np.array(self.signature_matrix_filtered[segment])
        signature_map = np.absolute(tmp).argmax(axis=0)
        for col, pos in enumerate(signature_map):
            signature[segment].append([
                group_codes[pos],
                group_names[pos],
                tmp[pos][col],
                int(tmp[pos][col] / 20) + 5,
            ])
    self.signature_consensus = signature
def prepare_aa_group_preference():
    """Map each feature row to the rows of all strictly shorter groups.

    Groups are ranked by ``len()`` of their value in ``AMINO_ACID_GROUPS``.
    NOTE(review): the values appear to be comma-separated strings, so this
    is a character count used as a proxy for group size - confirm.

    Returns:
        dict: ``{row_index: [row indices of groups with a smaller length]}``.
        The listed rows are ordered by ascending length; rows belonging to
        the shortest groups map to an empty list.
    """
    rows_by_length = {}
    for row, members in enumerate(AMINO_ACID_GROUPS.values()):
        rows_by_length.setdefault(len(members), []).append(row)

    ascending = sorted(rows_by_length)
    preference = {}
    # Walk from the longest groups down, as the keys are inserted in
    # that order.
    for rank in range(len(ascending) - 1, -1, -1):
        shorter_rows = [
            row
            for length in ascending[:rank]
            for row in rows_by_length[length]
        ]
        for feature_row in rows_by_length[ascending[rank]]:
            # Each row gets its own list object.
            preference[feature_row] = list(shorter_rows)
    return preference
def calculate_signature(self):
    """
    Calculate the feature frequency difference between two protein sets.

    Works on ``self.aln_pos`` and ``self.aln_neg`` and fills:

    * ``features_normalized_pos`` / ``features_normalized_neg``: integer
      feature matrices per segment, with a column inserted for every
      common position an alignment lacks;
    * ``features_frequency_difference``: ``pos - neg`` per segment;
    * ``features_frequency_diff_display``: per feature row and segment,
      ``[difference, int(difference / 20) + 5, "pos - neg"]`` entries;
    * ``signature``: per segment and column, the feature with the largest
      absolute difference;
    * ``features_consensus_pos`` / ``features_consensus_neg``: per segment
      and column, the feature with the largest absolute value in each set.

    ``feature_stats`` of both alignments is rewritten through
    ``_convert_feature_stats`` along the way.
    """
    # Extract the first element of every feature_stats entry as an integer
    # matrix per segment.
    # NOTE(review): ``sid`` comes from aln_neg.segments but also indexes
    # aln_pos.feature_stats - assumes both alignments share segment order.
    for sid, segment in enumerate(self.aln_neg.segments):
        self.features_normalized_pos[segment] = np.array(
            [[x[0] for x in feat[sid]]
             for feat in self.aln_pos.feature_stats],
            dtype='int')
        self.features_normalized_neg[segment] = np.array(
            [[x[0] for x in feat[sid]]
             for feat in self.aln_neg.feature_stats],
            dtype='int')

    for segment in self.aln_neg.segments:
        # TODO: get the correct default numbering scheme from settings
        # (currently the first entry of self.common_schemes is used).
        for idx, res in enumerate(
                self.common_gn[self.common_schemes[0][0]][segment].keys()):
            # A common position missing from one alignment gets a zero
            # column inserted at its index so both matrices line up.
            if res not in self.aln_pos.generic_numbers[
                    self.common_schemes[0][0]][segment].keys():
                self.features_normalized_pos[segment] = np.insert(
                    self.features_normalized_pos[segment], idx, 0, axis=1)
                # Set 100% occurrence in the last row, which the original
                # comment identifies as the gap feature.
                self.features_normalized_pos[segment][-1, idx] = 100
            elif res not in self.aln_neg.generic_numbers[
                    self.common_schemes[0][0]][segment].keys():
                self.features_normalized_neg[segment] = np.insert(
                    self.features_normalized_neg[segment], idx, 0, axis=1)
                # Set 100% occurrence in the last row, which the original
                # comment identifies as the gap feature.
                self.features_normalized_neg[segment][-1, idx] = 100

        # Now the difference: positive set minus negative set.
        self.features_frequency_difference[segment] = np.subtract(
            self.features_normalized_pos[segment],
            self.features_normalized_neg[segment])

    # Replace feature_stats of both alignments with the padded matrices.
    self._convert_feature_stats(self.features_normalized_pos, self.aln_pos)
    self._convert_feature_stats(self.features_normalized_neg, self.aln_neg)

    # Version with display data
    for row in range(len(AMINO_ACID_GROUPS.keys())):
        tmp_row = []
        for segment in self.aln_neg.segments:
            # First item is the real value,
            # second is the assignment of colour (via css):
            # 0 - red, 5 - yellow, 10 - green,
            # third item is a tooltip ("<pos value> - <neg value>").
            tmp_row.append([[
                x,
                int(x / 20) + 5,
                "{} - {}".format(
                    self.features_normalized_pos[segment][row][y],
                    self.features_normalized_neg[segment][row][y])
            ] for y, x in enumerate(
                self.features_frequency_difference[segment][row])])
        self.features_frequency_diff_display.append(tmp_row)

    # Per column, keep the feature with the largest absolute difference.
    self.signature = OrderedDict([(x, []) for x in self.aln_neg.segments])
    for segment in self.aln_neg.segments:
        tmp = np.array(self.features_frequency_difference[segment])
        signature_map = np.absolute(tmp).argmax(axis=0)
        self.signature[segment] = []
        for col, pos in enumerate(list(signature_map)):
            self.signature[segment].append([
                list(AMINO_ACID_GROUPS.keys())[pos],
                list(AMINO_ACID_GROUP_NAMES.values())[pos],
                self.features_frequency_difference[segment][pos][col],
                int(self.features_frequency_difference[segment][pos][col]
                    / 20) + 5
            ])

    # Per-set consensus, read back from the feature_stats rewritten above
    # (their first element is a string, converted again by dtype='int').
    features_pos = OrderedDict()
    features_neg = OrderedDict()
    self.features_consensus_pos = OrderedDict([
        (x, []) for x in self.aln_neg.segments
    ])
    self.features_consensus_neg = OrderedDict([
        (x, []) for x in self.aln_neg.segments
    ])
    for sid, segment in enumerate(self.aln_neg.segments):
        features_pos[segment] = np.array(
            [[x[0] for x in feat[sid]]
             for feat in self.aln_pos.feature_stats],
            dtype='int')
        features_neg[segment] = np.array(
            [[x[0] for x in feat[sid]]
             for feat in self.aln_neg.feature_stats],
            dtype='int')
        # Row index of the largest absolute value in every column.
        features_cons_pos = np.absolute(
            features_pos[segment]).argmax(axis=0)
        features_cons_neg = np.absolute(
            features_neg[segment]).argmax(axis=0)
        for col, pos in enumerate(list(features_cons_pos)):
            self.features_consensus_pos[segment].append([
                list(AMINO_ACID_GROUPS.keys())[pos],
                list(AMINO_ACID_GROUP_NAMES.values())[pos],
                features_pos[segment][pos][col],
                int(features_pos[segment][pos][col] / 20) + 5
            ])
        for col, pos in enumerate(list(features_cons_neg)):
            self.features_consensus_neg[segment].append([
                list(AMINO_ACID_GROUPS.keys())[pos],
                list(AMINO_ACID_GROUP_NAMES.values())[pos],
                features_neg[segment][pos][col],
                int(features_neg[segment][pos][col] / 20) + 5
            ])

    # Same conversion as performed earlier in this method, applied again.
    self._convert_feature_stats(self.features_normalized_pos, self.aln_pos)
    self._convert_feature_stats(self.features_normalized_neg, self.aln_neg)
def score_protein(self, pcf):
    """Score one protein conformation against the filtered signature.

    All residues at the relevant generic numbers are fetched with a single
    query and indexed by generic-number label. For every relevant position
    the signature feature (``argmax`` refined by
    ``self._assign_preferred_features``) is compared with the residue.

    Scoring rules:
      * residue carries the feature: a positive value is added;
      * residue does not carry the feature: the score is left untouched
        (mismatches are deliberately not subtracted);
      * no residue at the position and the feature name is ``'Gap'``: the
        value is added; otherwise the score is left untouched.

    Args:
        pcf: Value passed as ``protein_conformation`` to the ``Residue``
            query.

    Returns:
        ``(score / 100, normalized_score, consensus_match)``.
        ``normalized_score`` is ``score / norm * 100`` where ``norm`` is
        the sum of the per-column maxima over all relevant segments, or
        ``0.0`` when ``norm`` is zero. ``consensus_match`` maps each
        relevant segment to a list of
        ``[feature_code, feature_name, value, colour, amino_acid, position]``
        with colour ``"green"`` (match), ``"red"`` (mismatch) or
        ``"white"`` (value not positive).
    """
    consensus_match = OrderedDict([(x, []) for x in self.relevant_segments])
    if not self.relevant_segments:
        # Nothing to score: without this the normalisation below would
        # divide 0.0 by 0.0 and raise ZeroDivisionError.
        return (0.0, 0.0, consensus_match)

    prot_score = 0.0
    norm = 0.0

    relevant_gns_total = [
        pos
        for segment in self.relevant_segments
        for pos in self.relevant_gn[self.schemes[0][0]][segment].keys()
    ]
    resi = Residue.objects.filter(
        protein_conformation=pcf,
        generic_number__label__in=relevant_gns_total).prefetch_related(
            'generic_number')
    resi_dict = {
        r.generic_number.label: r for r in resi if r.generic_number
    }

    # Built once instead of once per position.
    group_codes = list(AMINO_ACID_GROUPS.keys())
    group_names = list(AMINO_ACID_GROUP_NAMES.values())

    for segment in self.relevant_segments:
        signature_map = self.signature_matrix_filtered[segment].argmax(
            axis=0)
        signature_map = self._assign_preferred_features(
            signature_map, segment, self.signature_matrix_filtered)
        segment_matrix = self.signature_matrix_filtered[segment]
        norm += np.sum(np.amax(segment_matrix, axis=0))

        matches = []
        for idx, pos in enumerate(
                self.relevant_gn[self.schemes[0][0]][segment].keys()):
            feat = signature_map[idx]
            feat_name = group_names[feat]
            val = segment_matrix[feat][idx]

            if pos in resi_dict:
                amino_acid = resi_dict[pos].amino_acid
                matched = feat in self.residue_to_feat[amino_acid]
                if matched and val > 0:
                    prot_score += val
            else:
                # No residue at this position: it only counts as a match
                # when the signature feature itself is the gap.
                amino_acid = '_'
                matched = feat_name == 'Gap'
                if matched:
                    # NOTE(review): added without the ``val > 0`` guard
                    # used for present residues - kept as in the original.
                    prot_score += val

            if val > 0:
                colour = "green" if matched else "red"
            else:
                colour = "white"
            matches.append([
                group_codes[feat], feat_name, val, colour, amino_acid, pos
            ])
        consensus_match[segment] = matches

    normalized_score = prot_score / norm * 100 if norm != 0 else 0.0
    return (prot_score / 100, normalized_score, consensus_match)