Ejemplo n.º 1
0
    def _convert_feature_stats(self, fstats, aln):
        """
        Rebuild ``aln.feature_stats`` from per-segment feature matrices.

        The result is indexed as ``[feature][segment][position]``, one outer
        entry per key of ``AMINO_ACID_GROUPS`` and one inner entry per segment
        in ``self.common_segments``. Each position holds a two-item list:
        the value as a string, and ``str(int(value / 10))`` for non-zero
        values or the integer ``-1`` for zero.

        :param fstats: mapping of segment -> 2D sequence indexed
            ``[feature][position]``.
        :param aln: object whose ``feature_stats`` attribute is overwritten.
        """

        def as_cell(value):
            # Zero gets the integer -1 marker; anything else a string bucket.
            text = str(value)
            bucket = -1 if value == 0 else str(int(value / 10))
            return [text, bucket]

        feature_count = len(AMINO_ACID_GROUPS)
        aln.feature_stats = [
            [
                [as_cell(value) for value in fstats[segment][feature_idx]]
                for segment in self.common_segments
            ]
            for feature_idx in range(feature_count)
        ]
Ejemplo n.º 2
0
 def score_protein(self, pcf):
     """
     Score a protein conformation against the filtered signature.

     For every relevant generic position the signature feature is the row
     with the largest absolute value in ``signature_matrix_filtered``.
     The residue at that position is looked up in the database:

     * residue found and it carries the feature: ``val`` is added to the
       score;
     * residue found but it does not carry the feature: score unchanged;
     * lookup fails (no residue, or more than one): ``val`` is subtracted
       from the score and the amino acid is reported as ``'_'``.

     :param pcf: protein conformation used to filter ``Residue`` objects.
     :returns: tuple ``(prot_score / 100, consensus_match)`` where
         ``consensus_match`` maps each relevant segment to a list of
         ``[feat_abr, feat_name, val, color, amino_acid, pos]`` entries.
         ``color`` is ``"white"`` whenever ``val`` is not positive,
         otherwise ``"green"`` for a match and ``"red"`` for a mismatch
         or a failed lookup.
     """
     prot_score = 0.0
     consensus_match = OrderedDict([(x, [])
                                    for x in self.relevant_segments])
     # Lookup tables are loop-invariant; build them once.
     feat_abrs = list(AMINO_ACID_GROUPS.keys())
     feat_names = list(AMINO_ACID_GROUP_NAMES.values())
     for segment in self.relevant_segments:
         tmp = []
         segment_matrix = self.signature_matrix_filtered[segment]
         signature_map = np.absolute(segment_matrix).argmax(axis=0)
         positions = self.relevant_gn[self.schemes[0][0]][segment].keys()
         resi = Residue.objects.filter(
             protein_segment__slug=segment,
             protein_conformation=pcf,
             generic_number__label__in=positions,
         )
         for idx, pos in enumerate(positions):
             feat = signature_map[idx]
             val = segment_matrix[feat][idx]
             try:
                 res = resi.get(generic_number__label=pos)
             except (exceptions.ObjectDoesNotExist,
                     exceptions.MultipleObjectsReturned):
                 # A missing or ambiguous residue is penalised.
                 prot_score -= val
                 amino_acid = '_'
                 color = "red"
             else:
                 amino_acid = res.amino_acid
                 if feat in self.residue_to_feat[amino_acid]:
                     prot_score += val
                     color = "green"
                 else:
                     # David doesn't want the negative values in the score,
                     # so a mismatch leaves prot_score untouched.
                     color = "red"
             tmp.append([
                 feat_abrs[feat], feat_names[feat], val,
                 color if val > 0 else "white", amino_acid, pos
             ])
         consensus_match[segment] = tmp
     return (prot_score / 100, consensus_match)
Ejemplo n.º 3
0
    def find_relevant_gns(self):
        """
        Select the generic positions whose signature feature meets the cutoff.

        For each segment the per-column feature is chosen via ``argmax`` on
        ``self.diff_matrix`` and then adjusted by
        ``self._assign_preferred_features``. Columns whose chosen value is
        ``>= self.cutoff`` are kept. The method populates:

        * ``self.relevant_gn`` - per scheme and segment, the kept generic
          numbers copied from ``self.common_gn``;
        * ``self.signature_matrix_filtered`` - per segment, the kept columns;
        * ``self.relevant_segments`` - per kept segment, the keys of
          ``relevant_gn`` for the first scheme;
        * ``self.signature_consensus`` - per kept segment, a list of
          ``[feature key, feature name, value, int(value / 20) + 5]``.
        """

        filtered = OrderedDict()
        for segment in self.segments:
            kept_columns = []
            # Prefer features with fewer amino acids over the raw argmax.
            best_rows = self._assign_preferred_features(
                self.diff_matrix[segment].argmax(axis=0), segment,
                self.diff_matrix)
            for col, row in enumerate(list(best_rows)):
                if not self.diff_matrix[segment][row][col] >= self.cutoff:
                    continue
                kept_columns.append(self.diff_matrix[segment][:, col])
                for scheme in self.schemes:
                    scheme_key = scheme[0]
                    label, number = list(
                        self.common_gn[scheme_key][segment].items())[col]
                    try:
                        per_segment = self.relevant_gn[scheme_key][segment]
                    except KeyError:
                        per_segment = OrderedDict()
                        self.relevant_gn[scheme_key][segment] = per_segment
                    per_segment[label] = number

            segment_matrix = np.array(kept_columns).T
            # An empty selection yields a 1-D array of shape (0,); skip it.
            if segment_matrix.shape != (0, ):
                filtered[segment] = segment_matrix

        self.signature_matrix_filtered = filtered
        self.relevant_segments = OrderedDict(
            (segment, self.relevant_gn[self.schemes[0][0]][segment].keys())
            for segment in self.signature_matrix_filtered)

        feature_keys = list(AMINO_ACID_GROUPS.keys())
        feature_names = list(AMINO_ACID_GROUP_NAMES.values())
        signature = OrderedDict((segment, []) for segment in filtered)
        for segment in self.relevant_segments:
            best_rows = self._assign_preferred_features(
                self.signature_matrix_filtered[segment].argmax(axis=0),
                segment, self.signature_matrix_filtered)
            values = np.array(self.signature_matrix_filtered[segment])
            entries = signature[segment]

            for col, row in enumerate(list(best_rows)):
                value = values[row][col]
                entries.append([
                    feature_keys[row],
                    feature_names[row],
                    value,
                    int(value / 20) + 5,
                ])
        self.signature_consensus = signature
Ejemplo n.º 4
0
    def find_relevant_gns(self):
        """
        Find the generic residue positions whose signature exceeds the cutoff.

        For each segment, the feature with the largest absolute value in
        ``self.diff_matrix`` is picked per column; columns where that
        absolute value is ``> self.cutoff`` are kept. The method populates:

        * ``self.relevant_gn`` - per scheme and segment, the kept generic
          numbers copied from ``self.common_gn``;
        * ``self.signature_matrix_filtered`` - per segment, the kept columns
          (segments with no kept column are omitted);
        * ``self.relevant_segments`` - per kept segment, the keys of
          ``relevant_gn`` for the first scheme;
        * ``self.signature_consensus`` - per kept segment, a list of
          ``[feature key, feature name, value, int(value / 20) + 5]``.
        """

        matrix_consensus = OrderedDict()
        for segment in self.segments:
            segment_consensus = []
            signature_map = np.absolute(
                self.diff_matrix[segment]).argmax(axis=0)
            for col, pos in enumerate(list(signature_map)):
                if abs(self.diff_matrix[segment][pos][col]) > self.cutoff:
                    segment_consensus.append(self.diff_matrix[segment][:, col])
                    for scheme in self.schemes:
                        gnum = list(
                            self.common_gn[scheme[0]][segment].items())[col]
                        try:
                            self.relevant_gn[scheme[0]][segment][
                                gnum[0]] = gnum[1]
                        except KeyError:
                            # First hit for this segment: create its mapping.
                            self.relevant_gn[
                                scheme[0]][segment] = OrderedDict()
                            self.relevant_gn[scheme[0]][segment][
                                gnum[0]] = gnum[1]

            segment_consensus = np.array(segment_consensus).T
            # Comparing an ndarray with ``[]`` is an elementwise operation
            # (and an error on shape mismatch in recent NumPy); test the
            # element count instead.
            if segment_consensus.size > 0:
                matrix_consensus[segment] = segment_consensus
        self.signature_matrix_filtered = matrix_consensus
        self.relevant_segments = OrderedDict([
            (x[0], self.relevant_gn[self.schemes[0][0]][x[0]].keys())
            for x in self.signature_matrix_filtered.items()
        ])

        # Lookup tables are loop-invariant; build them once.
        feature_keys = list(AMINO_ACID_GROUPS.keys())
        feature_names = list(AMINO_ACID_GROUP_NAMES.values())
        signature = OrderedDict([(x[0], []) for x in matrix_consensus.items()])
        for segment in self.relevant_segments:
            signature_map = np.absolute(
                self.signature_matrix_filtered[segment]).argmax(axis=0)
            tmp = np.array(self.signature_matrix_filtered[segment])
            for col, pos in enumerate(list(signature_map)):
                signature[segment].append([
                    feature_keys[pos],
                    feature_names[pos], tmp[pos][col],
                    int(tmp[pos][col] / 20) + 5
                ])
        self.signature_consensus = signature
Ejemplo n.º 5
0
    def __init__(self):
        """
        Set up empty alignments, result containers and feature lookup tables.

        ``group_lengths`` maps the positional index of each entry in
        ``AMINO_ACID_GROUPS`` to the length of its value.
        ``default_column`` holds one value per ``AMINO_ACID_GROUPS`` key:
        100 for keys that start with ``'-'`` or equal ``'_'`` (except
        ``'--'`` and keys starting with ``'-_'``), otherwise 0.
        """

        def starts_at_100(key):
            # '--' and '-_…' keys are excluded from the 100 default.
            if key == '--' or key.startswith('-_'):
                return False
            return key.startswith('-') or key == '_'

        self.aln_pos = Alignment()
        self.aln_neg = Alignment()

        # Result containers, filled in by later processing.
        self.features_normalized_pos = OrderedDict()
        self.features_normalized_neg = OrderedDict()
        self.features_frequency_difference = OrderedDict()
        self.features_frequency_diff_display = []

        self.freq_cutoff = 30
        self.common_gn = OrderedDict()

        self.feature_preference = prepare_aa_group_preference()
        self.group_lengths = {
            index: len(members)
            for index, members in enumerate(AMINO_ACID_GROUPS.values())
        }
        self.default_column = np.array(
            [100 if starts_at_100(key) else 0
             for key in AMINO_ACID_GROUPS.keys()])
Ejemplo n.º 6
0
    def calculate_signature(self):
        """
        Calculate the feature frequency difference between two protein sets.

        Generates the full differential matrix as well as the maximum
        difference for a position (for scatter plot).

        Attributes written, all keyed by the segments of ``self.aln_neg``:

        * ``features_normalized_pos`` / ``features_normalized_neg`` - integer
          feature matrices taken from each alignment's ``feature_stats``;
        * ``features_frequency_difference`` - positive minus negative matrix;
        * ``features_frequency_diff_display`` - display rows appended per
          feature (the list is appended to, not reset, by this method);
        * ``signature`` - per column, the feature with the largest absolute
          difference;
        * ``features_consensus_pos`` / ``features_consensus_neg`` - per
          column, the feature with the largest absolute value in each set.

        ``_convert_feature_stats`` is called on both alignments partway
        through and again at the end.
        """
        # Take the first item of every feature_stats entry as the raw value.
        for sid, segment in enumerate(self.aln_neg.segments):
            self.features_normalized_pos[segment] = np.array(
                [[x[0] for x in feat[sid]]
                 for feat in self.aln_pos.feature_stats],
                dtype='int')
            self.features_normalized_neg[segment] = np.array(
                [[x[0] for x in feat[sid]]
                 for feat in self.aln_neg.feature_stats],
                dtype='int')

        for segment in self.aln_neg.segments:
            # TODO: get the correct default numbering scheme from settings
            # Pad whichever set lacks a common position with a zero column so
            # both matrices line up before subtraction.
            for idx, res in enumerate(
                    self.common_gn[self.common_schemes[0][0]][segment].keys()):
                if res not in self.aln_pos.generic_numbers[
                        self.common_schemes[0][0]][segment].keys():
                    self.features_normalized_pos[segment] = np.insert(
                        self.features_normalized_pos[segment], idx, 0, axis=1)
                    # Set 100% occurrence for a gap feature
                    # (assumes the last row is the gap feature - TODO confirm)
                    self.features_normalized_pos[segment][-1, idx] = 100
                elif res not in self.aln_neg.generic_numbers[
                        self.common_schemes[0][0]][segment].keys():
                    self.features_normalized_neg[segment] = np.insert(
                        self.features_normalized_neg[segment], idx, 0, axis=1)
                    # Set 100% occurrence for a gap feature
                    # (assumes the last row is the gap feature - TODO confirm)
                    self.features_normalized_neg[segment][-1, idx] = 100

            # now the difference
            self.features_frequency_difference[segment] = np.subtract(
                self.features_normalized_pos[segment],
                self.features_normalized_neg[segment])

        # Push the padded matrices back to the alignments; the consensus
        # section below reads aln_*.feature_stats again after these calls.
        self._convert_feature_stats(self.features_normalized_pos, self.aln_pos)
        self._convert_feature_stats(self.features_normalized_neg, self.aln_neg)

        # Version with display data
        for row in range(len(AMINO_ACID_GROUPS.keys())):
            tmp_row = []
            for segment in self.aln_neg.segments:
                # first item is the real value,
                # second is the assignment of color (via css)
                # 0 - red, 5 - yellow, 10 - green
                # third item is a tooltip ("<pos value> - <neg value>")
                tmp_row.append([[
                    x,
                    int(x / 20) + 5, "{} - {}".format(
                        self.features_normalized_pos[segment][row][y],
                        self.features_normalized_neg[segment][row][y])
                ] for y, x in enumerate(
                    self.features_frequency_difference[segment][row])])
            self.features_frequency_diff_display.append(tmp_row)

        # Signature: per column, the feature row with the largest absolute
        # difference, stored with its key, name, value and color index.
        self.signature = OrderedDict([(x, []) for x in self.aln_neg.segments])
        for segment in self.aln_neg.segments:
            tmp = np.array(self.features_frequency_difference[segment])
            signature_map = np.absolute(tmp).argmax(axis=0)
            self.signature[segment] = []
            for col, pos in enumerate(list(signature_map)):
                self.signature[segment].append([
                    list(AMINO_ACID_GROUPS.keys())[pos],
                    list(AMINO_ACID_GROUP_NAMES.values())[pos],
                    self.features_frequency_difference[segment][pos][col],
                    int(self.features_frequency_difference[segment][pos][col] /
                        20) + 5
                ])

        # Per-set consensus: the same selection applied to each set's own
        # feature matrix, re-read from the alignments' feature_stats.
        features_pos = OrderedDict()
        features_neg = OrderedDict()
        self.features_consensus_pos = OrderedDict([
            (x, []) for x in self.aln_neg.segments
        ])
        self.features_consensus_neg = OrderedDict([
            (x, []) for x in self.aln_neg.segments
        ])
        for sid, segment in enumerate(self.aln_neg.segments):
            features_pos[segment] = np.array(
                [[x[0] for x in feat[sid]]
                 for feat in self.aln_pos.feature_stats],
                dtype='int')
            features_neg[segment] = np.array(
                [[x[0] for x in feat[sid]]
                 for feat in self.aln_neg.feature_stats],
                dtype='int')
            features_cons_pos = np.absolute(
                features_pos[segment]).argmax(axis=0)
            features_cons_neg = np.absolute(
                features_neg[segment]).argmax(axis=0)

            for col, pos in enumerate(list(features_cons_pos)):
                self.features_consensus_pos[segment].append([
                    list(AMINO_ACID_GROUPS.keys())[pos],
                    list(AMINO_ACID_GROUP_NAMES.values())[pos],
                    features_pos[segment][pos][col],
                    int(features_pos[segment][pos][col] / 20) + 5
                ])
            for col, pos in enumerate(list(features_cons_neg)):
                self.features_consensus_neg[segment].append([
                    list(AMINO_ACID_GROUPS.keys())[pos],
                    list(AMINO_ACID_GROUP_NAMES.values())[pos],
                    features_neg[segment][pos][col],
                    int(features_neg[segment][pos][col] / 20) + 5
                ])
        # NOTE(review): these repeat the two calls made earlier with the same
        # arguments - looks redundant, confirm before removing.
        self._convert_feature_stats(self.features_normalized_pos, self.aln_pos)
        self._convert_feature_stats(self.features_normalized_neg, self.aln_neg)
Ejemplo n.º 7
0
    def score_protein(self, pcf):
        """
        Score a protein conformation against the filtered signature.

        All residues at relevant generic positions are fetched in a single
        query. For every relevant position the signature feature is chosen
        by ``argmax`` and adjusted by ``self._assign_preferred_features``:

        * residue present and it carries the feature: ``val`` is added to
          the score when positive;
        * residue absent and the feature is named ``'Gap'``: ``val`` is
          added to the score;
        * otherwise the score is unchanged.

        :param pcf: protein conformation used to filter ``Residue`` objects.
        :returns: tuple ``(prot_score / 100, normalized, consensus_match)``.
            ``normalized`` is ``prot_score / norm * 100`` where ``norm`` is
            the sum of the column maxima over all relevant segments, or
            ``0.0`` when ``norm`` is zero (e.g. no relevant segments).
            ``consensus_match`` maps each relevant segment to a list of
            ``[feat_abr, feat_name, val, color, amino_acid, pos]`` entries;
            ``color`` is ``"white"`` whenever ``val`` is not positive,
            otherwise ``"green"`` for a match and ``"red"`` for a mismatch.
        """
        prot_score = 0.0
        norm = 0.0
        consensus_match = OrderedDict([(x, [])
                                       for x in self.relevant_segments])

        relevant_gns_total = []
        for segment in self.relevant_segments:
            relevant_gns_total.extend(
                self.relevant_gn[self.schemes[0][0]][segment].keys())

        resi = Residue.objects.filter(
            protein_conformation=pcf,
            generic_number__label__in=relevant_gns_total).prefetch_related(
                'generic_number')
        resi_dict = {}
        for r in resi:
            if r.generic_number:
                resi_dict[r.generic_number.label] = r

        # Lookup tables are loop-invariant; build them once.
        feat_abrs = list(AMINO_ACID_GROUPS.keys())
        feat_names = list(AMINO_ACID_GROUP_NAMES.values())

        for segment in self.relevant_segments:
            tmp = []
            signature_map = self.signature_matrix_filtered[segment].argmax(
                axis=0)
            signature_map = self._assign_preferred_features(
                signature_map, segment, self.signature_matrix_filtered)
            segment_matrix = self.signature_matrix_filtered[segment]

            norm += np.sum(np.amax(segment_matrix, axis=0))

            for idx, pos in enumerate(
                    self.relevant_gn[self.schemes[0][0]][segment].keys()):
                feat = signature_map[idx]
                feat_abr = feat_abrs[feat]
                feat_name = feat_names[feat]
                val = segment_matrix[feat][idx]
                if pos in resi_dict:
                    amino_acid = resi_dict[pos].amino_acid
                    matched = feat in self.residue_to_feat[amino_acid]
                    if matched and val > 0:
                        prot_score += val
                else:
                    amino_acid = '_'
                    matched = feat_name == 'Gap'
                    if matched:
                        # NOTE(review): unlike the residue branch, this adds
                        # val regardless of its sign - kept as in the
                        # original; confirm this is intended.
                        prot_score += val
                # David doesn't want the negative values in the score, so a
                # mismatch only affects the color, never prot_score.
                if val > 0:
                    color = "green" if matched else "red"
                else:
                    color = "white"
                tmp.append([feat_abr, feat_name, val, color, amino_acid, pos])
            consensus_match[segment] = tmp

        # Avoid dividing by zero when nothing contributed to the norm.
        normalized = prot_score / norm * 100 if norm else 0.0
        return (prot_score / 100, normalized, consensus_match)