def _predict_proba(self, X):
    """Return, for every object of ``X``, a pair ``(classes, probabilities)``
    with candidate classes ordered by decreasing probability.

    This is the shared backend called by ``predict`` and ``predict_proba``.

    Parameters
    ----------
    X : list
        Feature representations of the words to classify.

    Returns
    -------
    list
        ``answer[i] = (indices, probs)`` aligned with ``X``, where ``indices``
        are class labels and ``probs`` the matching probabilities.
    """
    answer = [None] * len(X)
    # Helper flattening one (possibly sparse) feature row to a dense 1-D array.
    row_denser = ((lambda x: np.ravel(x.todense())) if self.sparse
                  else (lambda x: x))
    if self.has_letter_classifiers:
        # Classification is performed separately for each final letter.
        data_indexes_by_letters = arrange_indexes_by_last_letters(X)
        for letter, indexes in data_indexes_by_letters.items():
            curr_X = [X[i] for i in indexes]
            cls = self.letter_classifiers_.get(letter)
            if cls is None:
                # No classifier was trained for this letter: fall back to the
                # precomputed per-letter probabilities (or the global row for
                # previously unseen letters).
                if letter in self._default_letter_probs:
                    probs_row = self._default_letter_probs[letter]
                else:
                    probs_row = self.new_letter_probs
                curr_X_probs = np.tile(probs_row, (len(indexes), 1))
                curr_classes = range(len(self.classes_))
                active_classes_number = len(curr_classes)
                # BUGFIX: these three names were left unassigned on this
                # branch, so the loops below either raised NameError or
                # reused stale values from the previous letter. With the
                # fallback probabilities every object is a "train" object
                # and there are no "other" objects.
                train_indexes = list(range(len(indexes)))
                other_indexes, other_probs = [], []
            else:
                active_classes_number = cls.active_classes_number
                (train_indexes, X_train), (other_indexes, other_probs) =\
                    self._prepare_to_joint_classifier(
                        cls, cls.classes_, curr_X,
                        active_classes_number=active_classes_number)
                curr_X_probs = self.joint_classifiers_[letter].predict_proba(X_train)
                curr_classes = cls.classes_
            # Objects whose classes occurred in the training data.
            for i, (train_index, word_probs) in enumerate(
                    zip(train_indexes, curr_X_probs)):
                index = indexes[train_index]
                if cls is None:
                    row_ = self._fits_to_which_lemma_fragmentors(curr_X[i],
                                                                 negate=True)
                else:
                    row = row_denser(X_train[i])
                    # A zero in every third position marks a paradigm that
                    # cannot apply to this word — TODO confirm feature layout.
                    row_ = [j for j in range(active_classes_number)
                            if row[3*j] == 0.0]
                indices, probs = self._extract_word_probs(word_probs, row_)
                indices = [curr_classes[j] for j in indices]
                answer[index] = (indices, probs)
            # Objects whose classes did not occur in the training data:
            # their classes are inherited from the base classifier.
            for other_index, (indices, probs) in zip(other_indexes, other_probs):
                index = indexes[other_index]
                indices = [curr_classes[j] for j in indices]
                answer[index] = (indices, probs)
    else:
        # A single joint classifier over all classes.
        cls_classes = [i for i, _ in enumerate(self.classes_)]
        active_classes_number = self.paradigm_classifier.active_classes_number
        (train_indexes, X_train), (other_indexes, other_probs) =\
            self._prepare_to_joint_classifier(
                self.paradigm_classifier, cls_classes, X,
                active_classes_number=active_classes_number)
        probs = self.joint_classifier.predict_proba(X_train)
        # Objects whose classes occurred in the training data.
        for i, row, word_probs in zip(train_indexes, X_train, probs):
            # TODO: revisit this loop (original note: "figure this out here").
            row = row_denser(row)
            row_ = [j for j in range(active_classes_number) if row[3*j] == 0.0]
            answer[i] = self._extract_word_probs(word_probs, row_)
        # Objects whose classes did not occur in the training data:
        # their classes are inherited from the base classifier.
        for other_index, (indices, probs) in zip(other_indexes, other_probs):
            indices = [cls_classes[j] for j in indices]
            answer[other_index] = (indices, probs)
    return answer
def fit(self, X, y, X_dev=None, y_dev=None):
    """Fit the paradigm classifier(s) on words ``X`` with label lists ``y``.

    Depending on ``self.has_letter_classifiers``, either one classifier per
    final letter is trained (with fallback probabilities for letters whose
    objects all share one class) or a single ``paradigm_classifier`` is fit
    on the whole data. A joint classifier is then fit on top in both modes.

    Parameters
    ----------
    X : list
        Training words / feature representations.
    y : list of lists
        For each object, the list of its class codes.
    X_dev, y_dev : optional
        Development data; not used in this method body — presumably consumed
        elsewhere (TODO confirm).

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or ``nfeatures`` is a float
        outside ``[0.0, 1.0]``.
    """
    if len(X) != len(y):
        raise ValueError("Data and labels should have equal length")
    self._prepare_parameters()
    self._prepare_classifiers()
    if isinstance(self.nfeatures, float):
        if self.nfeatures < 0.0 or self.nfeatures > 1.0:
            raise ValueError("If nfeatures is float, it should be from 0.0 to 1.0")
    # Choose whether classification is performed separately for each letter.
    if self.has_letter_classifiers:
        self.classes_, Y_new = np.unique(y, return_inverse=True)
        # Add the classes coming from paradigm_table that were absent in y.
        classes_set, self.classes_ = set(self.classes_), list(self.classes_)
        for code in self.paradigms_list.values():
            if code not in classes_set:
                self.classes_.append(code)
        self.classes_ = np.array(self.classes_)
        # Recode the classes: map each paradigm description to its position
        # in self.classes_.
        recoded_paradigm_table = {self.descrs_by_classes[label]: i
                                  for i, label in enumerate(self.classes_)}
        self.paradigm_classifier.set_params(paradigm_table=recoded_paradigm_table)
        data_indexes_by_letters =\
            arrange_indexes_by_last_letters(X, [len(labels) for labels in y])
        X_by_letters, y_by_letters = dict(), dict()
        # Letters whose objects all belong to a single class get no
        # classifier, only a stored default class.
        single_class_letters = dict()
        for letter, indexes in data_indexes_by_letters.items():
            X_curr, y_curr = [X[i] for i in indexes], [Y_new[i] for i in indexes]
            if min(y_curr) < max(y_curr):
                X_by_letters[letter] = X_curr
                y_by_letters[letter] = [[label] for label in y_curr]
            else:
                single_class_letters[letter] = y_curr[0]
        self.letter_classifiers_ = {letter: clone(self.paradigm_classifier)
                                    for letter in X_by_letters}
        self.joint_classifiers_ = {letter: clone(self.joint_classifier)
                                   for letter in X_by_letters}
        # Determine fallback probabilities for letters without classifiers.
        self._make_new_letter_probs(y)
        self._make_default_letter_probs(single_class_letters)
        for letter, X_curr in X_by_letters.items():
            self.letter_classifiers_[letter].fit(X_curr, y_by_letters[letter])
    else:
        self.paradigm_classifier.set_params(paradigm_table = self.paradigms_list)
        self.paradigm_classifier.fit(X, y)
        # self.paradigm_classifier already contained all the classes,
        # so there is nothing to add here.
        self.classes_ = self.paradigm_classifier.classes_
        self.active_classes_number = self.paradigm_classifier.active_classes_number
    # Paradigm handlers: one substitutor per class.
    self.paradigmers = [ParadigmSubstitutor(self.descrs_by_classes[label])
                        for label in self.classes_]
    # Lemma handlers.
    self._prepare_lemma_fragmentors()
    # Grammeme (form) probabilities, one zero-initialised vector per paradigm.
    self.form_probabilities_for_paradigms =\
        [np.zeros(shape=(paradigmer.unique_forms_number(return_principal=False),),
                  dtype=np.float64)
         for paradigmer in self.paradigmers]
    self.reverse_classes = {label: i for (i, label) in enumerate(self.classes_)}
    self._fit_probabilities(X, [[self.reverse_classes[code] for code in labels]
                                for labels in y])
    # Fit the joint classifier(s) on top of the base classifier(s).
    if self.has_letter_classifiers:
        for letter, X_curr in X_by_letters.items():
            cls = self.letter_classifiers_[letter]
            cls_classes = cls.classes_[:cls.active_classes_number]
            # NOTE(review): with y supplied, _prepare_to_joint_classifier
            # apparently returns a 3-tuple first — confirm against its def.
            (_, X_joint, y_joint), _ = self._prepare_to_joint_classifier(
                cls, cls_classes, X_curr, y_by_letters[letter])
            self.joint_classifiers_[letter].fit(X_joint, y_joint)
    else:
        cls = self.paradigm_classifier
        cls_classes = list(range(cls.active_classes_number))
        (_, X_joint, y_joint), _ =\
            self._prepare_to_joint_classifier(cls, cls_classes, X, y)
        self.joint_classifier.fit(X_joint, y_joint)
    return self