コード例 #1
0
 def fit(self, data):
     """Train one joint paradigm classifier per problem.

     When ``self.fit_lm`` is set, a language model file is required: if
     ``self.lm_file`` is missing, a model is first saved to
     ``self.save_lm_file`` and that path is then used as ``self.lm_file``.

     Parameters
     ----------
     data : sequence of training items; ``item[2][0]`` of every item is
         used as language-model training text (presumably the word form;
         verify against the caller).

     Returns
     -------
     self

     Raises
     ------
     ValueError
         If ``self.fit_lm`` is set and neither ``self.lm_file`` nor
         ``self.save_lm_file`` is given.
     """
     if not self.fit_lm:
         joint_kwargs = dict(has_language_model=False,
                             has_joint_classifier=False)
     else:
         if self.lm_file is None:
             if self.save_lm_file is None:
                 raise ValueError(
                     "Either lm_file or save_lm_file should be given")
             lm_corpus = [list(item[2][0]) for item in data]
             save_language_model(lm_corpus, self.lm_order,
                                 self.save_lm_file)
             self.lm_file = self.save_lm_file
         joint_kwargs = dict(has_language_model=True,
                             lm_file=self.lm_file,
                             has_joint_classifier=True,
                             max_lm_coeff=2.0)
     self.affixes_remover.train(data)
     paradigm_data = self.make_paradigms_from_data(data)
     handler = TransformationsHandler(self.transform_codes)
     transformation_params = dict(
         select_features='ambiguity',
         selection_params=dict(nfeatures=0.25, min_count=2))
     self.classifiers = [None] * self.problems_number
     # Settings shared by all problems; 'paradigm_table' is deliberately
     # not set here (that assignment was disabled in the original code).
     base_params = get_classifier_params(self.language, self.task_type)
     base_params['min_feature_count'] = 3
     base_params['nfeatures'] = 0.1
     for idx, descr in enumerate(self.problem_codes):
         # Each problem gets its own copy carrying problem-specific affixes.
         problem_params = copy.copy(base_params)
         to_strip = self.affixes_remover.get_affixes(descr)
         prefixes, suffixes = to_strip
         problem_params['prefixes_to_remove'] = prefixes
         problem_params['suffixes_to_remove'] = suffixes
         self.classifiers[idx] = JointParadigmClassifier(
             self.transform_codes, problem_params, handler,
             transformation_params, **joint_kwargs)
     problems = arrange_data_by_problems(
         paradigm_data, self.problems_number, has_answers=True)
     for idx, (_, problem_X, problem_y) in enumerate(problems):
         # Progress report for every 20th classifier.
         if idx % 20 == 0:
             print("Classifier {} fitting...".format(idx + 1))
         # Every answer is wrapped into a one-element list of labels.
         self.classifiers[idx].fit(problem_X,
                                   [[label] for label in problem_y])
     return self
コード例 #2
0
 def fit(self, data):
     """Train one joint paradigm classifier per problem.

     All problems share the same ``ParadigmClassifier`` settings; the
     active settings are the ones labelled "Georgian" below, the other
     language-specific variants are kept as disabled alternatives.

     Parameters
     ----------
     data : training items accepted by ``self.make_paradigms_from_data``.

     Returns
     -------
     self
     """
     paradigm_data = self.make_paradigms_from_data(data)
     handler = TransformationsHandler(self.transform_codes)
     transformation_params = dict(
         select_features='ambiguity',
         selection_params=dict(nfeatures=0.25, min_count=2))
     self.classifiers = [None] * self.problems_number
     # GERMAN
     # classifier_params = {'paradigm_table': self.transform_codes, 'use_prefixes': True,
     #                      'max_prefix_length': 2, 'suffixes_to_delete': ['en'],
     #                      'has_letter_classifiers': False}
     # SPANISH
     # classifier_params = {'paradigm_table': self.transform_codes, 'use_prefixes': False}
     # ARABIC
     # classifier_params = {'paradigm_table': self.transform_codes, 'use_prefixes': True,
     #                      'max_prefix_length': 4, 'max_length': 4,
     #                      'has_letter_classifiers': None, 'to_memorize_affixes': 0}
     # FINNISH
     # classifier_params = {'paradigm_table': self.transform_codes, 'use_prefixes': False}
     # RUSSIAN
     # classifier_params = {'paradigm_table': self.transform_codes, 'use_prefixes': False,
     #                      'max_prefix_length': 3, 'suffixes_to_delete': ['ся', 'сь'],
     #                      'to_memorize_affixes': 2, 'has_letter_classifiers': False}
     # NAVAJO
     # classifier_params = {'paradigm_table': self.transform_codes, 'use_prefixes': True,
     #                      'max_prefix_length': 5, 'max_length': 3, 'has_letter_classifiers': 'prefix',
     #                      'to_memorize_affixes': -3}
     # TURKISH
     # classifier_params = {'paradigm_table': self.transform_codes, 'use_prefixes': False}
     # GEORGIAN (active), together with the feature-selection settings
     # that apply to every language.
     base_params = dict(
         paradigm_table=self.transform_codes,
         use_prefixes=True,
         max_prefix_length=4,
         has_letter_classifiers='suffix',
         to_memorize_affixes=0,
         min_feature_count=3,
         nfeatures=0.1)
     for idx in range(self.problems_number):
         # A fresh base classifier is built for every problem.
         base_classifier = ParadigmClassifier(**base_params)
         self.classifiers[idx] = JointParadigmClassifier(
             base_classifier, handler, {}, transformation_params)
     problems = arrange_data_by_problems(
         paradigm_data, self.problems_number, has_answers=True)
     for idx, (_, problem_X, problem_y) in enumerate(problems):
         # Every answer is wrapped into a one-element list of labels.
         wrapped_answers = [[label] for label in problem_y]
         self.classifiers[idx].fit(problem_X, wrapped_answers)
     return self
コード例 #3
0
 def fit(self, data):
     """Train two classifiers per problem: direct and reversed.

     "Direct" classifiers are trained on ``data`` as given, "reversed"
     classifiers on a copy of ``data`` whose leading fields are swapped
     (see ``reversed_data`` below). Judging by the original comments, the
     direct direction is lemma -> word form and the reversed one is
     word form -> lemma.

     When ``self.fit_lm`` is set, a language model file is required: if
     ``self.lm_file`` is missing, a model is first saved to
     ``self.save_lm_file``. The language-model settings are passed only to
     the reversed classifiers.

     Parameters
     ----------
     data : sequence of training items, indexed as ``elem[0]``,
         ``elem[1]``, ``elem[2][0]`` below (presumably tuples, since they
         are concatenated with a tuple; verify against the caller).

     Returns
     -------
     self

     Raises
     ------
     ValueError
         If ``self.fit_lm`` is set and neither ``self.lm_file`` nor
         ``self.save_lm_file`` is given.
     """
     if self.fit_lm:
         if self.lm_file is None:
             if self.save_lm_file is None:
                 raise ValueError(
                     "Either lm_file or save_lm_file should be given")
             # The language model is trained on the first field of each item.
             save_language_model([list(x[0]) for x in data], self.lm_order,
                                 self.save_lm_file)
             self.lm_file = self.save_lm_file
         joint_classifier_params =\
             {'has_language_model': True, 'lm_file': self.lm_file,
              'has_joint_classifier': True, 'max_lm_coeff': 2.0}
     else:
         joint_classifier_params =\
             {'has_language_model': False, 'has_joint_classifier': False}
     self._initialize_affix_removal_params()
     # Swap the roles of elem[0] and elem[2][0] to get the reversed task.
     # NOTE(review): `elem[2:]` starts at index 2, so the original elem[2]
     # is kept right after the three new leading fields -- confirm that
     # `elem[3:]` was not intended.
     reversed_data =\
         [((elem[2][0], elem[1], [elem[0]]) + elem[2:]) for elem in data]
     self.form_affix_remover.train(reversed_data)
     self.lemma_affix_remover.train(data)
     joint_data = self.make_paradigms_from_data(data)
     joint_reversed_data = self.make_paradigms_from_data(reversed_data)
     transformation_handler = TransformationsHandler(self.transform_codes)
     transformation_classifier_params = {
         'select_features': 'ambiguity',
         'selection_params': {
             'nfeatures': 0.25,
             'min_count': 2
         }
     }
     classifier_params = get_classifier_params(self.language)
     classifier_params['paradigm_table'] = self.transform_codes
     classifier_params['min_feature_count'] = 3
     classifier_params['nfeatures'] = 0.25
     # Reversed classifiers start from the same settings, but always use
     # prefixes and have max_length fixed to 6.
     reversed_classifier_params = copy.copy(classifier_params)
     reversed_classifier_params['max_length'] = 6
     if not reversed_classifier_params.get('use_prefixes', False):
         reversed_classifier_params['use_prefixes'] = True
         reversed_classifier_params['max_prefix_length'] = 3
     self.direct_classifiers = [None] * self.problems_number
     self.reversed_classifiers = [None] * self.problems_number
     data_by_problems = arrange_data_by_problems(joint_data,
                                                 self.problems_number,
                                                 has_answers=True)
     reversed_data_by_problems = arrange_data_by_problems(
         joint_reversed_data, self.problems_number, has_answers=True)
     for i, problem_descr in enumerate(self.problem_codes):
         # lemma -> word form classifiers
         direct_classifier_params = copy.copy(classifier_params)
         prefixes, suffixes =\
             self.lemma_affix_remover.get_affixes(problem_descr)
         direct_classifier_params['prefixes_to_remove'] = prefixes
         direct_classifier_params['suffixes_to_remove'] = suffixes
         # No language-model settings are passed to the direct classifier.
         self.direct_classifiers[i] = JointParadigmClassifier(
             self.transform_codes, direct_classifier_params,
             transformation_handler, transformation_classifier_params)
         _, curr_X, curr_y = data_by_problems[i]
         # Every answer is wrapped into a one-element list of labels.
         self.direct_classifiers[i].fit(curr_X,
                                        [[label] for label in curr_y])
         # NOTE(review): `i % 1 == 0` is always true, so this message is
         # printed for every problem (and again after the reversed fit).
         if i % 1 == 0:
             print("Classifiers {} fitted".format(i + 1))
         # word form -> lemma classifiers
         # we remove suffixes; this is especially important here (???)
         curr_classifier_params = copy.copy(reversed_classifier_params)
         prefixes, suffixes =\
             self.form_affix_remover.get_affixes(problem_descr)
         curr_classifier_params['prefixes_to_remove'] = prefixes
         curr_classifier_params['suffixes_to_remove'] = suffixes
         self.reversed_classifiers[i] = JointParadigmClassifier(
             self.transform_codes, curr_classifier_params,
             transformation_handler, transformation_classifier_params,
             **joint_classifier_params)
         _, curr_X, curr_y = reversed_data_by_problems[i]
         self.reversed_classifiers[i].fit(curr_X,
                                          [[label] for label in curr_y])
         if i % 1 == 0:
             print("Classifiers {} fitted".format(i + 1))
     return self
コード例 #4
0
 def _initialize(self):
     """Create the transformations handler and the working folder.

     Sets ``self.transformations_handler`` from ``self.paradigm_codes``
     and ``self.paradigm_counts`` and makes sure ``self.tmp_folder``
     exists.
     """
     self.transformations_handler = TransformationsHandler(
         self.paradigm_codes, self.paradigm_counts)
     # exist_ok avoids the race between checking for the folder and
     # creating it when several processes start at the same time.
     os.makedirs(self.tmp_folder, exist_ok=True)
コード例 #5
0
class LMParadigmClassifier(BaseEstimator, ClassifierMixin):
    """
    An attempt to classify paradigms with the help of language models.

    A language model is trained on sequences produced by the
    transformations handler and is then used to score a word's symbols
    followed by a candidate transformation code.

    Parameters
    ----------
    paradigm_codes, paradigm_counts :
        passed unchanged to ``TransformationsHandler``.
    lm_order : int, order passed to ``lmplz`` via ``-o``.
    lm_type : str, ``"kenlm"`` or ``"pynlpl"``; selects the class used to
        load the trained ARPA file and the scoring interface.
    multiclass : bool, stored on the instance (not used in this class).
    tmp_folder : str, folder for the training text and the ARPA file;
        created on construction if missing.
    """
    def __init__(self,
                 paradigm_codes,
                 paradigm_counts,
                 lm_order=3,
                 lm_type="kenlm",
                 multiclass=False,
                 tmp_folder="saved_models"):
        self.paradigm_codes = paradigm_codes
        self.paradigm_counts = paradigm_counts
        self.lm_order = lm_order
        self.lm_type = lm_type
        self.tmp_folder = tmp_folder
        self.multiclass = multiclass
        self.lm = None
        # Suffix of the file names used by fit(); it is never incremented
        # here, so repeated fits overwrite the same files.
        self.filename_count = 1
        self._initialize()

    def _initialize(self):
        """Create the transformations handler and the working folder."""
        self.transformations_handler = TransformationsHandler(
            self.paradigm_codes, self.paradigm_counts)
        # exist_ok avoids the race between checking for the folder and
        # creating it.
        os.makedirs(self.tmp_folder, exist_ok=True)

    def fit(self, X, y):
        """Train the language model and load it into ``self.lm``.

        Parameters
        ----------
        X : iterable of lemmas.
        y : iterable aligned with ``X``; every element is an iterable of
            ``(code, values)`` pairs for the corresponding lemma.

        Returns
        -------
        self

        Raises
        ------
        subprocess.CalledProcessError
            If ``lmplz`` exits with a non-zero status.
        """
        lemmas_with_codes_and_vars = list(
            chain.from_iterable([(lemma, code, values)
                                 for code, values in label]
                                for lemma, label in zip(X, y)))
        strings_for_lm_learning = \
            self.transformations_handler._extract_transformations_for_lm_learning(
                lemmas_with_codes_and_vars)
        self.infile = os.path.join(
            self.tmp_folder,
            "saved_models_{0}.sav".format(self.filename_count))
        # The training text may be non-ASCII (e.g. Cyrillic), so the
        # encoding is fixed instead of depending on the locale.
        with open(self.infile, "w", encoding="utf8") as fout:
            for seq in strings_for_lm_learning:
                fout.write(" ".join(map(str, seq)) + "\n")
        self.outfile = os.path.join(
            self.tmp_folder,
            "saved_models_{0}.arpa".format(self.filename_count))
        # NOTE(review): the path to lmplz is hard-coded to one machine.
        with open(self.infile, "r") as fin, open(self.outfile, "w") as fout:
            # check_call makes a failed lmplz run visible here instead of
            # leaving a broken ARPA file to fail later on loading.
            subprocess.check_call([
                "/data/sorokin/tools/kenlm/bin/lmplz", "-o",
                str(self.lm_order), "-S", "4G"
            ],
                                  stdin=fin,
                                  stdout=fout)
        if self.lm_type == "pynlpl":
            self.lm = ARPALanguageModel(self.outfile, base_e=False)
        elif self.lm_type == "kenlm":
            self.lm = Model(self.outfile)
        return self

    @property
    def transformation_codes(self):
        """``transformation_codes`` of the transformations handler."""
        return self.transformations_handler.transformation_codes

    @property
    def transformations_by_strings(self):
        """``transformations_by_strings`` of the transformations handler."""
        return self.transformations_handler.transformations_by_strings

    @property
    def transformations(self):
        """``transformations`` of the transformations handler."""
        return self.transformations_handler.transformations

    def get_best_continuous_score(self, word):
        """Find and print the best-scoring analysis of ``word``.

        Every split of ``word`` into a prefix and a suffix is tried: for
        each code registered for the suffix in
        ``transformations_by_strings`` the sequence
        ``prefix symbols + code + end of sentence`` is scored with the
        language model. The whole word followed by a code registered for
        the empty string is scored as well. Each new best candidate is
        printed, and finally the forms built from the best candidate are
        printed. Nothing is returned.

        Parameters
        ----------
        word : str, the word to analyse.
        """
        total_score = 0
        best_variant, best_score = None, -np.inf
        if self.lm_type == "kenlm":
            state = State()
            self.lm.BeginSentenceWrite(state)
        else:
            history = ('<s>', )
        for i, symbol in enumerate(word):
            prefix, suffix = word[:i], word[i:]
            for code in self.transformations_by_strings.get(suffix, []):
                if self.lm_type == "kenlm":
                    # per the original author's note, BaseScore modifies
                    # the state passed to it, so a copy is scored
                    curr_state, out_state = copy.copy(state), State()
                    code_score = self.lm.BaseScore(curr_state, str(code),
                                                   out_state)
                    end_score = self.lm.EndSentenceBaseScore(
                        out_state, State())
                elif self.lm_type == "pynlpl":
                    code_score = self.lm.scoreword(str(code), history)
                    # the history must hold the same string that was
                    # scored, as in the whole-word case below
                    new_history = history + (str(code), )
                    end_score = self.lm.scoreword('</s>', new_history)
                score = (total_score + code_score + end_score)  # / (i + 1)
                if score > best_score:
                    best_variant = list(prefix) + [code]
                    best_score = score
                    print("{3} {0} {1} {2:.2f}".format(
                        " ".join(prefix), self.transformations[code], score,
                        code))
            # advance the language model by the current symbol
            if self.lm_type == "kenlm":
                out_state = State()
                score = self.lm.BaseScore(state, symbol, out_state)
                state = out_state
            elif self.lm_type == "pynlpl":
                score = self.lm.scoreword(symbol, history)
                history += (symbol, )
            total_score += score
        # Whole word followed by a code registered for the empty string.
        # For kenlm, `state` is the state after the last symbol (or the
        # begin-of-sentence state for an empty word), so no variable that
        # exists only inside the loop above is needed here.
        for code in sorted(self.transformations_by_strings[""]):
            if self.lm_type == "kenlm":
                # score a copy so that every code starts from the same state
                curr_state, out_state = copy.copy(state), State()
                code_score = self.lm.BaseScore(curr_state, str(code),
                                               out_state)
                end_score = self.lm.EndSentenceBaseScore(out_state, State())
            elif self.lm_type == "pynlpl":
                code_score = self.lm.scoreword(str(code), history)
                new_history = history + (str(code), )
                end_score = self.lm.scoreword('</s>', new_history)
            score = (total_score + code_score + end_score)  # / (len(word) + 1)
            if score > best_score:
                best_variant, best_score = list(word) + [code], score
                print("{3} {0} {1} {2:.2f}".format(" ".join(word),
                                                   self.transformations[code],
                                                   score, code))
        answer = []
        for elem in best_variant:
            if isinstance(elem, str):
                # presumably 13 is the number of forms in `.trans`, so that
                # the columns line up in zip() below -- TODO confirm
                answer.append([elem] * 13)
            else:
                answer.append(self.transformations[elem].trans)
        print("#".join("".join(elem) for elem in zip(*answer)))

    def test(self):
        """
        Test the language model interface on a sample word.
        """
        word = "лыжня"
        self.get_best_continuous_score(word)
コード例 #6
0
def cv_mode(testing_mode, multiclass, predict_lemmas, find_flection,
            paradigm_file, infile, max_length, fraction, nfolds=0,
            selection_method=None, feature_fraction=None,
            output_train_dir=None, output_pred_dir=None):
    '''
    Evaluates the quality of the classifier with the given parameters
    by cross-validation on the training sample and prints the mean
    accuracies as one tab-separated line.

    Parameters:
    -----------
    testing_mode: str ('predict' or  'predict_proba'), mode of usage
    multiclass: bool, whether one word may have several paradigms
    predict_lemmas: bool, whether lemmas are additionally built from the
        true and the predicted labels and evaluated
    find_flection:  bool, whether a preliminary search for the inflection
        is performed in order to use suffixes of the stem, not of the
        whole word, as features. It turned out that this lowers quality
    paradigm_file: str, path to the file with paradigms
    infile: str, path to the file with the training sample
    max_length: passed to the classifier as 'max_length' and printed
        in the result line
    fraction: float, fraction of the data used for training
    nfolds: int, optional(default=0)
        number of splits to average over during cross-validation
        nfolds=0 --- the corresponding number of lexemes from the
                     beginning of the file forms the training sample
    selection_method: str or None, optional (default=None),
        feature selection method, None means 'ambiguity'
    feature_fraction: float or None, optional (default=None),
        fraction of features to keep during feature selection
        (nfeatures must not be set in this case)
    output_train_dir: str or None, optional(default=None),
        currently unused: the code that saved the training folds
        is disabled
    output_pred_dir: str or None, optional(default=None),
        destination for the classification results,
        with output_pred_dir=None nothing is saved

    Raises:
    -----------
    NotImplementedError, if testing_mode is 'predict_proba'
        and multiclass is true
    '''
    # read the input files
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(lemma_descriptions_list, multiclass=multiclass, return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, pattern_counts = read_paradigms(paradigm_descriptions_list)
    if predict_lemmas:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}
    else:
        test_lemmas, pred_lemmas = None, None
    # preparation for cross-validation
    classes = sorted(set(chain(*((x[0] for x in elem) for elem in labels_with_vars))))
    if selection_method is None:
        selection_method = 'ambiguity'
    if nfolds == 0:
        # a single split: the head of the file is the training sample
        train_data_length = int(fraction * len(data))
        train_data, test_data = [data[:train_data_length]], [data[train_data_length:]]
        train_labels_with_vars, test_labels_with_vars =\
            [labels_with_vars[:train_data_length]], [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        test_data, train_data = [None] * nfolds, [None] * nfolds
        test_labels_with_vars, train_labels_with_vars = [None] * nfolds, [None] * nfolds
        for fold in range(nfolds):
            train_data[fold], test_data[fold], train_labels_with_vars[fold], test_labels_with_vars[fold] =\
                skcv.train_test_split(data, labels_with_vars, test_size = 1.0 - fraction,
                                      random_state = 100 * fold + 13)
    if predict_lemmas:
        # Built for both kinds of split: previously this was done only for
        # nfolds != 0, so predict_lemmas with nfolds=0 failed with a
        # NameError when the accuracies were computed.
        test_lemmas = [make_lemmas(paradigm_handlers, fold_labels)
                       for fold_labels in test_labels_with_vars]
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    classes_by_cls = [None] * nfolds
    if predict_lemmas:
        pred_lemmas = [None] * nfolds
    # define the classifier
    # NOTE(review): paradigm_classifier is not used below; it was only
    # needed by a disabled CombinedParadigmClassifier variant.
    paradigm_classifier = ParadigmClassifier(paradigm_table)
    paradigm_classifier_params = {'multiclass': multiclass, 'find_flection': find_flection,
                                  'max_length': max_length, 'use_prefixes': True,
                                  'classifier_params': None, 'selection_method': selection_method,
                                  'nfeatures': feature_fraction, 'smallest_prob': 0.01}
    transformation_handler = TransformationsHandler(paradigm_table, pattern_counts)
    transformation_classifier_params = {'select_features': 'ambiguity',
                                        'selection_params': {'nfeatures': 0.1, 'min_count': 2}}
    cls = JointParadigmClassifier(paradigm_table, paradigm_classifier_params,
                                  transformation_handler, transformation_classifier_params)
    # apply the classifier to the data
    for i, (train_sample, train_labels_sample, test_sample, test_labels_sample) in\
            enumerate(zip(train_data, train_labels_with_vars, test_data, test_labels_with_vars)):
        cls.fit(train_sample, train_labels_sample)
        classes_by_cls[i] = cls.classes_
        if testing_mode == 'predict':
            predictions[i] = cls.predict(test_sample)
        elif testing_mode == 'predict_proba':
            prediction_probs[i] = cls.predict_probs(test_sample)
            # if probabilities were returned,
            # the classes still have to be extracted
            if not multiclass:
                predictions[i] = [[elem[0][0]] for elem in prediction_probs[i]]
            else:
                raise NotImplementedError()
        if predict_lemmas:
            pred_lemmas[i] = make_lemmas(paradigm_handlers, predictions[i])
    descrs_by_codes = {code: descr for descr, code in paradigm_table.items()}
    # words and probabilities are handed over only when results are saved
    if output_pred_dir:
        test_words = test_data
        if testing_mode == 'predict_proba':
            prediction_probs_for_output = prediction_probs
        else:
            prediction_probs_for_output = None
    else:
        test_words, prediction_probs_for_output = None, None
    # compute the accuracies and print their means in percent
    if not predict_lemmas:
        label_precisions, variable_precisions, form_precisions =\
            output_accuracies(classes, test_labels_with_vars, predictions, multiclass,
                              outfile=output_pred_dir, paradigm_descrs=descrs_by_codes,
                              test_words=test_words, predicted_probs=prediction_probs_for_output)
        print("{0}\t{1:<.2f}\t{2}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}".format(
            max_length, fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions), 100 * np.mean(variable_precisions),
            100 * np.mean(form_precisions)))
    else:
        label_precisions, variable_precisions, lemma_precisions, form_precisions =\
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, test_lemmas, pred_lemmas,
                              outfile=output_pred_dir, paradigm_descrs=descrs_by_codes,
                              test_words=test_words, predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0}\t{1:<.2f}\t{2}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}\t{6:<.2f}".format(
            max_length, fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions), 100 * np.mean(variable_precisions),
            100 * np.mean(lemma_precisions), 100 * np.mean(form_precisions)))
    return
コード例 #7
0
def cv_mode(testing_mode, language_code, multiclass, predict_lemmas,
            paradigm_file, counts_file, infile,
            train_fraction, feature_fraction, paradigm_counts_threshold,
            nfolds, selection_method, binarization_method, max_feature_length,
            output_train_dir=None, output_pred_dir=None):
    """
    Fit a joint paradigm classifier on train/test splits and report accuracies.

    The lemmas are read from ``infile``, the paradigms from ``paradigm_file``
    and the word counts from ``counts_file``. For every split the classifier
    is fitted on the training part and applied to the test part; the results
    of all splits are handed to ``output_accuracies`` and a tab-separated
    summary line with mean percentages is printed.

    Parameters
    ----------
    testing_mode : 'predict' or 'predict_proba'
        Which classifier method produces the predictions.
    language_code : passed to ``get_categories_marks``.
    multiclass : forwarded to ``read_lemmas``, the classifier and
        ``output_accuracies``; not supported together with 'predict_proba'.
    predict_lemmas : if true, lemmas are also built for the gold and the
        predicted labels and a lemma precision is reported.
    paradigm_file, counts_file, infile : input files (see above).
    train_fraction : share of the data used for training.
    feature_fraction : passed to the classifier as ``inner_feature_fraction``.
    paradigm_counts_threshold : paradigms with at least this count are
        passed to the classifier as active.
    nfolds : 0 means a single split that keeps the data order (the first
        ``train_fraction`` of the data is the training part); otherwise
        ``nfolds`` random splits are made with fixed seeds.
    selection_method : feature selection method, 'ambiguity' if None.
    binarization_method : forwarded to the classifier.
    max_feature_length : passed to the joint classifier as 'max_length'.
    output_train_dir : currently not used.
    output_pred_dir : passed to ``output_accuracies`` as ``outfile``; when
        set, test words, paradigm descriptions and (in 'predict_proba' mode)
        probabilities are passed as well.

    Raises
    ------
    ValueError
        If ``testing_mode`` is not one of the supported values.
    NotImplementedError
        If ``testing_mode`` is 'predict_proba' and ``multiclass`` is true.
    """
    # fail fast: with an unknown mode no predictions would ever be produced
    if testing_mode not in ('predict', 'predict_proba'):
        raise ValueError("testing_mode should be 'predict' or 'predict_proba', "
                         "got {0!r}".format(testing_mode))
    # extracting classes from probabilities is only implemented for the
    # single-label case, so there is no point in fitting anything first
    if testing_mode == 'predict_proba' and multiclass:
        raise NotImplementedError()
    lemma_descriptions_list = process_lemmas_file(infile)
    data, labels_with_vars = read_lemmas(lemma_descriptions_list,
                                         multiclass=multiclass, return_joint=True)
    paradigm_descriptions_list = process_codes_file(paradigm_file)
    paradigm_table, paradigm_counts = read_paradigms(paradigm_descriptions_list)
    word_counts_table = read_counts(counts_file)
    if predict_lemmas:
        paradigm_handlers = {code: ParadigmSubstitutor(descr)
                             for descr, code in paradigm_table.items()}
    # preparation for cross-validation
    classes = sorted({x[0] for elem in labels_with_vars for x in elem})
    active_paradigm_codes = [i for i, count in paradigm_counts.items()
                             if count >= paradigm_counts_threshold]
    # NOTE(review): marks is not used below; the line is kept only to preserve
    # the call to get_categories_marks -- confirm whether it can be dropped
    marks = [LEMMA_KEY] + [",".join(x) for x in get_categories_marks(language_code)]
    if selection_method is None:
        selection_method = 'ambiguity'
    if nfolds == 0:
        # a single split that keeps the original order of the data
        train_data_length = int(train_fraction * len(data))
        train_data, test_data = [data[:train_data_length]], [data[train_data_length:]]
        train_labels_with_vars = [labels_with_vars[:train_data_length]]
        test_labels_with_vars = [labels_with_vars[train_data_length:]]
        nfolds = 1
    else:
        train_data, test_data = [None] * nfolds, [None] * nfolds
        train_labels_with_vars, test_labels_with_vars = [None] * nfolds, [None] * nfolds
        for fold in range(nfolds):
            # the seed depends only on the fold number, so splits are reproducible
            (train_data[fold], test_data[fold],
             train_labels_with_vars[fold], test_labels_with_vars[fold]) =\
                skcv.train_test_split(data, labels_with_vars,
                                      test_size=1.0 - train_fraction,
                                      random_state=100 * fold + 13)
    predictions = [None] * nfolds
    prediction_probs = [None] * nfolds
    if predict_lemmas:
        # gold lemmas are needed for every kind of split, including nfolds == 0
        test_lemmas = [make_lemmas(paradigm_handlers, fold_labels)
                       for fold_labels in test_labels_with_vars]
        pred_lemmas = [None] * nfolds
    # set up the classifier
    paradigm_classifier = ParadigmCorporaClassifier(
        paradigm_table, word_counts_table,
        multiclass=multiclass, selection_method=selection_method,
        binarization_method=binarization_method,
        inner_feature_fraction=feature_fraction,
        active_paradigm_codes=active_paradigm_codes,
        paradigm_counts=paradigm_counts, smallest_prob=0.001)
    cls_params = {'max_length': max_feature_length}
    transformation_handler = TransformationsHandler(paradigm_table, paradigm_counts)
    transformation_classifier_params = {
        'select_features': 'ambiguity',
        'selection_params': {'nfeatures': 0.1, 'min_count': 2}
    }
    cls = JointParadigmClassifier(paradigm_classifier, transformation_handler,
                                  cls_params, transformation_classifier_params)
    # apply the classifier to every split
    for i, (train_sample, train_labels_sample, test_sample) in enumerate(
            zip(train_data, train_labels_with_vars, test_data)):
        cls.fit(train_sample, train_labels_sample)
        if testing_mode == 'predict':
            predictions[i] = cls.predict(test_sample)
        else:
            prediction_probs[i] = cls.predict_probs(test_sample)
            # probabilities were returned, so the classes
            # still have to be extracted from them
            predictions[i] = [[elem[0][0]] for elem in prediction_probs[i]]
        if predict_lemmas:
            pred_lemmas[i] = make_lemmas(paradigm_handlers, predictions[i])
    if output_pred_dir:
        descrs_by_codes = {code: descr for descr, code in paradigm_table.items()}
        test_words = test_data
        if testing_mode == 'predict_proba':
            prediction_probs_for_output = prediction_probs
        else:
            prediction_probs_for_output = None
    else:
        descrs_by_codes, test_words, prediction_probs_for_output = None, None, None
    if not predict_lemmas:
        label_precisions, variable_precisions, form_precisions =\
            output_accuracies(classes, test_labels_with_vars, predictions, multiclass,
                              outfile=output_pred_dir, paradigm_descrs=descrs_by_codes,
                              test_words=test_words,
                              predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0:<.2f}\t{1}\t{2:<.2f}\t{3:<.2f}\t{4:<.2f}".format(
            train_fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions), 100 * np.mean(variable_precisions),
            100 * np.mean(form_precisions)))
    else:
        label_precisions, variable_precisions, lemma_precisions, form_precisions =\
            output_accuracies(classes, test_labels_with_vars, predictions,
                              multiclass, test_lemmas, pred_lemmas,
                              outfile=output_pred_dir, paradigm_descrs=descrs_by_codes,
                              test_words=test_words,
                              predicted_probs=prediction_probs_for_output,
                              save_confusion_matrices=True)
        print("{0:<.2f}\t{1}\t{2:<.2f}\t{3:<.2f}\t{4:<.2f}\t{5:<.2f}".format(
            train_fraction, cls.paradigm_classifier.nfeatures,
            100 * np.mean(label_precisions), 100 * np.mean(variable_precisions),
            100 * np.mean(lemma_precisions), 100 * np.mean(form_precisions)))
コード例 #8
0
 def _initialize(self):
     """
     Create the transformations handler and make sure ``tmp_folder`` exists.
     """
     self.transformations_handler = TransformationsHandler(self.paradigm_codes,
                                                           self.paradigm_counts)
     # exist_ok avoids the race between checking for the folder and creating it
     os.makedirs(self.tmp_folder, exist_ok=True)
コード例 #9
0
class LMParadigmClassifier(BaseEstimator, ClassifierMixin):
    """
    Пытаемся классифицировать парадигмы с помощью языковых моделей
    """
    def __init__(self, paradigm_codes, paradigm_counts, lm_order=3,
                 lm_type="kenlm", multiclass=False, tmp_folder="saved_models"):
        """
        Store the settings and run ``_initialize``.

        All arguments are saved on the instance under their own names.
        """
        self.paradigm_codes, self.paradigm_counts = paradigm_codes, paradigm_counts
        self.lm_order, self.lm_type = lm_order, lm_type
        self.multiclass, self.tmp_folder = multiclass, tmp_folder
        # no language model is available until fit() has been called
        self.lm = None
        # index used in the names of the files written by fit()
        self.filename_count = 1
        self._initialize()

    def _initialize(self):
        """
        Create the transformations handler and make sure ``tmp_folder`` exists.
        """
        self.transformations_handler = TransformationsHandler(self.paradigm_codes,
                                                              self.paradigm_counts)
        # exist_ok avoids the race between checking for the folder and creating it
        os.makedirs(self.tmp_folder, exist_ok=True)


    def fit(self, X, y):
        """
        Train a language model on the sequences extracted from the data.

        The (lemma, code, values) triples built from ``X`` and ``y`` are turned
        into sequences by the transformations handler and written, one
        space-separated sequence per line, to
        ``<tmp_folder>/saved_models_<filename_count>.sav``. This file is piped
        through the external ``lmplz`` tool, whose output is saved to the
        matching ``.arpa`` file and then loaded according to ``lm_type``.

        The ``lmplz`` executable is taken from the ``LMPLZ_PATH`` environment
        variable when it is set, otherwise from the original default location.

        Parameters
        ----------
        X : iterable of lemmas.
        y : iterable of labels parallel to ``X``; every label is an iterable
            of (code, values) pairs.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``lm_type`` is neither "pynlpl" nor "kenlm".
        subprocess.CalledProcessError
            If ``lmplz`` exits with a non-zero status.
        """
        # reject an unknown backend before any file is written
        if self.lm_type not in ("pynlpl", "kenlm"):
            raise ValueError(
                "lm_type should be 'pynlpl' or 'kenlm', got {0!r}".format(self.lm_type))
        lemmas_with_codes_and_vars = [
            (lemma, code, values)
            for lemma, label in zip(X, y) for code, values in label]
        strings_for_lm_learning = \
            self.transformations_handler._extract_transformations_for_lm_learning(
                lemmas_with_codes_and_vars)
        self.infile = os.path.join(self.tmp_folder,
                                   "saved_models_{0}.sav".format(self.filename_count))
        # explicit encoding: the result must not depend on the platform locale
        with open(self.infile, "w", encoding="utf8") as fout:
            fout.writelines(" ".join(map(str, seq)) + "\n"
                            for seq in strings_for_lm_learning)
        self.outfile = os.path.join(self.tmp_folder,
                                    "saved_models_{0}.arpa".format(self.filename_count))
        lmplz_path = os.environ.get("LMPLZ_PATH",
                                    "/data/sorokin/tools/kenlm/bin/lmplz")
        # the child process reads and writes the file descriptors directly,
        # so the files are opened in binary mode
        with open(self.infile, "rb") as fin, open(self.outfile, "wb") as fout:
            # check_call: a failed run must not leave a broken model to be loaded
            subprocess.check_call([lmplz_path, "-o", str(self.lm_order), "-S", "4G"],
                                  stdin=fin, stdout=fout)
        if self.lm_type == "pynlpl":
            self.lm = ARPALanguageModel(self.outfile, base_e=False)
        else:
            self.lm = Model(self.outfile)
        return self

    @property
    def transformation_codes(self):
        """The ``transformation_codes`` attribute of the transformations handler."""
        handler = self.transformations_handler
        return handler.transformation_codes

    @property
    def transformations_by_strings(self):
        """The ``transformations_by_strings`` attribute of the transformations handler."""
        handler = self.transformations_handler
        return handler.transformations_by_strings

    @property
    def transformations(self):
        """The ``transformations`` attribute of the transformations handler."""
        handler = self.transformations_handler
        return handler.transformations

    def get_best_continuous_score(self, word):
        """
        Print the scores of all analyses of ``word`` and the best analysis.

        An analysis is a prefix of ``word`` followed by one transformation code
        registered in ``transformations_by_strings`` for the remaining suffix
        (the empty suffix is tried as well). Its score is the sum of the
        language model scores of the prefix symbols, of the code and of the
        end of sentence. Every candidate is printed; at the end the forms of
        the highest scoring analysis are printed joined by '#'.

        Nothing is returned. If no candidate exists, only the per-candidate
        lines (none) are printed.

        Raises
        ------
        ValueError
            If a score is requested and ``lm_type`` is neither "kenlm" nor
            "pynlpl".
        """
        context = self._start_lm_context()
        total_score = 0
        best_prefix, best_code, best_score = None, None, -np.inf
        for i, symbol in enumerate(word):
            prefix, suffix = word[:i], word[i:]
            for code in self.transformations_by_strings.get(suffix, []):
                code_score, end_score = self._score_code_and_end(code, context)
                score = total_score + code_score + end_score
                print("{3} {0} {1} {2:.2f}".format(
                        " ".join(prefix), self.transformations[code], score, code))
                if score > best_score:
                    best_prefix, best_code, best_score = prefix, code, score
            symbol_score, context = self._advance_lm_context(symbol, context)
            total_score += symbol_score
        # codes attached after the whole word (empty suffix)
        for code in sorted(self.transformations_by_strings.get("", [])):
            code_score, end_score = self._score_code_and_end(code, context)
            score = total_score + code_score + end_score
            print("{0} {1} {2:.2f}".format(" ".join(word),
                                           "#".join(self.transformations[code].trans), score))
            if score > best_score:
                best_prefix, best_code, best_score = word, code, score
        if best_code is None:
            # no transformation matched any suffix, nothing to show
            return
        forms = list(self.transformations[best_code].trans)
        # every prefix symbol is repeated once per form of the chosen
        # transformation, so that zip() glues symbol and form pieces together
        answer = [[symbol] * len(forms) for symbol in best_prefix] + [forms]
        print("#".join("".join(elem) for elem in zip(*answer)))

    def _start_lm_context(self):
        """Return the sentence-initial context: a kenlm state or a pynlpl history."""
        if self.lm_type == "kenlm":
            state = State()
            self.lm.BeginSentenceWrite(state)
            return state
        return ('<s>',)

    def _score_code_and_end(self, code, context):
        """Return the scores of ``code`` after ``context`` and of the sentence end after it."""
        if self.lm_type == "kenlm":
            # the original author notes that BaseScore changes its input
            # state, so the shared context is always scored through a copy
            in_state, out_state = copy.copy(context), State()
            code_score = self.lm.BaseScore(in_state, str(code), out_state)
            end_score = self.lm.EndSentenceBaseScore(out_state, State())
        elif self.lm_type == "pynlpl":
            code_score = self.lm.scoreword(str(code), context)
            # the history must hold the same token that was just scored
            end_score = self.lm.scoreword('</s>', context + (str(code),))
        else:
            raise ValueError(
                "lm_type should be 'pynlpl' or 'kenlm', got {0!r}".format(self.lm_type))
        return code_score, end_score

    def _advance_lm_context(self, symbol, context):
        """Return the score of ``symbol`` after ``context`` and the extended context."""
        if self.lm_type == "kenlm":
            out_state = State()
            score = self.lm.BaseScore(context, symbol, out_state)
            return score, out_state
        if self.lm_type == "pynlpl":
            score = self.lm.scoreword(symbol, context)
            return score, context + (symbol,)
        raise ValueError(
            "lm_type should be 'pynlpl' or 'kenlm', got {0!r}".format(self.lm_type))


    def test(self):
        """
        Try out the language model interface on one hard-coded word.
        """
        word = "лыжня"
        self.get_best_continuous_score(word)