Example #1
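These snippets are excerpted from a larger codebase, so their import lines are not shown. A plausible set of the third-party imports they rely on is sketched below; project-internal names such as ProcessLogger, DocumentField, ClassifierModel, SkLearnClassifierModel, TypedField, encode_category, word_position_tokenizer, get_no_field_text_units and settings come from that codebase and are not covered here.

import math
import random
from typing import Callable, Iterable, List, Optional

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle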
    @classmethod
    def train_model(
        cls,
        log: ProcessLogger,
        field: DocumentField,
        train_data_sets: List[List[dict]],
        split_and_log_out_of_sample_test_report: bool = False
    ) -> ClassifierModel:
        typed_field = TypedField.by(field)
        df = pd.DataFrame.from_records(train_data_sets.pop(0))
        # add transferred external data
        for train_data in train_data_sets:
            df = df.append(pd.DataFrame.from_records(train_data))

        df['target_name'] = df.apply(
            lambda row: encode_category(
                field.code,
                row.value if typed_field.is_choice_field else None,
                row.extraction_hint),
            axis=1)

        df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

        df = df.append([
            {'text_unit__textunittext__text': i}
            for i in cls.get_no_field_text_units(field.document_type,
                                                 field.text_unit_type)
        ])

        df['target_index'] = df['target_index'].fillna(0).astype('int')
        df['target_name'] = df['target_name'].fillna(
            SkLearnClassifierModel.EMPTY_CAT_NAME).astype('str')
        df['user_input'] = df['modified_by'].fillna(0).astype('bool')

        res_df = pd.DataFrame()

        for group_index, group_df in df.groupby('target_index'):
            if group_df.shape[0] > settings.ML_TRAIN_DATA_SET_GROUP_LEN:
                group_df = shuffle(
                    group_df.sort_values('user_input', ascending=False)
                    [:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
            res_df = res_df.append(group_df)
        res_df = shuffle(res_df)

        target_names = sorted(res_df['target_name'].unique())

        if field.classifier_init_script:
            try:
                clf = cls.init_classifier(field)
            except Exception as e:
                log.error(
                    f'Unable to initialize classifier for field {field.code}. '
                    f'Classifier init script: {field.classifier_init_script}',
                    exc_info=e)
                # re-raise so that `clf` is never used uninitialized below
                raise
        else:
            clf = SGDClassifier(loss='hinge',
                                penalty='l2',
                                alpha=1e-3,
                                max_iter=5,
                                tol=None,
                                n_jobs=-1,
                                class_weight='balanced')

        log.info(f'Classifier initialized: {clf}')

        text_clf = Pipeline([
            ('vect',
             CountVectorizer(strip_accents='unicode',
                             analyzer='word',
                             stop_words='english',
                             tokenizer=word_position_tokenizer)),
            ('tfidf', TfidfTransformer()),
            ('clf', clf),
        ])
        x = res_df['text_unit__textunittext__text']
        y = res_df['target_index']

        if split_and_log_out_of_sample_test_report:
            x_train, x_test_os, y_train, y_test_os = train_test_split(
                x, y, test_size=0.2, random_state=42)
        else:
            x_train, x_test_os, y_train, y_test_os = x, None, y, None

        sklearn_model = text_clf.fit(x_train, y_train)

        model = SkLearnClassifierModel(sklearn_model=sklearn_model,
                                       target_names=target_names)

        classifier_model = ClassifierModel()
        classifier_model.set_trained_model_obj(model)
        classifier_model.document_field = field

        classifier_model.classifier_accuracy_report_in_sample = \
            classification_report(y,
                                  text_clf.predict(x),
                                  target_names=target_names)

        if y_test_os is not None and x_test_os is not None:
            classifier_model.classifier_accuracy_report_out_of_sample = \
                classification_report(y_test_os,
                                      text_clf.predict(x_test_os),
                                      target_names=target_names)

        return classifier_model
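
Two steps above are easy to miss: factorize(sort=True)[0] + 1 encodes category names as integers while reserving index 0 for the "no field" rows, and the groupby loop caps each category at settings.ML_TRAIN_DATA_SET_GROUP_LEN rows while preferring user-confirmed ones. A minimal, self-contained sketch of those two steps (the column names and the cap value are illustrative only):

import pandas as pd
from sklearn.utils import shuffle

MAX_PER_CATEGORY = 3  # stand-in for settings.ML_TRAIN_DATA_SET_GROUP_LEN

df = pd.DataFrame({
    'target_name': ['duration', 'duration', 'duration', 'duration', 'amount', None],
    'user_input': [True, False, False, True, False, False],
})

# factorize(sort=True) yields -1 for missing names, so "+ 1" reserves index 0
# for rows that belong to no category
df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

capped = pd.DataFrame()
for _, group_df in df.groupby('target_index'):
    if group_df.shape[0] > MAX_PER_CATEGORY:
        # keep user-confirmed rows first, truncate, then shuffle within the group
        group_df = shuffle(
            group_df.sort_values('user_input', ascending=False)[:MAX_PER_CATEGORY])
    capped = pd.concat([capped, group_df])
capped = shuffle(capped)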

    @classmethod
    def train_document_field_detector_model(cls,
                                            log: ProcessLogger,
                                            field: DocumentField,
                                            train_data_project_ids: Optional[List],
                                            use_only_confirmed_field_values: bool = False,
                                            train_documents: Iterable[Document] = None) \
            -> Optional[ClassifierModel]:

        field_type_adapter = FIELD_TYPES_REGISTRY[
            field.type]  # type: FieldType

        log.set_progress_steps_number(7)
        log.info('Training model for field #{0} ({1})...'.format(
            field.pk, field.code))

        # Classifier: values of dependencies -> value of this field
        # Field types supported: only choice fields
        if not isinstance(field_type_adapter, ChoiceField):
            raise ValueError(
                'Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
                .format(field.code, field.uid, field_type_adapter.code))
        # Let's find values of the depends-on fields that are suitable for use as training data.

        if train_documents:
            train_data = list(train_documents.values_list('field_values', flat=True)) \
                if hasattr(train_documents, 'values_list') \
                else [doc.field_values for doc in train_documents]
        elif train_data_project_ids and not use_only_confirmed_field_values:
            train_data = list(
                Document.objects.filter(project_id__in=train_data_project_ids).
                order_by('id').values_list(
                    'field_values',
                    flat=True)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
        else:
            train_data = list(cls.get_user_data(field, train_data_project_ids))

        if not train_data:
            raise RuntimeError(
                'Not enough train data for field {0} (#{1}). '
                'Need at least {2} approved or changed documents of type {3}.'.
                format(field.code, field.uid,
                       settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                       field.document_type.code))

        depends_on_fields_types = cls.get_depends_on_uid_code_type(field)
        depends_on_fields_types = cls.remove_empty_fields(
            depends_on_fields_types, train_data)

        pipeline, feature_names_funcs = cls.build_pipeline(
            field, depends_on_fields_types)  # type: Pipeline, List[Callable]

        categories = cls.get_categories(field)
        category_names_to_indexes = {c: i for i, c in enumerate(categories)}

        log.step_progress()
        log.info(
            'Collecting feature rows from train and test documents in dict form...'
        )

        #  Using sklearn's shuffling caused problems here, so manual shuffling and splitting are kept for now.
        random.shuffle(train_data)

        #  TODO: use sklearn methods for splitting train/test data and shuffling

        test_size = 0.2

        train_feature_data = list()
        train_target_data = list()

        for doc_field_values in train_data:
            # pop() both reads and removes the target value; a plain del would raise
            # KeyError for documents where this field has no value
            field_value = doc_field_values.pop(field.uid, None)

            field_value_idx = category_names_to_indexes.get(
                field_value) if field_value else None

            if field_value_idx is None:
                field_value_idx = len(categories)

            train_feature_data.append(doc_field_values)
            train_target_data.append(field_value_idx)

        # boundary index: the first `test_size` share of the shuffled data becomes the out-of-sample test set
        is_index = math.floor(test_size * len(train_data))

        test_oos_feature_data = train_feature_data[:is_index]
        test_oos_target_data = train_target_data[:is_index]

        train_feature_data = train_feature_data[is_index:]
        train_target_data = train_target_data[is_index:]

        test_is_feature_data = train_feature_data  # [:is_index]
        test_is_target_data = train_target_data  # [:is_index]

        log.step_progress()
        log.info('Training the model...')
        model = pipeline.fit(train_feature_data, train_target_data)

        log.step_progress()

        log.info('Testing the model...')
        cm = ClassifierModel()
        cm.document_field = field

        predicted_oos = pipeline.predict(test_oos_feature_data)
        cm.classifier_accuracy_report_out_of_sample = classification_report(
            test_oos_target_data, predicted_oos, target_names=categories)

        predicted_is = pipeline.predict(test_is_feature_data)
        cm.classifier_accuracy_report_in_sample = classification_report(
            test_is_target_data, predicted_is, target_names=categories)

        log.step_progress()
        log.info('Saving ClassifierModel instance...')

        feature_names = []
        for f in feature_names_funcs:
            feature_names.extend(f())

        cm.set_trained_model_obj({
            'model': model,
            'categories': categories,
            'feature_names': feature_names
        })
        log.step_progress()
        log.info('Finished.')
        return cm
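
The TODO above asks for sklearn-based splitting and shuffling instead of random.shuffle plus manual slicing. A minimal sketch of what that replacement could look like, reusing the variable names from the method above:

from sklearn.model_selection import train_test_split

# one call shuffles and splits both the feature dicts and the targets;
# stratify=train_target_data could additionally keep per-category proportions
(train_feature_data, test_oos_feature_data,
 train_target_data, test_oos_target_data) = train_test_split(
    train_feature_data, train_target_data,
    test_size=0.2, random_state=42)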

def train_model(field: DocumentField, train_data_sets: List[List[dict]]) -> ClassifierModel:
    df = pd.DataFrame.from_records(train_data_sets.pop(0))
    # add transferred external data
    for train_data in train_data_sets:
        df = df.append(pd.DataFrame.from_records(train_data))

    df['target_name'] = df.apply(lambda row: encode_category(
        field.pk,
        row.value if field.is_choice_field() else None,
        row.extraction_hint), axis=1)

    df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

    df = df.append([
        {'text_unit__text': i}
        for i in get_no_field_text_units(field.document_type, field.text_unit_type)
    ])

    df['target_index'] = df['target_index'].fillna(0).astype('int')
    df['target_name'] = df['target_name'].fillna(
        SkLearnClassifierModel.EMPTY_CAT_NAME).astype('str')
    df['user_input'] = df['created_by'].fillna(0).astype('bool')

    res_df = pd.DataFrame()

    for group_index, group_df in df.groupby('target_index'):
        if group_df.shape[0] > settings.ML_TRAIN_DATA_SET_GROUP_LEN:
            group_df = shuffle(
                group_df.sort_values('user_input', ascending=False)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
        res_df = res_df.append(group_df)
    res_df = shuffle(res_df)

    target_names = sorted(res_df['target_name'].unique())

    text_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', analyzer='word',
                                                  stop_words='english',
                                                  tokenizer=word_position_tokenizer)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, max_iter=5, tol=None, n_jobs=-1,
                                               class_weight='balanced')),
                         ])
    x = res_df['text_unit__text']
    y = res_df['target_index']

    # hold out 20% of the data for the out-of-sample report
    x_train, x_test_os, y_train, y_test_os = train_test_split(x, y, test_size=0.2, random_state=42)
    # take a further 20% slice of the training data for the in-sample report;
    # the model is still fit on all of x_train below, so this slice stays in-sample
    _x_train, x_test_is, _y_train, y_test_is = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

    sklearn_model = text_clf.fit(x_train, y_train)

    model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

    classifier_model = ClassifierModel()

    classifier_model.set_trained_model_obj(model)
    classifier_model.document_field = field

    predicted_os = text_clf.predict(x_test_os)
    predicted_is = text_clf.predict(x_test_is)

    classifier_model.classifier_accuracy_report_out_of_sample = classification_report(y_test_os,
                                                                                      predicted_os,
                                                                                      target_names=target_names)
    classifier_model.classifier_accuracy_report_in_sample = classification_report(y_test_is,
                                                                                  predicted_is,
                                                                                  target_names=target_names)

    return classifier_model
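
For a quick sanity check of the text pipeline used in these examples, here is a runnable toy version of the same CountVectorizer -> TfidfTransformer -> SGDClassifier chain; word_position_tokenizer is project-specific, so the default tokenizer is used, and the texts and labels below are made up:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

texts = [
    'term of this agreement is five years',
    'the term shall continue for another two years',
    'the initial term expires after three years',
    'payment of one million dollars is due at closing',
    'the purchase price equals two million dollars',
    'total consideration is four million dollars',
]
labels = [0, 0, 0, 1, 1, 1]  # 0 = duration-like, 1 = amount-like (made-up classes)

text_clf = Pipeline([
    ('vect', CountVectorizer(strip_accents='unicode', analyzer='word',
                             stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          max_iter=5, tol=None, class_weight='balanced')),
])
text_clf.fit(texts, labels)
print(classification_report(labels, text_clf.predict(texts),
                            target_names=['duration', 'amount']))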