예제 #1
0
def train_model(document_type: DocumentType, field: DocumentField, train_data_sets: List[dict]) -> ClassifierModel:
    """Train a text classifier for ``field`` and wrap it in a ``ClassifierModel``.

    The data sets are merged into one frame, every row gets a category name
    (via ``encode_category``) and a numeric class index, and text units
    returned by ``get_no_field_text_units`` are added as the "empty" class
    (index 0). Each class is capped at ``settings.ML_TRAIN_DATA_SET_GROUP_LEN``
    rows, keeping rows with a truthy ``created_by`` first, before a
    CountVectorizer -> TF-IDF -> SGD pipeline is fitted.

    :param document_type: document type stored on the resulting model and
        passed to ``get_no_field_text_units``.
    :param field: document field the classifier is trained for.
    :param train_data_sets: non-empty list of data sets; each item is passed to
        ``pd.DataFrame.from_records``. The resulting rows are read through the
        ``value``, ``extraction_hint``, ``created_by`` and ``text_unit__text``
        columns. The list itself is not modified.
    :return: an unsaved ``ClassifierModel`` holding the trained model, the
        field, the document type and both accuracy reports.
    :raises IndexError: if ``train_data_sets`` is empty.
    """
    # Build all frames first and concatenate once: this leaves the caller's
    # list untouched (the previous implementation pop()-ed its first element)
    # and avoids DataFrame.append, which was removed in pandas 2.0.
    frames = [pd.DataFrame.from_records(train_data) for train_data in train_data_sets]
    if not frames:
        # Same exception type the former ``train_data_sets.pop(0)`` produced.
        raise IndexError('train_data_sets must contain at least one data set')
    df = pd.concat(frames)

    # Only choice fields contribute their value to the category name.
    is_choice_field = field.is_choice_field()
    df['target_name'] = df.apply(lambda row: encode_category(
        field.pk,
        row.value if is_choice_field else None,
        row.extraction_hint), axis=1)

    # Real categories are numbered from 1; index 0 is reserved for the
    # "no field" rows added below.
    df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

    no_field_rows = [{'text_unit__text': text}
                     for text in get_no_field_text_units(document_type, field)]
    if no_field_rows:
        df = pd.concat([df, pd.DataFrame(no_field_rows)])

    # Rows added above have no target/created_by values yet - fill them in.
    df['target_index'] = df['target_index'].fillna(0).astype('int')
    df['target_name'] = df['target_name'].fillna(SkLearnClassifierModel.EMPTY_CAT_NAME).astype(
        'str')
    df['user_input'] = df['created_by'].fillna(0).astype('bool')

    # Cap every class at the configured size, preferring user-entered rows.
    max_group_len = settings.ML_TRAIN_DATA_SET_GROUP_LEN
    groups = []
    for _group_index, group_df in df.groupby('target_index'):
        if group_df.shape[0] > max_group_len:
            group_df = shuffle(
                group_df.sort_values('user_input', ascending=False)[:max_group_len])
        groups.append(group_df)
    res_df = pd.concat(groups) if groups else pd.DataFrame()
    res_df = shuffle(res_df)

    # NOTE(review): classification_report pairs target_names positionally with
    # the sorted class indexes. That matches target_index only if
    # EMPTY_CAT_NAME sorts before every encoded category name - confirm.
    target_names = sorted(res_df['target_name'].unique())

    text_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', analyzer='word',
                                                  stop_words='english',
                                                  tokenizer=word_position_tokenizer)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                               alpha=1e-3, max_iter=5, tol=None, n_jobs=-1,
                                               class_weight='balanced')),
                         ])
    x = res_df['text_unit__text']
    y = res_df['target_index']

    # Out-of-sample set: held out from fitting. In-sample set: a slice of the
    # data the model is actually fitted on.
    x_train, x_test_os, y_train, y_test_os = train_test_split(x, y, test_size=0.2, random_state=42)
    _x_train, x_test_is, _y_train, y_test_is = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

    sklearn_model = text_clf.fit(x_train, y_train)

    model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

    classifier_model = ClassifierModel()

    classifier_model.set_trained_model_obj(model)
    classifier_model.document_field = field
    classifier_model.document_type = document_type

    predicted_os = text_clf.predict(x_test_os)
    predicted_is = text_clf.predict(x_test_is)

    classifier_model.classifier_accuracy_report_out_of_sample = classification_report(y_test_os,
                                                                                      predicted_os,
                                                                                      target_names=target_names)
    classifier_model.classifier_accuracy_report_in_sample = classification_report(y_test_is,
                                                                                  predicted_is,
                                                                                  target_names=target_names)

    return classifier_model
    def train_document_field_detector_model(
        cls,
        log: ProcessLogger,
        document_type: DocumentType,
        field: DocumentField,
        train_data_project_ids: Optional[List],
        use_only_confirmed_field_values: bool = False
    ) -> Optional[ClassifierModel]:
        """Train a classifier predicting a choice field from other field values.

        Train data is either the ``field_values`` of documents in the given
        projects (capped at ``settings.ML_TRAIN_DATA_SET_GROUP_LEN``) or
        whatever ``cls.get_user_data`` returns. The data is shuffled, the
        first 20% is held out as the out-of-sample test set, and the pipeline
        from ``cls.build_pipeline`` is fitted on the rest.

        :param log: logger used for progress steps and info messages.
        :param document_type: document type stored on the resulting model.
        :param field: field to train for; its registered type adapter must be
            a ``ChoiceField``.
        :param train_data_project_ids: projects to take train data from; when
            falsy, ``cls.get_user_data`` is used instead.
        :param use_only_confirmed_field_values: when true, always use
            ``cls.get_user_data`` even if project ids are given.
        :return: an unsaved ``ClassifierModel`` with the trained model object
            and both accuracy reports.
        :raises ValueError: if the field is not a choice field.
        :raises RuntimeError: if no train data was found.
        """

        field_type_adapter = FIELD_TYPES_REGISTRY[
            field.type]  # type: FieldType

        # NOTE(review): 7 steps are announced but this method calls
        # log.step_progress() five times - confirm whether that is intended.
        log.set_progress_steps_number(7)
        log.info('Training model for field #{0} ({1})...'.format(
            field.pk, field.code))

        # Classifier: values of dependencies -> value of this field
        # Field types supported: only choice fields
        if not isinstance(field_type_adapter, ChoiceField):
            raise ValueError(
                'Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
                .format(field.code, field.uid, field_type_adapter.code))

        # Find values of the depends-on fields suitable for use as train data.
        if train_data_project_ids and not use_only_confirmed_field_values:
            train_data = list(
                Document.objects
                .filter(project_id__in=train_data_project_ids)
                .values_list('field_values', flat=True)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
        else:
            train_data = list(
                cls.get_user_data(document_type, field,
                                  train_data_project_ids))

        if not train_data:
            raise RuntimeError(
                'Not enough train data for field {0} (#{1}). '
                'Need at least {2} approved or changed documents of type {3}.'.
                format(field.code, field.uid,
                       settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                       document_type.code))

        depends_on_fields_types = cls.get_depends_on_uid_code_type(field)
        depends_on_fields_types = cls.remove_empty_fields(
            depends_on_fields_types, train_data)

        pipeline, feature_names_funcs = cls.build_pipeline(
            field, depends_on_fields_types)  # type: Pipeline, List[Callable]

        categories = sorted([c.strip() for c in field.choices.split('\n')])
        category_names_to_indexes = {c: i for i, c in enumerate(categories)}

        log.step_progress()
        log.info(
            'Collecting feature rows from train and test documents in dict form...'
        )

        #  When tried to use sklearn shuffling something went wrong, leaving manual methods for a while.
        random.shuffle(train_data)

        #  TODO: use sklearn methods for splitting train/test data and shuffling

        test_size = 0.2

        train_feature_data = []
        train_target_data = []

        # Index used for documents whose value is missing or not a known choice.
        unknown_category_idx = len(categories)

        for doc_field_values in train_data:
            # Remove the target from the feature dict so it cannot leak into
            # the features. pop() with a default tolerates documents that have
            # no value stored for this field (a plain ``del`` raised KeyError).
            field_value = doc_field_values.pop(field.uid, None)

            field_value_idx = category_names_to_indexes.get(
                field_value) if field_value else None
            if field_value_idx is None:
                field_value_idx = unknown_category_idx

            train_feature_data.append(doc_field_values)
            train_target_data.append(field_value_idx)

        # NOTE(review): for fewer than 5 documents this is 0 and both test
        # sets below are empty - confirm predict()/report handle that.
        is_index = math.floor(test_size * len(train_data))

        # Out-of-sample test set: the first ``is_index`` rows, excluded from fitting.
        test_oos_feature_data = train_feature_data[:is_index]
        test_oos_target_data = train_target_data[:is_index]

        train_feature_data = train_feature_data[is_index:]
        train_target_data = train_target_data[is_index:]

        # In-sample test set: the first ``is_index`` rows of the data used for fitting.
        test_is_feature_data = train_feature_data[:is_index]
        test_is_target_data = train_target_data[:is_index]

        log.step_progress()
        log.info('Training the model...')
        model = pipeline.fit(train_feature_data, train_target_data)

        log.step_progress()

        log.info('Testing the model...')
        cm = ClassifierModel()
        cm.document_type = document_type
        cm.document_field = field

        # NOTE(review): targets may contain the extra "unknown" index
        # len(categories), which has no entry in target_names - verify how the
        # installed scikit-learn version treats that mismatch.
        predicted_oos = pipeline.predict(test_oos_feature_data)
        cm.classifier_accuracy_report_out_of_sample = classification_report(
            test_oos_target_data, predicted_oos, target_names=categories)

        predicted_is = pipeline.predict(test_is_feature_data)
        cm.classifier_accuracy_report_in_sample = classification_report(
            test_is_target_data, predicted_is, target_names=categories)

        log.step_progress()
        log.info('Saving ClassifierModel instance...')

        # Each callable returned by build_pipeline yields a list of feature names.
        feature_names = []
        for f in feature_names_funcs:
            feature_names.extend(f())

        cm.set_trained_model_obj({
            'model': model,
            'categories': categories,
            'feature_names': feature_names
        })
        log.step_progress()
        log.info('Finished.')
        return cm