Esempio n. 1
0
    def train_model(cls, field: DocumentField, train_data_sets: List[List[dict]]) -> ClassifierModel:
        """
        Train a text-unit classifier for ``field`` and wrap it into a ClassifierModel.

        :param field: document field the model is trained for.
        :param train_data_sets: non-empty list of record lists. The first entry is the base data set,
            the remaining entries are appended as transferred external data. Records are read by the
            columns ``value``, ``extraction_hint``, ``created_by`` and
            ``text_unit__textunittext__text``. The list itself is only read, never modified.
        :return: new ClassifierModel instance carrying the trained model object, the field and the
            out-of-sample / in-sample classification reports.
        :raises IndexError: if ``train_data_sets`` is empty.
        """
        typed_field = TypedField.by(field)

        # Index instead of pop(0) so the caller's list is left intact.
        frames = [pd.DataFrame.from_records(train_data_sets[0])]
        # add transferred external data
        frames.extend(pd.DataFrame.from_records(train_data) for train_data in train_data_sets[1:])
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df = pd.concat(frames)

        # The value only takes part in the category for choice fields.
        df['target_name'] = df.apply(lambda row: encode_category(
            field.code,
            row.value if typed_field.is_choice_field else None,
            row.extraction_hint), axis=1)

        # Category indexes start at 1; index 0 is reserved for the "no field" rows added below.
        df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

        no_field_rows = [{'text_unit__textunittext__text': text}
                         for text in cls.get_no_field_text_units(field.document_type, field.text_unit_type)]
        if no_field_rows:
            df = pd.concat([df, pd.DataFrame(no_field_rows)])

        # Rows without a target (the "no field" text units) fall into the empty category / index 0.
        df['target_index'] = df['target_index'].fillna(0).astype('int')
        df['target_name'] = df['target_name'].fillna(SkLearnClassifierModel.EMPTY_CAT_NAME).astype(
            'str')
        df['user_input'] = df['created_by'].fillna(0).astype('bool')

        # Cap every category at ML_TRAIN_DATA_SET_GROUP_LEN rows; rows having a non-empty
        # "created_by" are sorted first and therefore kept preferentially when a group is cut.
        group_limit = settings.ML_TRAIN_DATA_SET_GROUP_LEN
        groups = []
        for _group_index, group_df in df.groupby('target_index'):
            if group_df.shape[0] > group_limit:
                group_df = shuffle(
                    group_df.sort_values('user_input', ascending=False)[:group_limit])
            groups.append(group_df)
        # Single concat instead of appending to a growing frame on every iteration.
        res_df = shuffle(pd.concat(groups))

        # NOTE(review): the names are ordered alphabetically while the report labels are ordered by
        # index; this lines up only if EMPTY_CAT_NAME sorts before every encoded category - confirm.
        target_names = sorted(res_df['target_name'].unique())
        report_labels = sorted(res_df['target_index'].unique())

        text_clf = Pipeline([('vect', CountVectorizer(strip_accents='unicode', analyzer='word',
                                                      stop_words='english',
                                                      tokenizer=word_position_tokenizer)),
                             ('tfidf', TfidfTransformer()),
                             ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                                   alpha=1e-3, max_iter=5, tol=None, n_jobs=-1,
                                                   class_weight='balanced')),
                             ])
        x = res_df['text_unit__textunittext__text']
        y = res_df['target_index']

        # Out-of-sample set: 20% held back from training.
        x_train, x_test_os, y_train, y_test_os = train_test_split(x, y, test_size=0.2, random_state=42)
        # In-sample set: a 20% sample of the rows the model is fitted on.
        _, x_test_is, _, y_test_is = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

        sklearn_model = text_clf.fit(x_train, y_train)

        model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

        classifier_model = ClassifierModel()

        classifier_model.set_trained_model_obj(model)
        classifier_model.document_field = field

        predicted_os = text_clf.predict(x_test_os)
        predicted_is = text_clf.predict(x_test_is)

        # Explicit labels keep the report working when a test split happens to miss a category;
        # without them the number of inferred classes would not match len(target_names).
        classifier_model.classifier_accuracy_report_out_of_sample = classification_report(
            y_test_os, predicted_os, labels=report_labels, target_names=target_names)
        classifier_model.classifier_accuracy_report_in_sample = classification_report(
            y_test_is, predicted_is, labels=report_labels, target_names=target_names)

        return classifier_model
Esempio n. 2
0
    def train_document_field_detector_model(cls,
                                            log: ProcessLogger,
                                            field: DocumentField,
                                            train_data_project_ids: Optional[List],
                                            use_only_confirmed_field_values: bool = False) \
            -> Optional[ClassifierModel]:
        """
        Train a classifier predicting the value of a choice field from the values of the fields
        it depends on.

        :param log: logger receiving progress steps and info messages.
        :param field: field to train the model for; must resolve to a ChoiceField.
        :param train_data_project_ids: passed through to ``get_train_values``.
        :param use_only_confirmed_field_values: passed through to ``get_train_values``.
        :return: new ClassifierModel instance holding the field, both classification reports and
            a dict with the fitted pipeline, the categories and the feature names.
        :raises ValueError: if the field is not a choice field.
        :raises RuntimeError: if no train data was found.
        """

        typed_field = TypedField.by(field)  # type: TypedField

        # NOTE(review): 7 steps are announced here but step_progress() is called 5 times below.
        log.set_progress_steps_number(7)
        log.info('Training model for field #{0} ({1})...'.format(
            field.pk, field.code))

        # Classifier: values of dependencies -> value of this field
        # Field types supported: only choice fields
        if not isinstance(typed_field, ChoiceField):
            raise ValueError(
                'Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
                .format(field.code, field.uid, typed_field.type_code))
        # Lets find good values of depends-on fields suitable for using as train data.

        categories = cls.get_categories(field)

        train_data = cls.get_train_values(field, train_data_project_ids,
                                          set(categories),
                                          use_only_confirmed_field_values)

        if not train_data:
            raise RuntimeError(
                'Not enough train data for field {0} (#{1}). '
                'Need at least {2} approved or changed documents of type {3}.'.
                format(field.code, field.uid,
                       settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                       field.document_type.code))

        depends_on_code_types = cls.get_depends_on_code_type(field)
        depends_on_code_types = cls.remove_empty_fields(
            depends_on_code_types, train_data)

        pipeline, feature_names_funcs = cls.build_pipeline(
            field, depends_on_code_types)  # type: Pipeline, List[Callable]

        category_names_to_indexes = {c: i for i, c in enumerate(categories)}
        # Target index used for rows whose value is empty or not one of the categories.
        no_value_idx = len(categories)

        log.step_progress()
        log.info(
            'Collecting feature rows from train and test documents in dict form...'
        )

        #  When tried to use sklearn shuffling something went wrong, leaving manual methods for a while.
        random.shuffle(train_data)

        #  TODO: use sklearn methods for splitting train/test data and shuffling

        test_size = 0.2

        train_feature_data = list()
        train_target_data = list()

        for doc_field_values in train_data:
            # Take the target value out of the row so it is not used as a feature.
            # pop() with a default tolerates rows that do not contain the field at all.
            field_value = doc_field_values.pop(field.code, None)

            field_value_idx = category_names_to_indexes.get(
                field_value) if field_value else None

            if field_value_idx is None:
                field_value_idx = no_value_idx

            train_feature_data.append(doc_field_values)
            train_target_data.append(field_value_idx)

        # Labels/names for the reports are fixed up front so that they stay aligned with the
        # target indexes even when a split misses a category. The extra "no value" class is
        # reported only if it actually occurs in the data.
        report_labels = list(range(no_value_idx))
        report_names = list(categories)
        if no_value_idx in train_target_data:
            report_labels.append(no_value_idx)
            report_names.append('<no value>')

        is_index = math.floor(test_size * len(train_data))

        # The first 20% of the shuffled rows are held back as the out-of-sample test set.
        test_oos_feature_data = train_feature_data[:is_index]
        test_oos_target_data = train_target_data[:is_index]

        train_feature_data = train_feature_data[is_index:]
        train_target_data = train_target_data[is_index:]

        # The in-sample test set is the complete training set.
        test_is_feature_data = train_feature_data  # [:is_index]
        test_is_target_data = train_target_data  # [:is_index]

        log.step_progress()
        log.info('Training the model...')
        model = pipeline.fit(train_feature_data, train_target_data)

        log.step_progress()

        log.info('Testing the model...')
        cm = ClassifierModel()
        cm.document_field = field

        predicted_oos = pipeline.predict(test_oos_feature_data)
        cm.classifier_accuracy_report_out_of_sample = classification_report(
            test_oos_target_data, predicted_oos,
            labels=report_labels, target_names=report_names)

        predicted_is = pipeline.predict(test_is_feature_data)
        cm.classifier_accuracy_report_in_sample = classification_report(
            test_is_target_data, predicted_is,
            labels=report_labels, target_names=report_names)

        log.step_progress()
        log.info('Saving ClassifierModel instance...')

        feature_names = []
        for f in feature_names_funcs:
            feature_names.extend(f())

        cm.set_trained_model_obj({
            'model': model,
            'categories': categories,
            'feature_names': feature_names
        })
        log.step_progress()
        log.info('Finished.')
        return cm