def train_model(cls, field: DocumentField, train_data_sets: List[List[dict]]) -> ClassifierModel:
    """Train a text-unit classifier for ``field`` and wrap it in a ClassifierModel.

    The training frame is built from all provided data sets, extended with
    "no field" text units (the negative class, index 0), capped per class to
    ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` rows (user-entered rows are kept
    first) and fed to a CountVectorizer -> TF-IDF -> linear SGD pipeline.

    Args:
        field: Document field the classifier is trained for.
        train_data_sets: Non-empty list of record lists. The first entry is
            the field's own data, the rest is transferred external data.
            Records are read via the ``value``, ``extraction_hint``,
            ``created_by`` and ``text_unit__textunittext__text`` columns.
            The list is not modified.

    Returns:
        An unsaved ClassifierModel carrying the trained model object and the
        in-sample / out-of-sample accuracy reports.

    Raises:
        IndexError: If ``train_data_sets`` is empty.
    """
    if not train_data_sets:
        # The previous implementation popped the first data set and therefore
        # failed with IndexError here; keep the exception type for callers.
        raise IndexError('train_data_sets must contain at least one data set')

    typed_field = TypedField.by(field)

    # DataFrame.append() was removed in pandas 2.0; pd.concat() is the
    # supported equivalent and does not require mutating the caller's list.
    df = pd.concat([pd.DataFrame.from_records(records) for records in train_data_sets])

    df['target_name'] = df.apply(
        lambda row: encode_category(
            field.code,
            row.value if typed_field.is_choice_field else None,
            row.extraction_hint),
        axis=1)
    # Real categories get indexes 1..k; index 0 is reserved for "no field".
    df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

    no_field_rows = [
        {'text_unit__textunittext__text': text}
        for text in cls.get_no_field_text_units(field.document_type, field.text_unit_type)
    ]
    if no_field_rows:
        df = pd.concat([df, pd.DataFrame(no_field_rows)])

    # Rows added above have no target / author columns: fill them in.
    df['target_index'] = df['target_index'].fillna(0).astype('int')
    df['target_name'] = df['target_name'].fillna(
        SkLearnClassifierModel.EMPTY_CAT_NAME).astype('str')
    df['user_input'] = df['created_by'].fillna(0).astype('bool')

    # Cap every class so that no single category dominates the training set;
    # when truncating, prefer rows entered by users over automatic ones.
    group_limit = settings.ML_TRAIN_DATA_SET_GROUP_LEN
    capped_groups = []
    for _group_index, group_df in df.groupby('target_index'):
        if group_df.shape[0] > group_limit:
            group_df = shuffle(
                group_df.sort_values('user_input', ascending=False)[:group_limit])
        capped_groups.append(group_df)
    res_df = shuffle(pd.concat(capped_groups) if capped_groups else pd.DataFrame())

    target_names = sorted(res_df['target_name'].unique())
    # Explicit labels keep classification_report() working when a test split
    # happens to miss one of the classes.
    # NOTE(review): names are sorted alphabetically while labels are sorted by
    # index; they line up only if EMPTY_CAT_NAME sorts before every encoded
    # category name - confirm.
    report_labels = sorted(res_df['target_index'].unique().tolist())

    text_clf = Pipeline([
        ('vect', CountVectorizer(strip_accents='unicode',
                                 analyzer='word',
                                 stop_words='english',
                                 tokenizer=word_position_tokenizer)),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge',
                              penalty='l2',
                              alpha=1e-3,
                              max_iter=5,
                              tol=None,
                              n_jobs=-1,
                              class_weight='balanced')),
    ])

    x = res_df['text_unit__textunittext__text']
    y = res_df['target_index']

    # Out-of-sample test set: 20% held out from training.
    x_train, x_test_os, y_train, y_test_os = train_test_split(
        x, y, test_size=0.2, random_state=42)
    # In-sample test set: 20% of the rows the model is actually trained on.
    _x_train, x_test_is, _y_train, y_test_is = train_test_split(
        x_train, y_train, test_size=0.2, random_state=42)

    sklearn_model = text_clf.fit(x_train, y_train)
    model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

    classifier_model = ClassifierModel()
    classifier_model.set_trained_model_obj(model)
    classifier_model.document_field = field

    predicted_os = text_clf.predict(x_test_os)
    predicted_is = text_clf.predict(x_test_is)

    classifier_model.classifier_accuracy_report_out_of_sample = classification_report(
        y_test_os, predicted_os, labels=report_labels, target_names=target_names)
    classifier_model.classifier_accuracy_report_in_sample = classification_report(
        y_test_is, predicted_is, labels=report_labels, target_names=target_names)

    return classifier_model
def train_document_field_detector_model(cls,
                                        log: ProcessLogger,
                                        field: DocumentField,
                                        train_data_project_ids: Optional[List],
                                        use_only_confirmed_field_values: bool = False) \
        -> Optional[ClassifierModel]:
    """Train a classifier predicting ``field`` from the values of the fields it depends on.

    Only choice fields are supported. Each training document is a dict of
    field code -> value; the value stored under ``field.code`` is the target
    and everything else is the feature row. Target values that are empty or
    not among the field's categories are mapped to the extra class index
    ``len(categories)``.

    Args:
        log: Logger used for progress and info messages.
        field: Choice field to train the detector for.
        train_data_project_ids: Forwarded to ``cls.get_train_values``.
        use_only_confirmed_field_values: Forwarded to ``cls.get_train_values``.

    Returns:
        A ClassifierModel holding the fitted pipeline, the categories, the
        feature names and both accuracy reports. It is returned to the
        caller; no explicit save call is made here.

    Raises:
        ValueError: If ``field`` is not a choice field.
        RuntimeError: If no training data is available.
    """
    typed_field = TypedField.by(field)  # type: TypedField
    # NOTE(review): 7 steps are declared but only 5 step_progress() calls
    # follow in this function - confirm whether callers report the rest.
    log.set_progress_steps_number(7)

    log.info('Training model for field #{0} ({1})...'.format(
        field.pk, field.code))

    # Classifier: values of dependencies -> value of this field
    # Field types supported: only choice fields
    if not isinstance(typed_field, ChoiceField):
        raise ValueError(
            'Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
            .format(field.code, field.uid, typed_field.type_code))

    # Lets find good values of depends-on fields suitable for using as train data.
    categories = cls.get_categories(field)
    train_data = cls.get_train_values(field, train_data_project_ids,
                                      set(categories),
                                      use_only_confirmed_field_values)

    if not train_data:
        raise RuntimeError(
            'Not enough train data for field {0} (#{1}). '
            'Need at least {2} approved or changed documents of type {3}.'.
            format(field.code, field.uid,
                   settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                   field.document_type.code))

    depends_on_code_types = cls.get_depends_on_code_type(field)
    depends_on_code_types = cls.remove_empty_fields(
        depends_on_code_types, train_data)

    pipeline, feature_names_funcs = cls.build_pipeline(
        field, depends_on_code_types)  # type: Pipeline, List[Callable]

    category_names_to_indexes = {c: i for i, c in enumerate(categories)}
    unknown_category_index = len(categories)

    log.step_progress()
    log.info(
        'Collecting feature rows from train and test documents in dict form...'
    )

    # When tried to use sklearn shuffling something went wrong, leaving manual methods for a while.
    random.shuffle(train_data)

    # TODO: use sklearn methods for splitting train/test data and shuffling
    test_size = 0.2

    train_feature_data = []
    train_target_data = []

    for doc_field_values in train_data:
        # pop() removes the target from the feature row and, unlike the
        # former get() + del pair, does not fail when the key is absent.
        field_value = doc_field_values.pop(field.code, None)

        field_value_idx = category_names_to_indexes.get(field_value) if field_value else None
        if field_value_idx is None:
            field_value_idx = unknown_category_index

        train_feature_data.append(doc_field_values)
        train_target_data.append(field_value_idx)

    # The first 20% of the shuffled rows are held out as the out-of-sample set.
    split_index = math.floor(test_size * len(train_data))

    test_oos_feature_data = train_feature_data[:split_index]
    test_oos_target_data = train_target_data[:split_index]

    train_feature_data = train_feature_data[split_index:]
    train_target_data = train_target_data[split_index:]

    # The in-sample check re-uses the complete training part.
    test_is_feature_data = train_feature_data
    test_is_target_data = train_target_data

    log.step_progress()
    log.info('Training the model...')
    model = pipeline.fit(train_feature_data, train_target_data)

    log.step_progress()
    log.info('Testing the model...')
    cm = ClassifierModel()
    cm.document_field = field

    # Pin the report rows to the known categories. Without explicit labels
    # classification_report() derives them from the data, which fails (or
    # mislabels rows) when a category is missing from the sample or when the
    # extra "unknown" index is present.
    report_labels = list(range(len(categories)))

    predicted_oos = pipeline.predict(test_oos_feature_data)
    cm.classifier_accuracy_report_out_of_sample = classification_report(
        test_oos_target_data, predicted_oos,
        labels=report_labels, target_names=categories)

    predicted_is = pipeline.predict(test_is_feature_data)
    cm.classifier_accuracy_report_in_sample = classification_report(
        test_is_target_data, predicted_is,
        labels=report_labels, target_names=categories)

    log.step_progress()
    log.info('Saving ClassifierModel instance...')

    feature_names = []
    for feature_names_func in feature_names_funcs:
        feature_names.extend(feature_names_func())

    cm.set_trained_model_obj({
        'model': model,
        'categories': categories,
        'feature_names': feature_names
    })

    log.step_progress()
    log.info('Finished.')
    return cm