def train_model(cls,
                log: ProcessLogger,
                field: DocumentField,
                train_data_sets: List[List[dict]],
                split_and_log_out_of_sample_test_report: bool = False) -> ClassifierModel:
    """Train a text-unit classifier for ``field`` and wrap it in a ClassifierModel.

    The record sets in ``train_data_sets`` are merged into one data frame, every
    row gets a category name (``encode_category``) and a 1-based category index,
    and the text units returned by ``cls.get_no_field_text_units`` are added as
    category index 0 / ``SkLearnClassifierModel.EMPTY_CAT_NAME``. Each category
    is capped at ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` rows before a
    CountVectorizer -> TfidfTransformer -> classifier pipeline is fitted.

    :param log: logger receiving progress and error messages.
    :param field: field the model is trained for.
    :param train_data_sets: record sets; the first one is the base set, the
        remaining ones are appended to it. The list is not modified.
    :param split_and_log_out_of_sample_test_report: when True, 20% of the rows
        are held out of training and used for an out-of-sample report.
    :return: unsaved ClassifierModel carrying the trained model and reports.
    :raises IndexError: if ``train_data_sets`` is empty.
    :raises Exception: whatever ``cls.init_classifier`` raises when the field's
        classifier init script cannot be evaluated (logged, then re-raised).
    """
    typed_field = TypedField.by(field)
    text_column = 'text_unit__textunittext__text'

    # Index instead of pop(0) so the caller's list is left untouched.
    frames = [pd.DataFrame.from_records(train_data_sets[0])]
    # add transferred external data
    frames.extend(pd.DataFrame.from_records(extra) for extra in train_data_sets[1:])
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    df = pd.concat(frames)

    is_choice_field = typed_field.is_choice_field
    df['target_name'] = df.apply(
        lambda row: encode_category(field.code,
                                    row.value if is_choice_field else None,
                                    row.extraction_hint),
        axis=1)
    # Index 0 is reserved for the "no field value" rows added below.
    df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

    no_field_rows = [{text_column: text}
                     for text in cls.get_no_field_text_units(field.document_type,
                                                             field.text_unit_type)]
    if no_field_rows:
        df = pd.concat([df, pd.DataFrame(no_field_rows)])

    # The rows added above have no target columns yet: fill them with the empty category.
    df['target_index'] = df['target_index'].fillna(0).astype('int')
    df['target_name'] = df['target_name'].fillna(
        SkLearnClassifierModel.EMPTY_CAT_NAME).astype('str')
    df['user_input'] = df['modified_by'].fillna(0).astype('bool')

    # Cap every category; rows with user_input=True sort first, so they win the cut.
    group_limit = settings.ML_TRAIN_DATA_SET_GROUP_LEN
    groups = []
    for _group_index, group_df in df.groupby('target_index'):
        if group_df.shape[0] > group_limit:
            group_df = shuffle(
                group_df.sort_values('user_input', ascending=False)[:group_limit])
        groups.append(group_df)
    # Concatenate once instead of re-copying the accumulated frame per group.
    res_df = shuffle(pd.concat(groups) if groups else pd.DataFrame())

    target_names = sorted(res_df['target_name'].unique())

    if field.classifier_init_script:
        try:
            clf = cls.init_classifier(field)
        except Exception as e:
            log.error(
                f'Unable to initialize classifier for field {field.code}. '
                f'Classifier init script: {field.classifier_init_script}',
                exc_info=e)
            # Without a classifier the code below would only fail with an
            # UnboundLocalError on ``clf``; surface the real cause instead.
            raise
    else:
        clf = SGDClassifier(loss='hinge',
                            penalty='l2',
                            alpha=1e-3,
                            max_iter=5,
                            tol=None,
                            n_jobs=-1,
                            class_weight='balanced')
    log.info(f'Classifier initialized: {clf}')

    text_clf = Pipeline([
        ('vect', CountVectorizer(strip_accents='unicode',
                                 analyzer='word',
                                 stop_words='english',
                                 tokenizer=word_position_tokenizer)),
        ('tfidf', TfidfTransformer()),
        ('clf', clf),
    ])

    x = res_df[text_column]
    y = res_df['target_index']

    if split_and_log_out_of_sample_test_report:
        x_train, x_test_os, y_train, y_test_os = train_test_split(
            x, y, test_size=0.2, random_state=42)
    else:
        x_train, x_test_os, y_train, y_test_os = x, None, y, None

    sklearn_model = text_clf.fit(x_train, y_train)

    model = SkLearnClassifierModel(sklearn_model=sklearn_model,
                                   target_names=target_names)

    classifier_model = ClassifierModel()
    classifier_model.set_trained_model_obj(model)
    classifier_model.document_field = field

    # The in-sample report is computed over all rows, including any held-out ones.
    classifier_model.classifier_accuracy_report_in_sample = \
        classification_report(y, text_clf.predict(x), target_names=target_names)

    if y_test_os is not None and x_test_os is not None:
        classifier_model.classifier_accuracy_report_out_of_sample = \
            classification_report(y_test_os,
                                  text_clf.predict(x_test_os),
                                  target_names=target_names)

    return classifier_model
def train_document_field_detector_model(cls,
                                        log: ProcessLogger,
                                        field: DocumentField,
                                        train_data_project_ids: Optional[List],
                                        use_only_confirmed_field_values: bool = False,
                                        train_documents: Iterable[Document] = None) \
        -> Optional[ClassifierModel]:
    """Train a classifier predicting ``field`` from the values of other fields.

    Train rows are the ``field_values`` dicts of documents. The value stored
    under ``field.uid`` is the target; the remaining entries are the features.
    The first 20% of the shuffled rows are held out for the out-of-sample
    report; the in-sample report is computed on the rows used for fitting.

    :param log: logger receiving progress messages.
    :param field: field to train for; its type must be a ChoiceField.
    :param train_data_project_ids: projects to take train documents from.
    :param use_only_confirmed_field_values: when True (or when no project ids
        are given) train data comes from ``cls.get_user_data`` instead of a
        plain query over the projects' documents.
    :param train_documents: explicit train documents; takes precedence over
        the other sources when truthy. Their ``field_values`` are not modified.
    :return: unsaved ClassifierModel with the fitted pipeline, the categories
        and the feature names.
    :raises ValueError: if the field is not a choice field.
    :raises RuntimeError: if no train data was found.
    """
    field_type_adapter = FIELD_TYPES_REGISTRY[field.type]  # type: FieldType

    log.set_progress_steps_number(7)
    log.info('Training model for field #{0} ({1})...'.format(field.pk, field.code))

    # Classifier: values of dependencies -> value of this field
    # Field types supported: only choice fields
    if not isinstance(field_type_adapter, ChoiceField):
        raise ValueError(
            'Field-based ML supports only choice fields but field {0} (#{1}) is of type {2}'
            .format(field.code, field.uid, field_type_adapter.code))

    # Lets find good values of depends-on fields suitable for using as train data.
    if train_documents:
        train_data = list(train_documents.values_list('field_values', flat=True)) \
            if hasattr(train_documents, 'values_list') \
            else [doc.field_values for doc in train_documents]
    elif train_data_project_ids and not use_only_confirmed_field_values:
        train_data = list(
            Document.objects.filter(project_id__in=train_data_project_ids)
            .order_by('id')
            .values_list('field_values', flat=True)[:settings.ML_TRAIN_DATA_SET_GROUP_LEN])
    else:
        train_data = list(cls.get_user_data(field, train_data_project_ids))

    if not train_data:
        raise RuntimeError(
            'Not enough train data for field {0} (#{1}). '
            'Need at least {2} approved or changed documents of type {3}.'.format(
                field.code, field.uid, settings.ML_TRAIN_DATA_SET_GROUP_LEN,
                field.document_type.code))

    depends_on_fields_types = cls.get_depends_on_uid_code_type(field)
    depends_on_fields_types = cls.remove_empty_fields(depends_on_fields_types, train_data)

    pipeline, feature_names_funcs = cls.build_pipeline(
        field, depends_on_fields_types)  # type: Pipeline, List[Callable]

    categories = cls.get_categories(field)
    category_names_to_indexes = {c: i for i, c in enumerate(categories)}

    log.step_progress()
    log.info('Collecting feature rows from train and test documents in dict form...')

    # When tried to use sklearn shuffling something went wrong, leaving manual methods for a while.
    random.shuffle(train_data)

    # TODO: use sklearn methods for splitting train/test data and shuffling
    test_size = 0.2

    # Index used as target for rows whose value is empty or not a known category.
    # NOTE(review): this index has no entry in ``target_names`` passed to
    # classification_report below - confirm the report copes with such rows.
    no_category_idx = len(categories)

    train_feature_data = []
    train_target_data = []
    for doc_field_values in train_data:
        field_value = doc_field_values.get(field.uid)
        # Build a copy without the target value instead of deleting it in place:
        # the dict may be the live ``field_values`` of a passed-in document, and
        # ``del`` would raise KeyError for documents that do not have the key.
        feature_row = {uid: value
                       for uid, value in doc_field_values.items()
                       if uid != field.uid}

        field_value_idx = category_names_to_indexes.get(field_value) if field_value else None
        if field_value_idx is None:
            field_value_idx = no_category_idx

        train_feature_data.append(feature_row)
        train_target_data.append(field_value_idx)

    # The first ``is_index`` rows are held out for the out-of-sample test.
    is_index = math.floor(test_size * len(train_data))

    test_oos_feature_data = train_feature_data[:is_index]
    test_oos_target_data = train_target_data[:is_index]

    train_feature_data = train_feature_data[is_index:]
    train_target_data = train_target_data[is_index:]

    test_is_feature_data = train_feature_data  # [:is_index]
    test_is_target_data = train_target_data  # [:is_index]

    log.step_progress()
    log.info('Training the model...')
    model = pipeline.fit(train_feature_data, train_target_data)

    log.step_progress()
    log.info('Testing the model...')

    cm = ClassifierModel()
    cm.document_field = field

    if test_oos_feature_data:
        predicted_oos = pipeline.predict(test_oos_feature_data)
        cm.classifier_accuracy_report_out_of_sample = classification_report(
            test_oos_target_data, predicted_oos, target_names=categories)
    else:
        # With fewer than 1 / test_size rows the hold-out slice is empty and
        # there is nothing to predict on.
        log.info('Train data set is too small to hold out test rows, '
                 'skipping the out-of-sample report.')

    predicted_is = pipeline.predict(test_is_feature_data)
    cm.classifier_accuracy_report_in_sample = classification_report(
        test_is_target_data, predicted_is, target_names=categories)

    log.step_progress()
    log.info('Saving ClassifierModel instance...')

    feature_names = []
    for f in feature_names_funcs:
        feature_names.extend(f())

    cm.set_trained_model_obj({
        'model': model,
        'categories': categories,
        'feature_names': feature_names
    })

    log.step_progress()
    log.info('Finished.')
    return cm
def train_model(field: DocumentField, train_data_sets: List[List[dict]]) -> ClassifierModel:
    """Train a text-unit classifier for ``field`` and wrap it in a ClassifierModel.

    The record sets in ``train_data_sets`` are merged into one data frame, every
    row gets a category name (``encode_category``) and a 1-based category index,
    and the text units returned by ``get_no_field_text_units`` are added as
    category index 0 / ``SkLearnClassifierModel.EMPTY_CAT_NAME``. Each category
    is capped at ``settings.ML_TRAIN_DATA_SET_GROUP_LEN`` rows. 20% of the rows
    are held out for the out-of-sample report; the in-sample report uses a 20%
    sample of the rows the model was fitted on.

    :param field: field the model is trained for.
    :param train_data_sets: record sets; the first one is the base set, the
        remaining ones are appended to it. The list is not modified.
    :return: unsaved ClassifierModel carrying the trained model and both reports.
    :raises IndexError: if ``train_data_sets`` is empty.
    """
    text_column = 'text_unit__text'

    # Index instead of pop(0) so the caller's list is left untouched.
    frames = [pd.DataFrame.from_records(train_data_sets[0])]
    # add transferred external data
    frames.extend(pd.DataFrame.from_records(extra) for extra in train_data_sets[1:])
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
    df = pd.concat(frames)

    # Evaluated once rather than for every row.
    is_choice_field = field.is_choice_field()
    df['target_name'] = df.apply(
        lambda row: encode_category(field.pk,
                                    row.value if is_choice_field else None,
                                    row.extraction_hint),
        axis=1)
    # Index 0 is reserved for the "no field value" rows added below.
    df['target_index'] = df['target_name'].factorize(sort=True)[0] + 1

    no_field_rows = [{text_column: text}
                     for text in get_no_field_text_units(field.document_type,
                                                         field.text_unit_type)]
    if no_field_rows:
        df = pd.concat([df, pd.DataFrame(no_field_rows)])

    # The rows added above have no target columns yet: fill them with the empty category.
    df['target_index'] = df['target_index'].fillna(0).astype('int')
    df['target_name'] = df['target_name'].fillna(
        SkLearnClassifierModel.EMPTY_CAT_NAME).astype('str')
    df['user_input'] = df['created_by'].fillna(0).astype('bool')

    # Cap every category; rows with user_input=True sort first, so they win the cut.
    group_limit = settings.ML_TRAIN_DATA_SET_GROUP_LEN
    groups = []
    for _group_index, group_df in df.groupby('target_index'):
        if group_df.shape[0] > group_limit:
            group_df = shuffle(
                group_df.sort_values('user_input', ascending=False)[:group_limit])
        groups.append(group_df)
    # Concatenate once instead of re-copying the accumulated frame per group.
    res_df = shuffle(pd.concat(groups) if groups else pd.DataFrame())

    target_names = sorted(res_df['target_name'].unique())

    text_clf = Pipeline([
        ('vect', CountVectorizer(strip_accents='unicode',
                                 analyzer='word',
                                 stop_words='english',
                                 tokenizer=word_position_tokenizer)),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge',
                              penalty='l2',
                              alpha=1e-3,
                              max_iter=5,
                              tol=None,
                              n_jobs=-1,
                              class_weight='balanced')),
    ])

    x = res_df[text_column]
    y = res_df['target_index']

    x_train, x_test_os, y_train, y_test_os = train_test_split(
        x, y, test_size=0.2, random_state=42)
    # Only the "test" half of this second split is used: a sample of the
    # training rows for the in-sample report.
    _x_train, x_test_is, _y_train, y_test_is = train_test_split(
        x_train, y_train, test_size=0.2, random_state=42)

    sklearn_model = text_clf.fit(x_train, y_train)

    model = SkLearnClassifierModel(sklearn_model=sklearn_model, target_names=target_names)

    classifier_model = ClassifierModel()
    classifier_model.set_trained_model_obj(model)
    classifier_model.document_field = field

    predicted_os = text_clf.predict(x_test_os)
    predicted_is = text_clf.predict(x_test_is)

    classifier_model.classifier_accuracy_report_out_of_sample = classification_report(
        y_test_os, predicted_os, target_names=target_names)
    classifier_model.classifier_accuracy_report_in_sample = classification_report(
        y_test_is, predicted_is, target_names=target_names)

    return classifier_model