class RegexTaggerSerializer(FieldParseSerializer, serializers.ModelSerializer, ProjectResourceUrlSerializer):
    description = serializers.CharField()
    author = UserSerializer(read_only=True)
    lexicon = serializers.ListField(child=serializers.CharField(required=True), validators=[validate_patterns], help_text="Words/phrases/regex patterns to match.")
    counter_lexicon = serializers.ListField(child=serializers.CharField(required=False), default=[], validators=[validate_patterns], help_text="Words/phrases/regex patterns to nullify lexicon matches. Default = [].")
    operator = serializers.ChoiceField(default=choices.DEFAULT_OPERATOR, choices=choices.OPERATOR_CHOICES, required=False, help_text=f"Logical operation between lexicon entries. Choices = {choices.OPERATOR_CHOICES}. Default = {choices.DEFAULT_OPERATOR}.")
    match_type = serializers.ChoiceField(default=choices.DEFAULT_MATCH_TYPE, choices=choices.MATCH_TYPE_CHOICES, required=False, help_text=f"How to match lexicon entries to text. Choices = {choices.SUPPORTED_MATCH_TYPES}. Default = {choices.DEFAULT_MATCH_TYPE}.")
    required_words = serializers.FloatField(default=choices.DEFAULT_REQUIRED_WORDS, required=False, help_text=f"Required ratio of lexicon entries matched in text for returning a positive result. NB! Only takes effect if operator=='and'. Default = {choices.DEFAULT_REQUIRED_WORDS}.")
    phrase_slop = serializers.IntegerField(default=choices.DEFAULT_PHRASE_SLOP, required=False, help_text=f"Number of non-lexicon words allowed between the words of one lexicon entry. Default = {choices.DEFAULT_PHRASE_SLOP}.")
    counter_slop = serializers.IntegerField(default=choices.DEFAULT_COUNTER_SLOP, required=False, help_text=f"Number of words allowed between lexicon entries and counter lexicon entries for the latter to take effect. Default = {choices.DEFAULT_COUNTER_SLOP}.")
    n_allowed_edits = serializers.IntegerField(default=choices.DEFAULT_N_ALLOWED_EDITS, required=False, help_text=f"Number of allowed character changes between lexicon entries and candidate matches in text. Default = {choices.DEFAULT_N_ALLOWED_EDITS}.")
    return_fuzzy_match = serializers.BooleanField(default=choices.DEFAULT_RETURN_FUZZY_MATCH, required=False, help_text=f"Return the fuzzy match (as opposed to the exact lexicon entry)? Default = {choices.DEFAULT_RETURN_FUZZY_MATCH}.")
    ignore_case = serializers.BooleanField(default=choices.DEFAULT_IGNORE_CASE, required=False, help_text=f"Ignore case while matching? Default = {choices.DEFAULT_IGNORE_CASE}.")
    ignore_punctuation = serializers.BooleanField(default=choices.DEFAULT_IGNORE_PUNCTUATION, required=False, help_text=f"If set to False, end-of-sentence characters between lexicon entry words and/or counter lexicon entries nullify the effect. Default = {choices.DEFAULT_IGNORE_PUNCTUATION}.")
    url = serializers.SerializerMethodField()
    tagger_groups = serializers.SerializerMethodField(read_only=True)
    task = TaskSerializer(read_only=True)


    def get_tagger_groups(self, value: RegexTagger):
        tgs = RegexTaggerGroup.objects.filter(regex_taggers__project_id=value.project.pk, regex_taggers__id=value.pk)
        descriptions = [{"tagger_group_id": tagger.pk, "description": tagger.description} for tagger in tgs]
        return descriptions


    class Meta:
        model = RegexTagger
        fields = ('id', 'url', 'author', 'description', 'lexicon', 'counter_lexicon', 'operator',
                  'match_type', 'required_words', 'phrase_slop', 'counter_slop', 'n_allowed_edits',
                  'return_fuzzy_match', 'ignore_case', 'ignore_punctuation', 'tagger_groups', 'task')
        fields_to_parse = ('lexicon', 'counter_lexicon')
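# Illustrative only: a minimal payload the RegexTaggerSerializer above should accept.
# Only the required fields are shown and the concrete values are made up; every
# omitted field falls back to its declared default.
EXAMPLE_REGEX_TAGGER_PAYLOAD = {
    "description": "Example tagger",
    "lexicon": ["first pattern", "second pattern"],
}
# serializer = RegexTaggerSerializer(data=EXAMPLE_REGEX_TAGGER_PAYLOAD)
# serializer.is_valid(raise_exception=True)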
class RegexTaggerGroupSerializer(serializers.ModelSerializer, ProjectResourceUrlSerializer):
    description = serializers.CharField()
    url = serializers.SerializerMethodField()
    task = TaskSerializer(read_only=True)
    author = UserSerializer(read_only=True)
    # Helper field for displaying tagger info in a friendly manner.
    tagger_info = serializers.SerializerMethodField(read_only=True)


    def get_fields(self, *args, **kwargs):
        # Ensure that only Regex Taggers inside the same Project are returned.
        fields = super(RegexTaggerGroupSerializer, self).get_fields(*args, **kwargs)
        project_pk = self.context["view"].kwargs["project_pk"]
        fields['regex_taggers'].queryset = RegexTagger.objects.filter(project__pk=project_pk)
        return fields


    def get_tagger_info(self, value: RegexTaggerGroup):
        # Filter by the group's project, not by the group's own pk.
        queryset = value.regex_taggers.filter(project__pk=value.project.pk)
        serializer = RegexTaggerSerializer(queryset, many=True, context={"request": self.context["request"]})
        return serializer.data


    class Meta:
        model = RegexTaggerGroup
        # regex_taggers is the field used to manipulate the related RegexTagger model objects.
        fields = ('id', 'url', 'regex_taggers', 'author', 'task', 'description', 'tagger_info')
class RakunExtractorSerializer(FieldParseSerializer, serializers.ModelSerializer, ProjectResourceUrlSerializer, IndicesSerializerMixin):
    author_username = serializers.CharField(source="author.profile.get_display_name", read_only=True)
    description = serializers.CharField(required=True, help_text=serializer_constants.DESCRIPTION_HELPTEXT)
    distance_method = serializers.ChoiceField(choices=choices.DEFAULT_DISTANCE_METHOD_CHOICES, default=choices.DEFAULT_DISTANCE_METHOD, help_text="Method for merging similar tokens.")
    distance_threshold = serializers.FloatField(required=False, min_value=0.0, default=2.0, help_text="Distance between tokens that initiates the merge process (if more similar than this, the tokens are merged).")
    num_keywords = serializers.IntegerField(required=False, default=25, help_text="The number of keywords to be detected.")
    pair_diff_length = serializers.IntegerField(required=False, default=2, help_text="If the difference in the length of two tokens is smaller than this parameter, the tokens are considered for merging.")
    stopwords = serializers.ListField(child=serializers.CharField(required=False), required=False, default=[], help_text="Words to ignore as possible keywords.")
    bigram_count_threshold = serializers.IntegerField(required=False, default=2, help_text="Minimum edge weight for constituting a bigram.")
    min_tokens = serializers.IntegerField(required=False, min_value=1, max_value=3, default=1, help_text="The minimum number of tokens that can constitute a keyword.")
    max_tokens = serializers.IntegerField(required=False, min_value=1, max_value=3, default=1, help_text="The maximum number of tokens that can constitute a keyword.")
    max_similar = serializers.IntegerField(required=False, default=3, help_text="How many similar keywords are permitted. For example, 'british vote' and 'british parliament' would be considered similar (overlap of at least one token).")
    max_occurrence = serializers.IntegerField(required=False, default=3, help_text="How many of the most common keywords are considered during the max_similar pruning step.")
    fasttext_embedding = ProjectFasttextFilteredPrimaryKeyRelatedField(queryset=Embedding.objects, many=False, read_only=False, allow_null=True, default=None, help_text='FastText embedding to use.')
    url = serializers.SerializerMethodField()
    task = TaskSerializer(read_only=True)


    class Meta:
        model = RakunExtractor
        fields = ('id', 'url', 'author_username', 'description', 'distance_method', 'distance_threshold',
                  'num_keywords', 'pair_diff_length', 'stopwords', 'bigram_count_threshold', 'min_tokens',
                  'max_tokens', 'max_similar', 'max_occurrence', 'fasttext_embedding', 'task')
        fields_to_parse = ()


    def validate(self, data):
        if data.get("distance_method") == "fasttext":
            distance_threshold = data.get("distance_threshold")
            if distance_threshold is not None and distance_threshold > 1:
                raise ValidationError("Value 'distance_threshold' should not be greater than one if 'distance_method' is fasttext!")
        return data


    def to_representation(self, instance: RakunExtractor):
        data = super(RakunExtractorSerializer, self).to_representation(instance)
        data.pop("stopwords")
        return data
class SearchFieldsTaggerSerializer(serializers.ModelSerializer, FieldsValidationSerializerMixin, IndicesSerializerMixin):
    author = UserSerializer(read_only=True)
    description = serializers.CharField(help_text=DESCRIPTION_HELPTEXT)
    task = TaskSerializer(read_only=True, required=False)
    url = serializers.SerializerMethodField()
    query = serializers.JSONField(help_text=QUERY_HELPTEXT, default=EMPTY_QUERY)
    fields = serializers.ListField(child=serializers.CharField(), required=True, help_text=FIELDS_HELPTEXT)
    fact_name = serializers.CharField(help_text="Name that the newly created facts should have.")
    use_breakup = serializers.BooleanField(default=True, help_text="Whether to split the text into multiple facts by the breakup character.")
    breakup_character = serializers.CharField(default="\n", help_text="Which text/symbol to use to split the text into separate facts.", trim_whitespace=False)
    bulk_size = serializers.IntegerField(min_value=1, default=100, help_text=BULK_SIZE_HELPTEXT)
    es_timeout = serializers.IntegerField(min_value=1, default=10, help_text=ES_TIMEOUT_HELPTEXT)


    class Meta:
        model = SearchFieldsTagger
        fields = ("id", "url", "author", "indices", "description", "use_breakup", "breakup_character",
                  "task", "query", "fields", "fact_name", "bulk_size", "es_timeout")


    def get_url(self, obj):
        default_version = REST_FRAMEWORK.get("DEFAULT_VERSION")
        index = reverse(f"{default_version}:search_fields_tagger-detail", kwargs={"project_pk": obj.project.pk, "pk": obj.pk})
        if "request" in self.context:
            request = self.context["request"]
            url = request.build_absolute_uri(index)
            return url
        else:
            return None


    def to_representation(self, instance: SearchFieldsTagger):
        data = super(SearchFieldsTaggerSerializer, self).to_representation(instance)
        data["fields"] = json.loads(instance.fields)
        data["query"] = json.loads(instance.query)
        return data
class SummarizerIndexSerializer(FieldParseSerializer, serializers.ModelSerializer, IndicesSerializerMixin):
    author = UserSerializer(read_only=True)
    description = serializers.CharField()
    task = TaskSerializer(read_only=True, required=False)
    url = serializers.SerializerMethodField()
    query = serializers.JSONField(help_text='Query in JSON format', required=False)
    algorithm = serializers.MultipleChoiceField(choices=list(DefaultSummarizerValues.SUPPORTED_ALGORITHMS), default=["lexrank"])
    fields = serializers.ListField(child=serializers.CharField(), required=True)
    ratio = serializers.DecimalField(max_digits=3, decimal_places=1, default=0.2, min_value=Decimal('0.1'), max_value=99.9, help_text="Min value 0.1, max value 99.9; anything above 1.0 is interpreted as a sentence count.")


    class Meta:
        model = Summarizer
        fields = ("id", "url", "author", "indices", "description", "task", "query", "fields", "algorithm", "ratio")
        fields_to_parse = ['fields']


    def get_url(self, obj):
        default_version = REST_FRAMEWORK.get("DEFAULT_VERSION")
        index = reverse(f"{default_version}:summarizer_index-detail", kwargs={"project_pk": obj.project.pk, "pk": obj.pk})
        if "request" in self.context:
            request = self.context["request"]
            url = request.build_absolute_uri(index)
            return url
        else:
            return None


    def to_representation(self, instance: Summarizer):
        data = super(SummarizerIndexSerializer, self).to_representation(instance)
        data["fields"] = json.loads(instance.fields)
        data["query"] = instance.query
        data["algorithm"] = instance.algorithm
        return data
class MLPWorkerSerializer(serializers.ModelSerializer, IndicesSerializerMixin, FieldsValidationSerializerMixin):
    author = UserSerializer(read_only=True)
    description = serializers.CharField()
    task = TaskSerializer(read_only=True, required=False)
    url = serializers.SerializerMethodField()
    query = serializers.JSONField(help_text='Query in JSON format', required=False, default=json.dumps(EMPTY_QUERY))
    fields = serializers.ListField(child=serializers.CharField(), required=True, allow_empty=False, help_text="Which fields to apply the MLP on.")
    analyzers = serializers.MultipleChoiceField(choices=list(SUPPORTED_ANALYZERS), default=["all"])
    es_scroll_size = serializers.IntegerField(help_text="Scroll size for Elasticsearch (Default: 100)", default=100, required=False)
    es_timeout = serializers.IntegerField(help_text="Scroll timeout in minutes for Elasticsearch (Default: 60)", default=60, required=False)


    class Meta:
        model = MLPWorker
        fields = ("id", "url", "author", "indices", "description", "task", "query", "fields", "analyzers", "es_scroll_size", "es_timeout")


    def get_url(self, obj):
        default_version = REST_FRAMEWORK.get("DEFAULT_VERSION")
        index = reverse(f"{default_version}:mlp_index-detail", kwargs={"project_pk": obj.project.pk, "pk": obj.pk})
        if "request" in self.context:
            request = self.context["request"]
            url = request.build_absolute_uri(index)
            return url
        else:
            return None


    def to_representation(self, instance: MLPWorker):
        data = super(MLPWorkerSerializer, self).to_representation(instance)
        data["fields"] = json.loads(instance.fields)
        data["query"] = json.loads(instance.query)
        data["analyzers"] = json.loads(instance.analyzers)
        return data
class EmbeddingSerializer(FieldParseSerializer, serializers.HyperlinkedModelSerializer, ProjectResourceUrlSerializer, IndicesSerializerMixin):
    author = UserSerializer(read_only=True)
    task = TaskSerializer(read_only=True)
    fields = serializers.ListField(child=serializers.CharField(), help_text='Fields used to build the model.')
    snowball_language = serializers.ChoiceField(choices=get_snowball_choices(), default=DEFAULT_SNOWBALL_LANGUAGE, help_text=f'Uses Snowball stemmer with specified language to normalize the texts. Default: {DEFAULT_SNOWBALL_LANGUAGE}')
    max_documents = serializers.IntegerField(default=choices.DEFAULT_MAX_DOCUMENTS)
    num_dimensions = serializers.IntegerField(default=choices.DEFAULT_NUM_DIMENSIONS, help_text=f'Default: {choices.DEFAULT_NUM_DIMENSIONS}')
    min_freq = serializers.IntegerField(default=choices.DEFAULT_MIN_FREQ, help_text=f'Default: {choices.DEFAULT_MIN_FREQ}')
    window_size = serializers.IntegerField(min_value=1, default=5, help_text="Maximum distance between the current and predicted word within a sentence.")
    num_epochs = serializers.IntegerField(min_value=1, default=5, help_text="Number of iterations (epochs) over the corpus.")
    use_phraser = serializers.BooleanField(default=True, help_text='Phrase input texts.')
    query = serializers.JSONField(help_text='Query in JSON format', required=False)
    url = serializers.SerializerMethodField()
    embedding_type = serializers.ChoiceField(choices=choices.EMBEDDING_CHOICES, default=choices.EMBEDDING_CHOICES[0][0])


    class Meta:
        model = Embedding
        fields = ('id', 'url', 'author', 'description', 'indices', 'fields', 'use_phraser', 'embedding_type',
                  'snowball_language', 'query', 'num_dimensions', 'max_documents', 'min_freq', 'window_size',
                  'num_epochs', 'vocab_size', 'task')
        read_only_fields = ('vocab_size',)
        fields_to_parse = ('fields',)
class DatasetImportSerializer(FieldParseSerializer, serializers.HyperlinkedModelSerializer, ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    file = serializers.FileField(help_text='File to upload.', write_only=True)
    separator = serializers.CharField(help_text='Separator (CSV only).', required=False)
    index = serializers.CharField(
        help_text='Name of the Elasticsearch index to upload the documents into. Must be all lowercase and only consist of alphabetical and numerical values.',
        validators=[check_for_upper_case, check_for_banned_beginning_chars, check_for_special_symbols, check_for_colons, check_for_wildcards]
    )
    task = TaskSerializer(read_only=True)
    url = serializers.SerializerMethodField()


    class Meta:
        model = DatasetImport
        fields = ('id', 'url', 'author', 'description', 'index', 'separator', 'num_documents', 'num_documents_success', 'file', 'task')
        fields_to_parse = ()
        read_only_fields = ('id', 'author', 'num_documents', 'num_documents_success', 'task')
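# FieldParseSerializer itself is not shown in this listing. The classes above appear to
# rely on it to decode the model's JSON-encoded text columns named in Meta.fields_to_parse
# when serializing. The mixin below is a minimal sketch of that assumed behaviour, written
# for illustration only; it is not the actual implementation.
import json


class JsonFieldParseMixin:
    """Decode JSON-encoded model attributes named in Meta.fields_to_parse."""

    def to_representation(self, instance):
        data = super().to_representation(instance)
        for field_name in getattr(self.Meta, "fields_to_parse", ()):
            raw_value = getattr(instance, field_name, None)
            if isinstance(raw_value, str):
                data[field_name] = json.loads(raw_value)
        return data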
class BertTaggerSerializer(FieldParseSerializer, serializers.ModelSerializer, IndicesSerializerMixin, ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    fields = serializers.ListField(child=serializers.CharField(), help_text='Fields used to build the model.')
    query = serializers.JSONField(required=False, help_text='Query in JSON format', default=json.dumps(EMPTY_QUERY))
    fact_name = serializers.CharField(default=None, required=False, help_text='Fact name used to filter tags (fact values). Default = None')
    pos_label = serializers.CharField(default="", required=False, allow_blank=True, help_text='Fact value used as positive label while evaluating the results. This is only needed if the selected fact has exactly two possible values. Default = ""')
    use_gpu = serializers.BooleanField(default=True, help_text="Whether to force the usage of a GPU or not.")
    checkpoint_model = ProjectFilteredPrimaryKeyRelatedField(queryset=BertTagger.objects, many=False, read_only=False, allow_null=True, default=None, help_text='Previously fine-tuned BERT model. Select this if you wish to further fine-tune it with additional data and/or new parameters. Default = None')
    maximum_sample_size = serializers.IntegerField(default=choices.DEFAULT_MAX_SAMPLE_SIZE, required=False, help_text=f'Maximum number of positive examples. Default = {choices.DEFAULT_MAX_SAMPLE_SIZE}')
    minimum_sample_size = serializers.IntegerField(default=choices.DEFAULT_MIN_SAMPLE_SIZE, required=False, help_text=f'Minimum number of negative examples. Default = {choices.DEFAULT_MIN_SAMPLE_SIZE}')
    negative_multiplier = serializers.FloatField(default=choices.DEFAULT_NEGATIVE_MULTIPLIER, required=False, help_text=f'Default = {choices.DEFAULT_NEGATIVE_MULTIPLIER}')
    bert_model = serializers.CharField(default=choices.DEFAULT_BERT_MODEL, required=False, help_text=f'Pretrained BERT model to use. Default = {choices.DEFAULT_BERT_MODEL}')
    num_epochs = serializers.IntegerField(default=choices.DEFAULT_NUM_EPOCHS, required=False, help_text=f'Number of training epochs. Default = {choices.DEFAULT_NUM_EPOCHS}')
    max_length = serializers.IntegerField(default=choices.DEFAULT_MAX_LENGTH, required=False, min_value=1, max_value=512, help_text=f'Maximum sequence length of BERT-tokenized input text used for training. Default = {choices.DEFAULT_MAX_LENGTH}')
    batch_size = serializers.IntegerField(default=choices.DEFAULT_BATCH_SIZE, required=False, help_text=f'Batch size used for training. NB! Autoscaled based on max length if too large. Default = {choices.DEFAULT_BATCH_SIZE}')
    split_ratio = serializers.FloatField(default=choices.DEFAULT_TRAINING_SPLIT, required=False, help_text=f'Proportion of documents used for training; the rest are used for validation. Default = {choices.DEFAULT_TRAINING_SPLIT}')
    learning_rate = serializers.FloatField(default=choices.DEFAULT_LEARNING_RATE, required=False, help_text=f'Learning rate used while training. Default = {choices.DEFAULT_LEARNING_RATE}')
    eps = serializers.FloatField(default=choices.DEFAULT_EPS, help_text=f'Default = {choices.DEFAULT_EPS}')
    balance = serializers.BooleanField(default=choices.DEFAULT_BALANCE, required=False, help_text=f'Balance sample sizes of different classes. Only applicable for multiclass taggers. Default = {choices.DEFAULT_BALANCE}')
    use_sentence_shuffle = serializers.BooleanField(default=choices.DEFAULT_USE_SENTENCE_SHUFFLE, required=False, help_text=f'Shuffle sentences in added examples. NB! Only applicable for multiclass taggers with balance=True. Default = {choices.DEFAULT_USE_SENTENCE_SHUFFLE}')
    balance_to_max_limit = serializers.BooleanField(default=choices.DEFAULT_BALANCE_TO_MAX_LIMIT, required=False, help_text=f'If enabled, the number of samples for each class is set to `maximum_sample_size`. Otherwise, it is set to the max class size. NB! Only applicable for multiclass taggers with balance=True. Default = {choices.DEFAULT_BALANCE_TO_MAX_LIMIT}')
    task = TaskSerializer(read_only=True)
    plot = serializers.SerializerMethodField()
    url = serializers.SerializerMethodField()


    def validate_bert_model(self, bert_model):
        available_models = get_downloaded_bert_models(BERT_PRETRAINED_MODEL_DIRECTORY)
        if bert_model not in available_models:
            if ALLOW_BERT_MODEL_DOWNLOADS:
                raise serializers.ValidationError(f"Model '{bert_model}' is not downloaded. Please download it first via action 'Download pretrained model'. Currently available models: {available_models}.")
            else:
                raise serializers.ValidationError(f"Model '{bert_model}' is not downloaded. Downloading models via API is disabled. Please contact your system administrator to make it available. Currently available models: {available_models}.")
        return bert_model


    def validate(self, data):
        # Use custom validation for pos_label as some other serializer fields are also required.
        data = validate_pos_label(data)
        return data


    class Meta:
        model = BertTagger
        fields = ('url', 'author', 'id', 'description', 'query', 'fields', 'use_gpu', 'f1_score', 'precision',
                  'recall', 'accuracy', 'validation_loss', 'training_loss', 'maximum_sample_size',
                  'minimum_sample_size', 'num_epochs', 'plot', 'task', 'pos_label', 'fact_name', 'indices',
                  'bert_model', 'learning_rate', 'eps', 'max_length', 'batch_size', 'adjusted_batch_size',
                  'split_ratio', 'negative_multiplier', 'checkpoint_model', 'num_examples', 'confusion_matrix',
                  'balance', 'use_sentence_shuffle', 'balance_to_max_limit', 'classes')
        read_only_fields = ('project', 'fields', 'f1_score', 'precision', 'recall', 'accuracy', 'validation_loss',
                            'training_loss', 'plot', 'task', 'num_examples', 'adjusted_batch_size',
                            'confusion_matrix', 'classes')
        fields_to_parse = ['fields', 'classes']
class ReindexerCreateSerializer(FieldParseSerializer, serializers.HyperlinkedModelSerializer, ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    url = serializers.SerializerMethodField()
    # Max value stems from the Elasticsearch max doc count limitation.
    scroll_size = serializers.IntegerField(min_value=0, max_value=10000, required=False)
    description = serializers.CharField(help_text='Describe your re-indexing task.', required=True, allow_blank=False)
    indices = serializers.ListField(child=serializers.CharField(), help_text='Add the indices you wish to reindex into a new index.', required=True)
    query = serializers.JSONField(help_text='Add a query if you wish to filter the new reindexed index.', required=False)
    new_index = serializers.CharField(help_text='Your new re-indexed index name.', allow_blank=False, required=True, validators=[check_for_wildcards, check_for_colons, check_for_special_symbols, check_for_banned_beginning_chars, check_for_upper_case])
    field_type = serializers.ListField(help_text='Used to update the field name and the field type of chosen paths.', required=False)
    add_facts_mapping = serializers.BooleanField(help_text='Add texta_facts mapping. NB! If texta_facts is present in the reindexed fields, the mapping is always created.', required=False, default=False)
    task = TaskSerializer(read_only=True)
    fields = serializers.ListField(child=serializers.CharField(), help_text='An empty list selects all fields of the posted indices; otherwise only the listed fields are added to the new index.', required=False)
    random_size = serializers.IntegerField(help_text='Picks a subset of documents of chosen size at random. Disabled by default.', required=False, min_value=1, max_value=10000)


    class Meta:
        model = Reindexer
        fields = ('id', 'url', 'author', 'description', 'indices', 'scroll_size', 'fields', 'query',
                  'new_index', 'random_size', 'field_type', 'add_facts_mapping', 'task')
        fields_to_parse = ('fields', 'field_type', 'indices')


    def validate_new_index(self, value):
        """ Check that new_index does not already exist. """
        if value in ElasticCore().get_indices():
            raise serializers.ValidationError("new_index already exists, choose a different name for your reindexed index")
        return value


    def validate_indices(self, value):
        """ Check that the indices to be reindexed are in the relevant project's indices. """
        project_obj = Project.objects.get(id=self.context['view'].kwargs['project_pk'])
        for index in value:
            indices = project_obj.get_indices()
            if index not in indices:
                raise serializers.ValidationError(f'Index "{index}" is not contained in your project indices "{indices}"')
        return value


    def validate_fields(self, value: List[str]):
        """ Check that the fields included in the request are in the relevant project's fields. """
        project_obj: Project = Project.objects.get(id=self.context['view'].kwargs['project_pk'])
        indices = self.context["request"].data.get("indices", [])
        indices = project_obj.get_available_or_all_project_indices(indices)
        project_fields = ElasticCore().get_fields(indices=indices)
        field_data = [field["path"] for field in project_fields]
        for field in value:
            if field not in field_data:
                raise serializers.ValidationError(f'The fields you are attempting to re-index are not in current project fields: {project_fields}')
        return value
class EvaluatorSerializer(serializers.ModelSerializer, ProjectResourceUrlSerializer, IndicesSerializerMixin):
    author = UserSerializer(read_only=True)
    query = serializers.JSONField(required=False, help_text="Query in JSON format", default=json.dumps(EMPTY_QUERY))
    true_fact = serializers.CharField(required=True, help_text="Fact name used as true label for multilabel evaluation.")
    predicted_fact = serializers.CharField(required=True, help_text="Fact name used as predicted label for multilabel evaluation.")
    true_fact_value = serializers.CharField(required=False, default="", help_text="Fact value used as true label for binary evaluation.")
    predicted_fact_value = serializers.CharField(required=False, default="", help_text="Fact value used as predicted label for binary evaluation.")
    average_function = serializers.ChoiceField(choices=choices.AVG_CHOICES, default=choices.DEFAULT_AVG_FUNCTION, required=False, help_text="Sklearn average function. NB! Doesn't have any effect on entity evaluation.")
    es_timeout = serializers.IntegerField(default=choices.DEFAULT_ES_TIMEOUT, help_text="Elasticsearch scroll timeout in minutes.")
    scroll_size = serializers.IntegerField(min_value=1, max_value=10000, default=choices.DEFAULT_SCROLL_SIZE, help_text="How many documents should be returned by one Elasticsearch scroll.")
    add_individual_results = serializers.BooleanField(default=choices.DEFAULT_ADD_INDIVIDUAL_RESULTS, required=False, help_text="Only used for multilabel/multiclass evaluation. If enabled, individual label scores are calculated and stored as well.")
    add_misclassified_examples = serializers.BooleanField(default=choices.DEFAULT_ADD_MISCLASSIFIED_EXAMPLES, required=False, help_text="Only used for entity evaluation. If enabled, misclassified and partially overlapping values are stored and can be analyzed later.")
    evaluation_type = serializers.ChoiceField(choices=choices.EVALUATION_TYPE_CHOICES, default="multilabel", required=False, help_text="Specify the type of labelsets to evaluate.")
    token_based = serializers.BooleanField(default=choices.DEFAULT_TOKEN_BASED, required=False, help_text="If enabled, uses token-based entity evaluation, otherwise calculates the scores based on the spans of two value-sets.")
    field = serializers.CharField(default="", required=False, help_text="Field related to true and predicted facts. NB! This has effect only for evaluation_type='entity' and is only required if the selected facts have multiple different doc paths.")
    plot = serializers.SerializerMethodField()
    task = TaskSerializer(read_only=True)
    url = serializers.SerializerMethodField()


    def validate_indices(self, value):
        """ Check that the indices exist in the relevant project. """
        project_obj = Project.objects.get(id=self.context["view"].kwargs["project_pk"])
        for index in value:
            if index.get("name") not in project_obj.get_indices():
                raise serializers.ValidationError(f'Index "{index.get("name")}" is not contained in your project indices "{project_obj.get_indices()}"')
        return value


    def validate(self, data):
        """ Check that all inserted facts and fact values are present in the indices. """
        # For PATCH requests that only update the description.
        if len(data) == 1 and "description" in data:
            return data

        indices = [index.get("name") for index in data.get("indices")]
        query = data.get("query")
        if isinstance(query, str):
            query = json.loads(query)

        true_fact = data.get("true_fact")
        predicted_fact = data.get("predicted_fact")
        true_fact_value = data.get("true_fact_value")
        predicted_fact_value = data.get("predicted_fact_value")
        avg_function = data.get("average_function")
        evaluation_type = data.get("evaluation_type")
        doc_path = data.get("field")

        validate_fact(indices, query, true_fact)
        validate_fact(indices, query, predicted_fact)
        validate_fact_value(indices, query, true_fact, true_fact_value)
        validate_fact_value(indices, query, predicted_fact, predicted_fact_value)

        if evaluation_type == "entity":
            validate_entity_facts(indices, query, true_fact, predicted_fact, doc_path)

        validate_fact_values_in_sync(true_fact_value, predicted_fact_value)
        validate_average_function(avg_function, true_fact_value, predicted_fact_value)
        validate_evaluation_type(indices, query, evaluation_type, true_fact, predicted_fact, true_fact_value, predicted_fact_value)

        return data


    class Meta:
        model = Evaluator
        fields = ("url", "author", "id", "description", "indices", "query", "true_fact", "predicted_fact",
                  "true_fact_value", "predicted_fact_value", "average_function", "f1_score", "precision",
                  "recall", "accuracy", "confusion_matrix", "n_true_classes", "n_predicted_classes",
                  "n_total_classes", "evaluation_type", "scroll_size", "es_timeout", "scores_imprecise",
                  "score_after_scroll", "document_count", "add_individual_results", "plot", "task",
                  "add_misclassified_examples", "token_based", "field")
        read_only_fields = ("project", "f1_score", "precision", "recall", "accuracy", "confusion_matrix",
                            "n_true_classes", "n_predicted_classes", "n_total_classes", "document_count",
                            "evaluation_type", "scores_imprecise", "score_after_scroll", "task")
class CRFExtractorSerializer(FieldParseSerializer, serializers.ModelSerializer, IndicesSerializerMixin, ProjectResourceUrlSerializer):
    description = serializers.CharField(help_text=DESCRIPTION_HELPTEXT)
    author = UserSerializer(read_only=True)
    query = serializers.JSONField(help_text=QUERY_HELPTEXT, default=json.dumps(EMPTY_QUERY), required=False)
    mlp_field = serializers.CharField(help_text='MLP field used to build the model.')
    labels = serializers.JSONField(default=["GPE", "ORG", "PER", "LOC"], help_text="List of labels used to train the extraction model.")
    c_values = serializers.JSONField(default=[0.001, 0.1, 0.5], help_text="List of C-values to test during training. The best one will be used.")
    num_iter = serializers.IntegerField(default=100, help_text="Number of iterations used in training.")
    test_size = serializers.FloatField(default=0.3, help_text="Proportion of documents reserved for testing the model.")
    bias = serializers.BooleanField(default=True, help_text="Capture the proportion of a given label in the training set.")
    window_size = serializers.IntegerField(default=2, help_text="Number of words before and after the observed word that are analyzed.")
    suffix_len = serializers.JSONField(default=json.dumps((2, 2)), help_text="Number of characters (min, max) used for word suffixes as features.")
    feature_fields = serializers.MultipleChoiceField(choices=FEATURE_FIELDS_CHOICES, default=DEFAULT_LAYERS, help_text="Layers (MLP subfields) used as features for the observed word.")
    context_feature_fields = serializers.MultipleChoiceField(choices=FEATURE_FIELDS_CHOICES, default=DEFAULT_LAYERS, help_text="Layers (MLP subfields) used as features for the context of the observed word.")
    feature_extractors = serializers.MultipleChoiceField(choices=FEATURE_EXTRACTOR_CHOICES, default=DEFAULT_EXTRACTORS, help_text="Feature extractors used for the observed word and its context.")
    context_feature_extractors = serializers.MultipleChoiceField(choices=FEATURE_EXTRACTOR_CHOICES, default=DEFAULT_EXTRACTORS, help_text="Feature extractors used for the context of the observed word.")
    embedding = ProjectFilteredPrimaryKeyRelatedField(queryset=Embedding.objects, many=False, read_only=False, allow_null=True, default=None, help_text="Embedding used for finding similar words for the observed word and its context.")
    task = TaskSerializer(read_only=True)
    url = serializers.SerializerMethodField()


    class Meta:
        model = CRFExtractor
        fields = ('id', 'url', 'author', 'description', 'query', 'indices', 'mlp_field', 'window_size',
                  'test_size', 'num_iter', 'best_c1', 'best_c2', 'bias', 'suffix_len', 'labels',
                  'feature_fields', 'context_feature_fields', 'feature_extractors', 'context_feature_extractors',
                  'embedding', 'task', 'precision', 'recall', 'f1_score', 'c_values')
        read_only_fields = ('precision', 'task', 'recall', 'f1_score', 'best_c1', 'best_c2')
        fields_to_parse = ('labels', 'suffix_len', 'c_values')
def get(self, request):
    # Serialize the existing Task objects directly; the `data=` keyword with is_valid()
    # is meant for deserializing incoming payloads, not for model instances.
    tasks = Task.objects.all().order_by("pk")
    serializer = TaskSerializer(tasks, many=True)
    return Response(serializer.data)
class ClusteringSerializer(FieldParseSerializer, serializers.ModelSerializer, IndicesSerializerMixin):
    author = UserSerializer(read_only=True)
    description = serializers.CharField()
    query = serializers.CharField(help_text='Query in JSON format', default=EMPTY_QUERY)
    num_cluster = serializers.IntegerField(min_value=1, max_value=1000, default=10, help_text='Number of document clusters to be formed.')
    clustering_algorithm = serializers.ChoiceField(choices=CLUSTERING_ALGORITHMS, default=CLUSTERING_ALGORITHMS[0][0], required=False)
    fields = serializers.ListField(required=True, help_text='Fields that are used for clustering.')
    display_fields = serializers.ListField(default=[], allow_empty=True, help_text='Fields that are used for displaying cluster content. If not specified, the same as "fields".')
    vectorizer = serializers.ChoiceField(choices=VECTORIZERS, default=VECTORIZERS[0][0])
    num_dims = serializers.IntegerField(min_value=1, max_value=10000, default=1000, help_text='Size of the dictionary.')
    use_lsi = serializers.BooleanField(default=False, help_text='If set to 1 (true), transforms the document-term matrix into a lower-dimensional space using LSI. May or may not improve clustering results.')
    num_topics = serializers.IntegerField(min_value=1, max_value=1000, default=50, help_text='Only used if use_lsi is set to 1. The number of dimensions in the lower-dimensional space.')
    stop_words = serializers.ListField(default=[], allow_empty=True, help_text='List of custom stop words to be removed from documents before clustering.')
    document_limit = serializers.IntegerField(default=100, min_value=1, max_value=10000, help_text='Number of documents retrieved from indices.')
    ignored_ids = serializers.ListField(default=[], help_text="List of Elasticsearch document ids to ignore from the clustering process.")
    significant_words_filter = serializers.CharField(help_text='Regex to filter out desired words.', default="[0-9]+")
    url = serializers.SerializerMethodField()
    task = TaskSerializer(read_only=True)


    def get_url(self, obj):
        default_version = REST_FRAMEWORK.get("DEFAULT_VERSION")
        if default_version == "v1":
            index = reverse(f"{default_version}:clustering-detail", kwargs={"project_pk": obj.project.pk, "pk": obj.pk})
        elif default_version == "v2":
            index = reverse(f"{default_version}:topic_analyzer-detail", kwargs={"project_pk": obj.project.pk, "pk": obj.pk})
        else:
            return None
        if "request" in self.context:
            request = self.context["request"]
            url = request.build_absolute_uri(index)
            return url
        else:
            return None


    def validate_significant_words_filter(self, regex):
        try:
            re.compile(regex)
        except re.error:
            raise serializers.ValidationError("Given string is not a valid regex.")
        return regex


    class Meta:
        model = ClusteringResult
        fields = ["id", "url", "description", "author", "query", "indices", "num_cluster", "clustering_algorithm",
                  "vectorizer", "num_dims", "use_lsi", "num_topics", "significant_words_filter", "display_fields",
                  "stop_words", "ignored_ids", "fields", "embedding", "document_limit", "task"]
        fields_to_parse = ("fields", "query", "display_fields", "ignored_ids", "stop_words")
class IndexSplitterSerializer(FieldParseSerializer, serializers.HyperlinkedModelSerializer, IndicesSerializerMixin, ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    url = serializers.SerializerMethodField()
    scroll_size = serializers.IntegerField(min_value=0, max_value=10000, required=False)
    description = serializers.CharField(help_text='Description of the task.', required=True, allow_blank=False)
    query = serializers.JSONField(help_text='Query used to filter the indices. Defaults to an empty query.', required=False)
    train_index = serializers.CharField(help_text='Name of the train index.', allow_blank=False, required=True, validators=[check_for_wildcards, check_for_colons, check_for_special_symbols, check_for_banned_beginning_chars, check_for_upper_case])
    test_index = serializers.CharField(help_text='Name of the test index.', allow_blank=False, required=True, validators=[check_for_wildcards, check_for_colons, check_for_special_symbols, check_for_banned_beginning_chars, check_for_upper_case])
    fields = serializers.ListField(child=serializers.CharField(), help_text='An empty list selects all fields of the posted indices.', required=False)
    task = TaskSerializer(read_only=True)
    test_size = serializers.IntegerField(help_text='Size of the test set. Represents a percentage with "random" or "original" distribution and a quantity with "equal" or "custom" distribution.', required=False, min_value=1, max_value=10000)
    fact = serializers.CharField(required=False, help_text="Name of the fact that the test index distribution will be based on.")
    str_val = serializers.CharField(required=False, help_text="Name of the fact value that the test index distribution will be based on.")
    distribution = serializers.ChoiceField(choices=LABEL_DISTRIBUTION, default=LABEL_DISTRIBUTION[0][0], required=False, help_text='Distribution of the test set. Either "random", "original", "equal" or "custom".')
    custom_distribution = serializers.JSONField(default={}, help_text="A dictionary containing a custom label distribution with labels as keys and quantities as values.")


    class Meta:
        model = IndexSplitter
        fields = ('id', 'url', 'author', 'description', 'indices', 'scroll_size', 'fields', 'query',
                  'train_index', 'test_index', 'test_size', 'fact', 'str_val', 'distribution',
                  'custom_distribution', 'task')
        fields_to_parse = ('fields', 'custom_distribution')


    def validate_train_index(self, value):
        """ Check that the train index does not already exist. """
        open_indices, closed_indices = ElasticCore().get_indices()
        if value in open_indices or value in closed_indices:
            raise serializers.ValidationError(f"{value} already exists, choose a different name for your train index")
        return value


    def validate_test_index(self, value):
        """ Check that the test index does not already exist. """
        open_indices, closed_indices = ElasticCore().get_indices()
        if value in open_indices or value in closed_indices:
            raise serializers.ValidationError(f"{value} already exists, choose a different name for your test index")
        return value


    def validate_indices(self, value):
        """ Check that the index is in the relevant project's indices field. """
        project_obj = Project.objects.get(id=self.context['view'].kwargs['project_pk'])
        for index in value:
            if index.get("name") not in project_obj.get_indices():
                raise serializers.ValidationError(f'Index "{index.get("name")}" is not contained in your project indices "{project_obj.get_indices()}"')
        return value


    def validate_fields(self, value):
        """ Check that the fields included in the request are in the relevant project's fields. """
        project_fields = self._get_project_fields()
        field_data = [field["path"] for field in project_fields]
        for field in value:
            if field not in field_data:
                raise serializers.ValidationError(f'The fields you are attempting to add to new indices are not in current project fields: {project_fields}')
        return value


    def validate_query(self, value):
        if "query" not in json.loads(value):
            raise serializers.ValidationError("Incorrect elastic query. Must contain field 'query'.")
        return value


    def validate(self, data):
        fact = data.get("fact")
        if data["distribution"] == "custom" and len(data["custom_distribution"]) == 0:
            raise serializers.ValidationError("Field custom_distribution cannot be empty with custom label distribution.")
        if fact == "" and data["distribution"] in ["custom", "equal", "original"]:
            raise serializers.ValidationError('Fact must be specified with "custom", "equal" or "original" distribution.')
        if data["distribution"] in ["custom", "equal", "original"]:
            if "fields" in data and "texta_facts" not in data["fields"]:
                project_fields = self._get_project_fields()
                field_data = [field["path"] for field in project_fields]
                if "texta_facts" not in field_data:
                    raise serializers.ValidationError(f'Field texta_facts is required but it is not in project fields: {project_fields}')
                else:
                    data["fields"].append("texta_facts")
        return data


    def _get_project_fields(self):
        project_obj = Project.objects.get(id=self.context['view'].kwargs['project_pk'])
        project_fields = ElasticCore().get_fields(indices=project_obj.get_indices())
        return project_fields
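# Illustrative only: with distribution="custom", custom_distribution maps labels to the
# number of documents wanted in the test index. The label names below are made up.
EXAMPLE_CUSTOM_DISTRIBUTION = {
    "politics": 500,
    "sports": 500,
}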
class ApplyLangOnIndicesSerializer(serializers.ModelSerializer, IndicesSerializerMixin, FieldsValidationSerializerMixin):
    description = serializers.CharField()
    author = UserSerializer(read_only=True)
    task = TaskSerializer(read_only=True, required=False)
    url = serializers.SerializerMethodField()
    query = serializers.JSONField(help_text='Query in JSON format', required=False, default=json.dumps(EMPTY_QUERY))
    field = serializers.CharField(required=True, allow_blank=False)


    def validate_field(self, value: str):
        """
        Check that the selected field is present in the project and raise an error otherwise.
        Note: if no "fields" field is declared in the serializer, no validation happens;
        to write custom validation for serializers with FieldParseSerializer, simply
        override validate_fields in the project serializer.
        """
        project_id = self.context['view'].kwargs['project_pk']
        project_obj = Project.objects.get(id=project_id)
        project_fields = set(project_obj.get_elastic_fields(path_list=True))
        if not value or not set([value]).issubset(project_fields):
            raise serializers.ValidationError(f'Entered fields not in current project fields: {project_fields}')
        return value


    def validate_query(self, query: Union[str, dict]):
        """ Check that the query is formatted correctly and store it as a JSON string if it was passed as a dict. """
        if not isinstance(query, dict):
            try:
                query = json.loads(query)
            except (ValueError, TypeError):
                raise serializers.ValidationError(f"Incorrect query: '{query}'. Query should be formatted as a JSON dict or a JSON string.")
            # If the loaded query is not a JSON dict, raise a ValidationError.
            if not isinstance(query, dict):
                raise serializers.ValidationError(f"Incorrect query: '{query}'. Query should contain a JSON dict.")
        # Ensure that the query is stored as a JSON string.
        query = json.dumps(query)
        return query


    class Meta:
        model = ApplyLangWorker
        fields = ("id", "url", "author", "indices", "description", "task", "query", "field")


    def get_url(self, obj):
        default_version = "v2"
        index = reverse(f"{default_version}:lang_index-detail", kwargs={"project_pk": obj.project.pk, "pk": obj.pk})
        if "request" in self.context:
            request = self.context["request"]
            url = request.build_absolute_uri(index)
            return url
        else:
            return None


    def to_representation(self, instance: ApplyLangWorker):
        data = super(ApplyLangOnIndicesSerializer, self).to_representation(instance)
        data["query"] = json.loads(instance.query)
        return data
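# The query normalization done in validate_query above is a recurring pattern in this
# listing: queries arrive either as a JSON string or as an already-parsed dict, but are
# stored as a string. A minimal standalone sketch of that pattern; the helper name is
# made up for illustration.
import json
from typing import Union


def normalize_query(query: Union[str, dict]) -> str:
    """Return the query as a JSON string, accepting either a dict or a JSON string."""
    if not isinstance(query, dict):
        query = json.loads(query)  # raises ValueError for malformed JSON
    if not isinstance(query, dict):
        raise ValueError(f"Incorrect query: '{query}'. Query should contain a JSON dict.")
    return json.dumps(query)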
class TorchTaggerSerializer(FieldParseSerializer, serializers.ModelSerializer, IndicesSerializerMixin, ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    fields = serializers.ListField(child=serializers.CharField(), help_text='Fields used to build the model.')
    query = serializers.JSONField(help_text='Query in JSON format', required=False, default=json.dumps(EMPTY_QUERY))
    fact_name = serializers.CharField(default=None, required=False, help_text='Fact name used to filter tags (fact values). Default: None')
    pos_label = serializers.CharField(default="", required=False, allow_blank=True, help_text='Fact value used as positive label while evaluating the results. This is only needed if the selected fact has exactly two possible values. Default = ""')
    model_architecture = serializers.ChoiceField(choices=choices.MODEL_CHOICES)
    maximum_sample_size = serializers.IntegerField(default=choices.DEFAULT_MAX_SAMPLE_SIZE, required=False)
    minimum_sample_size = serializers.IntegerField(default=choices.DEFAULT_MIN_SAMPLE_SIZE, required=False)
    num_epochs = serializers.IntegerField(default=choices.DEFAULT_NUM_EPOCHS, required=False)
    embedding = ProjectFilteredPrimaryKeyRelatedField(queryset=Embedding.objects, many=False, read_only=False, required=True, help_text='Embedding to use; mandatory.')
    balance = serializers.BooleanField(default=choices.DEFAULT_BALANCE, required=False, help_text=f'Balance sample sizes of different classes. Only applicable for multiclass taggers. Default = {choices.DEFAULT_BALANCE}')
    use_sentence_shuffle = serializers.BooleanField(default=choices.DEFAULT_USE_SENTENCE_SHUFFLE, required=False, help_text=f'Shuffle sentences in added examples. NB! Only applicable for multiclass taggers with balance=True. Default = {choices.DEFAULT_USE_SENTENCE_SHUFFLE}')
    balance_to_max_limit = serializers.BooleanField(default=choices.DEFAULT_BALANCE_TO_MAX_LIMIT, required=False, help_text=f'If enabled, the number of samples for each class is set to `maximum_sample_size`. Otherwise, it is set to the max class size. NB! Only applicable for multiclass taggers with balance=True. Default = {choices.DEFAULT_BALANCE_TO_MAX_LIMIT}')
    task = TaskSerializer(read_only=True)
    plot = serializers.SerializerMethodField()
    url = serializers.SerializerMethodField()


    def validate(self, data):
        # Use custom validation for pos_label as some other serializer fields are also required.
        data = validate_pos_label(data)
        return data


    class Meta:
        model = TorchTagger
        fields = ('url', 'author', 'id', 'description', 'query', 'fields', 'embedding', 'f1_score', 'precision',
                  'recall', 'accuracy', 'model_architecture', 'maximum_sample_size', 'minimum_sample_size',
                  'num_epochs', 'plot', 'task', 'fact_name', 'indices', 'confusion_matrix', 'num_examples',
                  'balance', 'use_sentence_shuffle', 'balance_to_max_limit', 'pos_label', 'classes')
        read_only_fields = ('project', 'fields', 'f1_score', 'precision', 'recall', 'accuracy', 'plot', 'task',
                            'confusion_matrix', 'num_examples', 'classes')
        fields_to_parse = ['fields', 'classes']
class TaggerGroupSerializer(serializers.ModelSerializer, ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    description = serializers.CharField(help_text='Description for the Tagger Group.')
    minimum_sample_size = serializers.IntegerField(default=choices.DEFAULT_MIN_SAMPLE_SIZE, help_text=f'Minimum number of documents required to train a model. Default: {choices.DEFAULT_MIN_SAMPLE_SIZE}')
    fact_name = serializers.CharField(default=choices.DEFAULT_TAGGER_GROUP_FACT_NAME, help_text=f'Fact name used to filter tags (fact values). Default: {choices.DEFAULT_TAGGER_GROUP_FACT_NAME}')
    tagger = TaggerSerializer(write_only=True, remove_fields=['description', 'query', 'fact_name', 'minimum_sample_size'])
    num_tags = serializers.IntegerField(read_only=True)
    blacklisted_facts = serializers.ListField(child=serializers.CharField(), default=[], help_text="Which fact values to ignore when creating the taggers.")
    tagger_status = serializers.SerializerMethodField()
    tagger_statistics = serializers.SerializerMethodField()
    tagger_params = serializers.SerializerMethodField()
    url = serializers.SerializerMethodField()
    task = TaskSerializer(read_only=True)


    def to_representation(self, instance):
        data = super(TaggerGroupSerializer, self).to_representation(instance)
        try:
            data["blacklisted_facts"] = json.loads(instance.blacklisted_facts)
        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)
        return data


    class Meta:
        model = TaggerGroup
        fields = ('id', 'url', 'author', 'description', 'fact_name', 'num_tags', 'blacklisted_facts',
                  'minimum_sample_size', 'tagger_status', 'tagger_params', 'tagger', 'tagger_statistics', 'task')


    def get_tagger_status(self, obj):
        tagger_objects = obj.taggers
        tagger_status = {
            'total': obj.num_tags,
            'completed': len(tagger_objects.filter(task__status='completed')),
            'training': len(tagger_objects.filter(task__status='running')),
            'created': len(tagger_objects.filter(task__status='created')),
            'failed': len(tagger_objects.filter(task__status='failed'))
        }
        return tagger_status


    def get_tagger_statistics(self, obj):
        tagger_objects = obj.taggers
        if tagger_objects.exists():
            try:
                tagger_size_sum = round(tagger_objects.filter(model_size__isnull=False).aggregate(Sum('model_size'))['model_size__sum'], 1)
            except TypeError:
                # Models are not ready yet.
                tagger_size_sum = 0
            tagger_stats = {
                'avg_precision': tagger_objects.aggregate(Avg('precision'))['precision__avg'],
                'avg_recall': tagger_objects.aggregate(Avg('recall'))['recall__avg'],
                'avg_f1_score': tagger_objects.aggregate(Avg('f1_score'))['f1_score__avg'],
                'sum_size': {"size": tagger_size_sum, "unit": "mb"}
            }
            return tagger_stats


    def _embedding_details(self, instance: Tagger):
        if instance.embedding:
            return {"id": instance.embedding.pk, "description": instance.embedding.description}
        else:
            return None


    def get_tagger_params(self, obj):
        if obj.taggers.exists():
            first_tagger: Tagger = obj.taggers.first()
            params = {
                'fields': json.loads(first_tagger.fields),
                'detect_lang': first_tagger.detect_lang,
                'scoring_function': first_tagger.scoring_function,
                'maximum_sample_size': first_tagger.maximum_sample_size,
                'negative_multiplier': first_tagger.negative_multiplier,
                'snowball_language': first_tagger.snowball_language,
                'embedding': self._embedding_details(first_tagger),
                'indices': first_tagger.get_indices(),
                'vectorizer': first_tagger.vectorizer,
                'classifier': first_tagger.classifier,
                'analyzer': first_tagger.analyzer,
                'stop_words': load_stop_words(first_tagger.stop_words),
                'ignore_numbers': first_tagger.ignore_numbers,
                'balance': first_tagger.balance,
                'balance_to_max_limit': first_tagger.balance_to_max_limit
            }
            return params
class TaggerSerializer(FieldParseSerializer, serializers.ModelSerializer, IndicesSerializerMixin, ProjectResourceUrlSerializer):
    author = UserSerializer(read_only=True)
    description = serializers.CharField(help_text='Description for the Tagger. Will be used as the tag.')
    fields = serializers.ListField(child=serializers.CharField(), help_text='Fields used to build the model.')
    vectorizer = serializers.ChoiceField(choices=choices.get_vectorizer_choices(), default=choices.DEFAULT_VECTORIZER, help_text='Vectorizer algorithm to create document vectors. NB! HashingVectorizer does not support feature name extraction!')
    analyzer = serializers.ChoiceField(choices=choices.get_analyzer_choices(), default=choices.DEFAULT_ANALYZER, help_text="Analyze text as words or characters.")
    classifier = serializers.ChoiceField(choices=choices.get_classifier_choices(), default=choices.DEFAULT_CLASSIFIER, help_text='Classification algorithm used in the model.')
    embedding = ProjectFilteredPrimaryKeyRelatedField(queryset=Embedding.objects, many=False, read_only=False, allow_null=True, default=None, help_text='Embedding to use.')
    negative_multiplier = serializers.FloatField(default=choices.DEFAULT_NEGATIVE_MULTIPLIER, help_text=f'Multiplies the size of positive samples to determine the negative example set size. Default: {choices.DEFAULT_NEGATIVE_MULTIPLIER}')
    maximum_sample_size = serializers.IntegerField(default=choices.DEFAULT_MAX_SAMPLE_SIZE, help_text=f'Maximum number of documents used to build a model. Default: {choices.DEFAULT_MAX_SAMPLE_SIZE}')
    minimum_sample_size = serializers.IntegerField(default=choices.DEFAULT_MIN_SAMPLE_SIZE, help_text=f'Minimum number of documents required to train a model. Default: {choices.DEFAULT_MIN_SAMPLE_SIZE}')
    score_threshold = serializers.FloatField(default=choices.DEFAULT_SCORE_THRESHOLD, help_text=f'Elasticsearch score threshold for filtering out irrelevant examples. All examples below first document\'s score * score threshold are ignored. Float between 0 and 1. Default: {choices.DEFAULT_SCORE_THRESHOLD}')
    snowball_language = serializers.ChoiceField(choices=get_snowball_choices(), default=DEFAULT_SNOWBALL_LANGUAGE, help_text=f'Uses Snowball stemmer with specified language to normalize the texts. Default: {DEFAULT_SNOWBALL_LANGUAGE}')
    scoring_function = serializers.ChoiceField(choices=choices.DEFAULT_SCORING_OPTIONS, default=choices.DEFAULT_SCORING_FUNCTION, required=False, help_text=f'Scoring function used while evaluating the results on the dev set. Default: {choices.DEFAULT_SCORING_FUNCTION}')
    stop_words = serializers.ListField(child=serializers.CharField(), default=[], required=False, help_text='Stop words to add. Default = [].', write_only=True)
    ignore_numbers = serializers.BooleanField(default=choices.DEFAULT_IGNORE_NUMBERS, required=False, help_text='If enabled, ignore all numbers as possible features.')
    detect_lang = serializers.BooleanField(default=False, help_text="Whether to detect the language for the stemmer from the document itself.")
    task = TaskSerializer(read_only=True)
    plot = serializers.SerializerMethodField()
    query = serializers.JSONField(help_text='Query in JSON format', required=False, default=json.dumps(EMPTY_QUERY))
    fact_name = serializers.CharField(default=None, required=False, help_text='Fact name used to filter tags (fact values). Default: None')
    pos_label = serializers.CharField(default="", required=False, allow_blank=True, help_text='Fact value used as positive label while evaluating the results. This is only needed if the selected fact has exactly two possible values. Default = ""')
    url = serializers.SerializerMethodField()
    tagger_groups = serializers.SerializerMethodField(read_only=True)
    balance = serializers.BooleanField(default=choices.DEFAULT_BALANCE, required=False, help_text=f'Balance sample sizes of different classes. Only applicable for multiclass taggers. Default = {choices.DEFAULT_BALANCE}')
    balance_to_max_limit = serializers.BooleanField(default=choices.DEFAULT_BALANCE_TO_MAX_LIMIT, required=False, help_text=f'If enabled, the number of samples for each class is set to `maximum_sample_size`. Otherwise, it is set to the max class size. NB! Only applicable for multiclass taggers with balance=True. Default = {choices.DEFAULT_BALANCE_TO_MAX_LIMIT}')


    class Meta:
        model = Tagger
        fields = ('id', 'url', 'author', 'description', 'query', 'fact_name', 'indices', 'fields', 'detect_lang',
                  'embedding', 'vectorizer', 'analyzer', 'classifier', 'stop_words', 'maximum_sample_size',
                  'minimum_sample_size', 'score_threshold', 'negative_multiplier', 'precision', 'recall',
                  'f1_score', 'snowball_language', 'scoring_function', 'num_features', 'num_examples',
                  'confusion_matrix', 'plot', 'task', 'tagger_groups', 'ignore_numbers', 'balance',
                  'balance_to_max_limit', 'pos_label', 'classes')
        read_only_fields = ('precision', 'recall', 'f1_score', 'num_features', 'num_examples', 'tagger_groups',
                            'confusion_matrix', 'classes')
        fields_to_parse = ('fields', 'classes')


    def validate(self, data):
        if data.get("detect_lang", None) is True and data.get("snowball_language", None):
            raise ValidationError("Values 'detect_lang' and 'snowball_language' are mutually exclusive, please opt for one!")
        # Use custom validation for pos_label as some other serializer fields are also required.
        data = validate_pos_label(data)
        return data


    def __init__(self, *args, **kwargs):
        """
        Add the ability to pass extra arguments such as "remove_fields".
        Useful for reusing the Serializer, e.g. inside another Serializer, without making a new one.
        """
        remove_fields = kwargs.pop('remove_fields', None)
        super(TaggerSerializer, self).__init__(*args, **kwargs)
        if remove_fields:
            # Supports multiple field names passed in a list.
            for field_name in remove_fields:
                self.fields.pop(field_name)


    def get_tagger_groups(self, value: Tagger):
        return json.loads(value.tagger_groups)
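# Example of the remove_fields hook above, as used by TaggerGroupSerializer earlier in
# this listing: instantiate a nested TaggerSerializer while dropping fields that the
# parent serializer declares itself.
# tagger = TaggerSerializer(write_only=True, remove_fields=['description', 'query', 'fact_name', 'minimum_sample_size'])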