class SimilarityByFeaturesForm(forms.Form):
    """Parameters for a similarity search based on extracted features."""

    header = 'Identify similar Documents or Text Units by extracted features.'

    # Both search-target checkboxes carry the 'max-one-of' input class.
    search_similar_documents = checkbox_field(
        "Identify similar Documents.",
        input_class='max-one-of',
        initial=True)
    search_similar_text_units = checkbox_field(
        "Identify similar Text Units.",
        input_class='max-one-of')

    # Threshold is a percentage limited to the 50..100 range.
    similarity_threshold = forms.IntegerField(
        min_value=50,
        max_value=100,
        initial=75,
        required=True,
        help_text=_("Min. Similarity Value 50-100%"))

    use_tfidf = checkbox_field("Use TF-IDF to normalize data")
    delete = checkbox_field("Delete existing Similarity objects.", initial=True)

    # Optional project restriction; newest projects (highest pk) come first.
    project = forms.ModelChoiceField(
        queryset=Project.objects.order_by('-pk'),
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
        required=False,
        label='Restrict to project')

    # Each choice uses the source field name as both value and label.
    feature_source = forms.MultipleChoiceField(
        widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
        choices=[(source, source) for source in DocumentFeatures.source_fields],
        initial='term',
        required=True,
        help_text='Cluster by terms, parties or other fields.')

    unit_type = forms.ChoiceField(
        choices=[('sentence', 'sentence'), ('paragraph', 'paragraph')],
        initial='sentence',
        required=True)

    distance_type = forms.ChoiceField(
        choices=[(metric, metric) for metric in _METRICS],
        initial='cosine',
        required=True)
class ChunkSimilarityForm(forms.Form):
    """Parameters for a chunk-based similarity search over Documents or Text Units."""

    header = 'Identify similar Documents and/or Text Units.'

    # Exactly one target is picked through a radio group.
    search_target = LTRRadioField(
        choices=(
            ('document', 'Identify similar Documents'),
            ('textunit', 'Identify similar Text Units'),
        ),
        initial='document',
        required=True)

    # Threshold is a percentage limited to the 50..100 range.
    similarity_threshold = forms.IntegerField(
        min_value=50,
        max_value=100,
        initial=75,
        required=True,
        help_text=_("Min. Similarity Value 50-100%"))

    use_idf = checkbox_field("Use TF-IDF to normalize data", initial=True)
    ignore_case = checkbox_field("Ignore case", initial=True)
    delete = checkbox_field("Delete existing Similarity objects.", initial=True)

    project = forms.ModelChoiceField(
        queryset=Project.objects.all(),
        required=False,
        label='Restrict to project')

    # How the text is tokenized before comparison.
    term_type = LTRRadioField(
        choices=(
            ('CHAR_NGRAMS', 'Compare text by char ngrams'),
            ('WORDS', 'Compare text by words'),
            ('WORD_3GRAMS', 'Compare texts by word 3-grams'),
        ),
        initial='WORDS',
        required=True)

    ngram_len = forms.IntegerField(
        min_value=3,
        max_value=20,
        initial=6,
        required=True,
        help_text='ngram length when using char ngrams')
class PartySimilarityForm(forms.Form):
    """Parameters for a similarity search between Parties."""

    header = 'Identify similar Parties.'

    case_sensitive = checkbox_field('Case Sensitive', initial=True)

    # Each choice uses the same string for value and label.
    similarity_type = forms.ChoiceField(
        choices=[(ratio, ratio) for ratio in ('token_set_ratio', 'token_sort_ratio')],
        required=True,
        initial='token_set_ratio')

    # Threshold is a percentage limited to the 0..100 range.
    similarity_threshold = forms.IntegerField(
        min_value=0,
        max_value=100,
        initial=90,
        required=True,
        help_text=_("Min. Similarity Value 0-100%."))

    delete = checkbox_field("Delete existing PartySimilarity objects.", initial=True)
class LocateEmployeesForm(forms.Form):
    """Parameters for locating Employees in existing documents."""

    header = 'Locate Employees in existing documents.'

    no_detect = checkbox_field(
        "All documents in DB are employment agreements (disable detection)")
    delete = checkbox_field(label=_("Delete existing Employees"), initial=True)

    def __init__(self, *args, **kwargs):
        """Add the ``document_type`` field and put it first in the form.

        The choices are the distinct ``document_type`` values currently stored
        on Document rows, queried each time the form is instantiated.
        """
        super().__init__(*args, **kwargs)

        # order_by() clears any default ordering before distinct() is applied.
        doc_types = Document.objects \
            .order_by().values_list('document_type', flat=True).distinct()
        self.fields['document_type'] = forms.MultipleChoiceField(
            choices=[(doc_type, doc_type) for doc_type in doc_types],
            widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
            required=False)

        # Rebuild the field mapping in the desired display order.
        ordered_fields = OrderedDict()
        for name in ['document_type', 'no_detect', 'delete']:
            ordered_fields[name] = self.fields[name]
        self.fields = ordered_fields
class BaseRunClassifierForm(forms.Form):
    """Common fields for forms that run an existing classifier.

    NOTE(review): ``field_order`` names a 'classifier' field that is not
    declared here - presumably it is supplied by subclasses; confirm.
    """

    # Confidence is a percentage limited to the 0..100 range.
    min_confidence = forms.IntegerField(
        min_value=0,
        max_value=100,
        initial=90,
        required=True,
        help_text='Store values with confidence greater than (%).')

    delete_suggestions = checkbox_field(
        "Delete ClassifierSuggestions of Classifier specified above.")

    # Optional multi-select of projects; newest (highest pk) first.
    project = forms.ModelMultipleChoiceField(
        queryset=Project.objects.all().order_by('-pk'),
        widget=forms.SelectMultiple(attrs={'class': 'chosen compact'}),
        required=False,
        label='Restrict to project')

    project_name_filter = forms.CharField(
        max_length=100,
        required=False,
        help_text='Project name filter as an alternative to "Project" option. '
                  'Use "%" as a wildcard symbol.')

    field_order = [
        'classifier',
        'project',
        'project_name_filter',
        'min_confidence',
        'delete_suggestions',
    ]
class SimilarityForm(forms.Form):
    """Parameters for a similarity search over Documents and/or Text Units."""

    header = 'Identify similar Documents and/or Text Units.'

    # Both search-target checkboxes carry the 'min-one-of' input class.
    search_similar_documents = checkbox_field(
        "Identify similar Documents.",
        input_class='min-one-of',
        initial=True)
    search_similar_text_units = checkbox_field(
        "Identify similar Text Units.",
        input_class='min-one-of')

    # Threshold is a percentage limited to the 50..100 range.
    similarity_threshold = forms.IntegerField(
        min_value=50,
        max_value=100,
        initial=75,
        required=True,
        help_text=_("Min. Similarity Value 50-100%"))

    use_idf = checkbox_field("Use TF-IDF to normalize data")
    delete = checkbox_field("Delete existing Similarity objects.", initial=True)
def child_field(delete_tip=None, label='Delete existing usages', child_class='checkbox-child'):
    """Build a nested ("child") checkbox field.

    :param delete_tip: when truthy, the label becomes
        "Delete existing <delete_tip> Usages" and ``label`` is ignored.
    :param label: label used when ``delete_tip`` is not given.
    :param child_class: value passed to ``checkbox_field`` as ``input_class``.
    :return: whatever ``checkbox_field`` returns for these arguments.
    """
    text = "Delete existing %s Usages" % delete_tip if delete_tip else label
    return checkbox_field(
        text,
        input_class=child_class,
        label_class='checkbox-small level-1')
class LoadDocumentsForm(forms.Form):
    """Parameters for parsing uploaded files into Documents and Text Units."""

    header = 'Parse documents to create Documents and Text Units.'

    # Optional target project; newest projects (highest pk) come first.
    project = forms.ModelChoiceField(
        queryset=Project.objects.order_by('-pk'),
        required=False)

    # The help text embeds the configured documents directory name.
    source_data = forms.CharField(
        max_length=1000,
        required=True,
        help_text=''' Path to a folder with uploaded files relative to "/media/%s". For example, "new" or "/".<br /> Create new folders and upload new documents if needed. ''' % settings.FILEBROWSER_DOCUMENTS_DIRECTORY)

    source_type = forms.CharField(max_length=100, required=False)

    document_type = forms.ModelChoiceField(
        queryset=DocumentType.objects.all(),
        required=False)

    detect_contract = checkbox_field("Detect if a document is contract", initial=True)
    delete = checkbox_field("Delete existing Documents")
    run_standard_locators = checkbox_field("Run Standard Locators", initial=False)
class CleanProjectForm(forms.Form):
    """Parameters for cleaning a project (its content and optionally itself)."""

    header = 'Clean Project (delete project content or project itself as well.'

    _project = forms.ModelChoiceField(queryset=Project.objects.all(), required=True)
    delete = checkbox_field("Delete Project itself as well.", initial=True)

    def clean(self):
        """Replace the ``_project`` model instance with its primary key.

        On success ``cleaned_data`` holds ``_project_id`` instead of
        ``_project``.

        When ``_project`` failed field validation it is absent from
        ``cleaned_data``; in that case nothing is added, so the field error is
        reported normally instead of this method raising ``KeyError``.

        :return: the cleaned data dictionary.
        """
        cleaned_data = super().clean()
        project = cleaned_data.pop('_project', None)
        if project is not None:
            cleaned_data['_project_id'] = project.pk
        return cleaned_data
class ProcessLeaseDocumentsForm(forms.Form):
    """Parameters for detecting and processing lease documents."""

    header = 'Detect and Process Lease Documents'

    no_detect = checkbox_field(
        "All documents in DB are lease agreements (disable detection)")
    delete = checkbox_field("Delete existing lease documents data")

    def __init__(self, *args, **kwargs):
        """Add the ``document_type`` field and put it first in the form.

        The choices are the distinct ``document_type`` values currently stored
        on Document rows, queried each time the form is instantiated.
        """
        super().__init__(*args, **kwargs)

        # order_by() clears any default ordering before distinct() is applied.
        doc_types = Document.objects \
            .order_by().values_list('document_type', flat=True).distinct()
        self.fields['document_type'] = forms.MultipleChoiceField(
            choices=[(doc_type, doc_type) for doc_type in doc_types],
            widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
            required=False)

        # Rebuild the field mapping in the desired display order.
        ordered_fields = OrderedDict()
        for name in ['document_type', 'no_detect', 'delete']:
            ordered_fields[name] = self.fields[name]
        self.fields = ordered_fields

    def _post_clean(self):
        """Store ``MODULE_NAME`` in the cleaned data under 'module_name'."""
        super()._post_clean()
        self.cleaned_data['module_name'] = MODULE_NAME
class SimilarityForm(forms.Form):
    """Parameters for a similarity search, optionally limited to one project."""

    header = 'Identify similar Documents and/or Text Units.'

    # Both search-target checkboxes carry the 'min-one-of' input class.
    search_similar_documents = checkbox_field(
        "Identify similar Documents.",
        input_class='min-one-of',
        initial=True)
    search_similar_text_units = checkbox_field(
        "Identify similar Text Units.",
        input_class='min-one-of')

    # Threshold is a percentage limited to the 50..100 range.
    similarity_threshold = forms.IntegerField(
        min_value=50,
        max_value=100,
        initial=75,
        required=True,
        help_text=_("Min. Similarity Value 50-100%"))

    use_idf = checkbox_field("Use TF-IDF to normalize data")
    delete = checkbox_field("Delete existing Similarity objects.", initial=True)

    # Optional project restriction; newest projects (highest pk) come first.
    project = forms.ModelChoiceField(
        queryset=Project.objects.order_by('-pk'),
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
        required=False,
        label='Restrict to project')
class LoadDocumentsForm(forms.Form):
    """Parameters for parsing uploaded files into Documents and Text Units."""

    header = 'Parse documents to create Documents and Text Units.'

    # The help text embeds the configured file browser directory name.
    source_path = forms.CharField(
        max_length=1000,
        required=True,
        help_text=''' Relative path to a folder with uploaded files.\n You can choose any folder or file in "/media/%s" folder.\n For example, "new" or "/".\n Create new folders and upload new documents if needed.''' % settings.FILEBROWSER_DIRECTORY)

    source_type = forms.CharField(max_length=100, required=True)

    document_type = forms.ModelChoiceField(
        queryset=DocumentType.objects.all(),
        required=False)

    delete = checkbox_field("Delete existing Documents")
class BaseRunClassifierForm(forms.Form):
    """Common fields for forms that run an existing classifier on one project.

    NOTE(review): ``field_order`` names a 'classifier' field that is not
    declared here - presumably it is supplied by subclasses; confirm.
    """

    # Confidence is a percentage limited to the 0..100 range.
    min_confidence = forms.IntegerField(
        min_value=0,
        max_value=100,
        initial=90,
        required=True,
        help_text='Store values with confidence greater than (%).')

    delete_suggestions = checkbox_field(
        "Delete ClassifierSuggestions of Classifier specified above.")

    # Mandatory single project; newest projects (highest pk) come first.
    project = forms.ModelChoiceField(
        queryset=Project.objects.order_by('-pk'),
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
        required=True,
        label='Restrict to project')

    field_order = [
        'classifier',
        'project',
        'min_confidence',
        'delete_suggestions',
    ]
class ExistedClassifierClassifyForm(forms.Form):
    """Parameters for classifying Text Units with an existing classifier."""

    header = 'Classify Text Units using an existing Classifier.'

    # Choices are filled in __init__: a query in the class body would run once
    # at import time and never see classifiers created or deactivated later.
    classifier = forms.ChoiceField(
        choices=[],
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
        required=True)

    sample_size = forms.IntegerField(
        min_value=1,
        required=False,
        help_text='Number of Documents to process. Leave blank to process all Documents.')

    # Confidence is a percentage limited to the 0..100 range.
    min_confidence = forms.IntegerField(
        min_value=0,
        max_value=100,
        initial=90,
        required=True,
        help_text='Store values with confidence greater than (%).')

    delete_suggestions = checkbox_field(
        "Delete ClassifierSuggestions of Classifier specified above.")

    def __init__(self, *args, **kwargs):
        """Load the active classifiers as ``classifier`` choices.

        Each choice is a ``(pk, name)`` pair of a TextUnitClassifier whose
        ``is_active`` flag is set, queried on every form instantiation.
        """
        super().__init__(*args, **kwargs)
        self.fields['classifier'].choices = [
            (c.pk, c.name)
            for c in TextUnitClassifier.objects.filter(is_active=True)
        ]
class LocateForm(forms.Form):
    """Parameters for locating specific items in existing text units.

    Fields follow a ``<locator>_<suffix>`` naming scheme; ``__init__`` relies
    on the part before the first underscore to drop the fields of locators
    that are not available.
    """

    header = 'Locate specific terms in existing text units.'

    locate_all = checkbox_field(
        label="Locate all items / Reverse choice",
        label_class='main-label')

    geoentity_locate = locate_field("Geo Entities and Geo Aliases", parent_class='')
    geoentity_priority = child_field(
        label="Use first entity occurrence to resolve ambiguous entities",
        child_class='')
    geoentity_delete = child_field(
        label="Delete existing Geo Entity Usages and Geo Alias Usages",
        child_class='')

    date_locate = locate_field(label='Dates', parent_class='')
    date_strict = child_field(label="Strict", child_class='')
    date_delete = child_field("Date", child_class='')

    amount_locate = locate_field('Amounts')
    amount_delete = child_field("Amount")

    citation_locate = locate_field("Citations")
    citation_delete = child_field("Citation")

    copyright_locate = locate_field("Copyrights")
    copyright_delete = child_field("Copyright")

    court_locate = locate_field('Courts')
    court_delete = child_field('Court')

    currency_locate = locate_field('Currencies')
    currency_delete = child_field('Currency')

    duration_locate = locate_field('Date Durations')
    duration_delete = child_field('Date Duration')

    definition_locate = locate_field('Definitions')
    definition_delete = child_field('Definition')

    distance_locate = locate_field('Distances')
    distance_delete = child_field('Distance')

    party_locate = locate_field('Parties')
    party_delete = child_field('Parties and Party Usages')

    percent_locate = locate_field('Percents')
    percent_delete = child_field('Percent')

    ratio_locate = locate_field('Ratios')
    ratio_delete = child_field('Ratio')

    regulation_locate = locate_field('Regulations')
    regulation_delete = child_field('Regulation')

    term_locate = locate_field('Terms')
    term_delete = child_field('Term')

    trademark_locate = locate_field('Trademarks')
    trademark_delete = child_field('Trademark')

    url_locate = locate_field('Urls')
    url_delete = child_field('Url')

    parse = LTRRadioField(
        choices=(
            ('paragraphs', 'Parse Text Units with "paragraph" type'),
            ('sentences', 'Parse Text Units with both "paragraph" and "sentence" types'),
        ),
        help_text='Warning! Parsing both "paragraph" and "sentence" Text Unit types'
                  ' will take much more time',
        initial='paragraphs',
        required=False)

    def __init__(self, *args, **kwargs):
        """Drop the fields of locators that are not currently available.

        A locator is available when its name appears in
        ``settings.REQUIRED_LOCATORS`` or ``config.standard_optional_locators``.
        The 'parse' and 'locate_all' fields are always kept.
        """
        super().__init__(*args, **kwargs)

        # Built once: the available set does not change while filtering.
        available_locators = set(settings.REQUIRED_LOCATORS) | set(
            config.standard_optional_locators)

        # Iterate over a snapshot because fields are deleted during the loop.
        for field in list(self.fields):
            if field in ('parse', 'locate_all'):
                continue
            locator_name = field.split('_')[0]
            if locator_name not in available_locators:
                del self.fields[field]
class ClusterForm(forms.Form):
    """Parameters for clustering Documents and/or Text Units.

    Algorithm-specific options are prefixed with the algorithm key
    (``kmeans_``, ``minibatchkmeans_``, ``birch_``, ``dbscan_``).
    """

    header = 'Clustering Documents and/or Text Units by Terms, Entities or Parties.'

    do_cluster_documents = checkbox_field(
        "Cluster Documents",
        initial=True,
        input_class='max-one-of')
    do_cluster_text_units = checkbox_field(
        "Cluster Text Units",
        input_class='max-one-of')

    # Mandatory project; newest projects (highest pk) come first.
    project = forms.ModelChoiceField(
        queryset=Project.objects.order_by('-pk'),
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
        required=True,
        label='Restrict to project')

    cluster_by = forms.MultipleChoiceField(
        widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
        choices=[(i, i) for i in DocumentFeatures.source_fields],
        initial='term',
        required=True,
        help_text='Cluster by terms, parties or other fields.')

    # The initial value must match one of the choice keys exactly; it used to
    # be misspelled ('minidatchkmeans') and therefore pre-selected nothing.
    using = forms.ChoiceField(
        label='Algorithm',
        choices=[
            ('minibatchkmeans', 'MiniBatchKMeans'),
            ('kmeans', 'KMeans'),
            ('birch', 'Birch'),
            ('dbscan', 'DBSCAN'),
            # ('LabelSpreading', 'LabelSpreading')
        ],
        required=True,
        initial='minibatchkmeans',
        help_text='Clustering algorithm model name.')

    n_clusters = forms.IntegerField(
        label='n_clusters',
        min_value=1,
        initial=3,
        required=True,
        help_text='Number of clusters.')

    name = forms.CharField(max_length=100, required=True)
    description = forms.CharField(max_length=200, required=False)

    options = forms.BooleanField(**options_field_kwargs)

    kmeans_max_iter = forms.IntegerField(
        label='max_iter',
        min_value=1,
        initial=100,
        required=True,
        help_text='Maximum number of iterations for a single run.')
    kmeans_n_init = forms.IntegerField(
        label='n_init',
        min_value=1,
        initial=10,
        required=True,
        help_text='Number of time the k-means algorithm will be run with different centroid seeds. '
                  'The final results will be the best output of n_init consecutive runs in '
                  'terms of inertia.')

    minibatchkmeans_batch_size = forms.IntegerField(
        label='batch_size',
        min_value=1,
        initial=100,
        required=True,
        help_text='Size of the mini batches.')

    birch_threshold = forms.FloatField(
        label='threshold',
        min_value=0,
        initial=0.5,
        required=True,
        help_text='The radius of the subcluster obtained by merging a new sample and the closest '
                  'subcluster should be lesser than the threshold.'
                  ' Otherwise a new subcluster is started.')
    birch_branching_factor = forms.IntegerField(
        label='branching_factor',
        min_value=1,
        initial=50,
        required=True,
        help_text='Maximum number of CF subclusters in each node.')

    dbscan_eps = forms.FloatField(
        label='eps',
        min_value=0,
        initial=0.5,
        required=True,
        help_text='The maximum distance between two samples for them to be considered '
                  'as in the same neighborhood.')
    dbscan_leaf_size = forms.IntegerField(
        label='leaf_size',
        min_value=1,
        initial=30,
        required=True,
        help_text='Leaf size passed to BallTree or cKDTree. '
                  'This can affect the speed of the construction and query, '
                  'as well as the memory required to store the tree.')
    # The help text used to be a copy of the leaf_size description.
    dbscan_p = forms.FloatField(
        label='p',
        min_value=0,
        required=False,
        help_text='The power of the Minkowski metric to be used '
                  'to calculate distance between points.')

    # ls_documents_property = forms.Field()
    # ls_text_units_property = forms.Field()
    # ls_max_iter = forms.IntegerField(
    #     label='max_iter',
    #     min_value=1,
    #     initial=5,
    #     required=True,
    #     help_text='Maximum number of iterations allowed.')

    # delete_type = checkbox_field(
    #     'Delete existed Clusters of the "Cluster By" and "Algorithm" specified above',
    #     input_class='max-one-of')
    # delete = checkbox_field("Delete all existed Clusters", input_class='max-one-of')

    # def __init__(self, *args, **kwargs):
    #     super().__init__(*args, **kwargs)
    #     if TextUnitProperty.objects.exists():
    #         choices = [(p, p) for p in sorted(
    #             set(TextUnitProperty.objects.values_list('key', flat=True)),
    #             key=lambda i: i.lower())]
    #         self.fields['ls_text_units_property'] = forms.ChoiceField(
    #             label='Text Unit Property Name',
    #             widget=forms.widgets.Select(attrs={'class': 'chosen'}),
    #             choices=choices,
    #             required=True,
    #             initial=choices[0][0])
    #     else:
    #         del self.fields['ls_text_units_property']
    #     if DocumentProperty.objects.exists():
    #         choices = [(p, p) for p in sorted(
    #             set(DocumentProperty.objects.values_list('key', flat=True)),
    #             key=lambda i: i.lower())]
    #         self.fields['ls_documents_property'] = forms.ChoiceField(
    #             label='Document Property Name',
    #             widget=forms.widgets.Select(attrs={'class': 'chosen'}),
    #             choices=choices,
    #             required=True,
    #             initial=choices[0][0])
    #     else:
    #         del self.fields['ls_documents_property']
    #     if not DocumentProperty.objects.exists() and not TextUnitProperty.objects.exists():
    #         self.fields['using'].choices = self.fields['using'].choices[:-1]

    def clean(self):
        """Require that Documents or Text Units (or both) are selected.

        When neither checkbox is set, the same error is attached to both
        fields.

        :return: the cleaned data dictionary.
        """
        cleaned_data = super().clean()
        do_cluster_documents = cleaned_data.get("do_cluster_documents")
        do_cluster_text_units = cleaned_data.get("do_cluster_text_units")
        if not (do_cluster_documents or do_cluster_text_units):
            message = 'Please choose either Documents or Text Units'
            self.add_error('do_cluster_documents', message)
            self.add_error('do_cluster_text_units', message)
        return cleaned_data
class BaseTrainClassifierForm(forms.Form):
    """Common fields for forms that train a new classifier.

    Algorithm-specific options are prefixed with an algorithm key
    (``svc_``, ``mnb_``, ``rfc_etc_``, ``lrcv_``).

    NOTE(review): ``__init__`` reads ``self.classification_db_model``, which is
    not defined in this class - presumably subclasses provide it; confirm.
    """

    options = forms.BooleanField(**options_field_kwargs)

    # --- SVC options -------------------------------------------------------
    svc_C = forms.FloatField(
        label='C',
        min_value=0,
        initial=1.0,
        required=True,
        help_text='Penalty parameter C of the error term.')
    svc_kernel = forms.ChoiceField(
        label='kernel',
        choices=[
            ('rbf', 'rbf'),
            ('linear', 'linear'),
            ('poly', 'poly'),
            ('sigmoid', 'sigmoid'),
            ('precomputed', 'precomputed'),
        ],
        required=True,
        initial='rbf',
        help_text='Specifies the kernel type to be used in the algorithm.')
    # Kept as text because the value may be a keyword or a number;
    # clean_svc_gamma converts numeric input to float.
    svc_gamma = forms.CharField(
        label='gamma',
        max_length=6,
        initial='scale',
        required=False,
        help_text="{'scale', 'auto'} or float, optional (default='scale')."
                  " Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.")

    # --- MultinomialNB options --------------------------------------------
    mnb_alpha = forms.FloatField(
        label='alpha',
        min_value=0,
        initial=1.0,
        required=True,
        help_text='Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).')

    # --- RandomForest / ExtraTrees options --------------------------------
    rfc_etc_n_estimators = forms.IntegerField(
        label='n_estimators',
        min_value=1,
        initial=10,
        required=True,
        help_text='The number of trees in the forest.')
    rfc_etc_criterion = forms.ChoiceField(
        label='criterion',
        choices=[('gini', 'gini'), ('entropy', 'entropy')],
        required=True,
        initial='gini',
        help_text='The function to measure the quality of a split.')
    rfc_etc_max_features = forms.IntegerField(
        label='max_features',
        min_value=1,
        required=False,
        help_text='The number of features to consider when looking for the best split.'
                  ' Integer or blank for "auto".')
    rfc_etc_max_depth = forms.IntegerField(
        label='max_depth',
        min_value=1,
        required=False,
        help_text='The maximum depth of the tree.'
                  ' If None, then nodes are expanded until all leaves are pure'
                  ' or until all leaves contain less than min_samples_split samples.')
    rfc_etc_min_samples_split = forms.IntegerField(
        label='min_samples_split',
        min_value=1,
        initial=2,
        required=True,
        help_text='The minimum number of samples required to split an internal node.')
    rfc_etc_min_samples_leaf = forms.IntegerField(
        label='min_samples_leaf',
        min_value=1,
        initial=1,
        required=True,
        help_text='The minimum number of samples required to be at a leaf node.')

    # --- LogisticRegressionCV options -------------------------------------
    lrcv_Cs = forms.IntegerField(
        label='Cs',
        min_value=1,
        initial=10,
        required=True,
        help_text='Each of the values in Cs describes the inverse of regularization strength.')
    lrcv_fit_intercept = forms.BooleanField(
        label='fit_intercept',
        required=False,
        help_text='Specifies if a constant (a.k.a. bias or intercept)'
                  ' should be added to the decision function.')
    lrcv_multi_class = forms.ChoiceField(
        label='multi_class',
        choices=[('ovr', 'ovr'), ('multinomial', 'multinomial')],
        required=True,
        initial='ovr',
        help_text='If the option chosen is ‘ovr’, then a binary problem is fit for each label. '
                  'Else the loss minimised is the multinomial loss fit across the '
                  'entire probability distribution. '
                  'Works only for the ‘newton-cg’, ‘sag’ and ‘lbfgs’ solver.')
    lrcv_solver = forms.ChoiceField(
        label='solver',
        choices=[
            ('lbfgs', 'lbfgs'),
            ('newton-cg', 'newton-cg'),
            ('liblinear', 'liblinear'),
            ('sag', 'sag'),
        ],
        required=True,
        initial='lbfgs',
        help_text='Algorithm to use in the optimization problem.')

    # --- General training options -----------------------------------------
    # Placeholder with no choices; __init__ replaces it with a populated field.
    class_name = forms.ChoiceField(
        choices=[],
        required=True,
        help_text='Classifier class name')
    classifier_name = forms.CharField(
        max_length=100,
        required=True,
        help_text='Classifier name')
    use_tfidf = checkbox_field("Use TF-IDF to normalize data")
    delete_classifier = checkbox_field(
        "Delete existing Classifiers of class name specified above.")
    project = forms.ModelMultipleChoiceField(
        queryset=Project.objects.all().order_by('-pk'),
        widget=forms.SelectMultiple(attrs={'class': 'chosen compact'}),
        required=False,
        label='Restrict to project')
    project_name_filter = forms.CharField(
        max_length=100,
        required=False,
        help_text='Project name filter as an alternative to "Project" option. '
                  'Use "%" as a wildcard symbol.')
    classify_by = forms.MultipleChoiceField(
        widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
        choices=[(source, source) for source in DocumentFeatures.source_fields],
        initial='term',
        required=True,
        help_text='Classify by terms, parties or other fields.')
    algorithm = forms.ChoiceField(
        choices=CLASSIFIER_NAME_CHOICES,
        required=True,
        initial='RandomForestClassifier',
        help_text='Classifier algorithm name.')
    metric_pos_label = forms.CharField(
        max_length=100,
        required=False,
        help_text='Positive label for "f1", "precision", "recall" accuracy metrics.')
    unit_type = forms.ChoiceField(
        choices=[('sentence', 'sentence'), ('paragraph', 'paragraph')],
        required=True,
        initial='sentence',
        help_text='Text Unit type.')

    field_order = [
        'project',
        'project_name_filter',
        'unit_type',
        'algorithm',
        'use_tfidf',
        'class_name',
        'classify_by',
        'classifier_name',
        'metric_pos_label',
        'delete_classifier',
    ]

    def __init__(self, *args, **kwargs):
        """Replace ``class_name`` with a field listing the stored class names.

        The choices are the distinct ``class_name`` values of
        ``self.classification_db_model``, queried on each instantiation.
        """
        super().__init__(*args, **kwargs)
        known_names = set(
            self.classification_db_model.objects.values_list('class_name', flat=True))
        self.fields['class_name'] = forms.ChoiceField(
            choices=[(name, name) for name in known_names],
            required=True,
            help_text='Classification class name')

    def clean_svc_gamma(self):
        """Return gamma as a float when it parses as one, else the raw text."""
        raw_gamma = self.cleaned_data['svc_gamma']
        try:
            return float(raw_gamma)
        except ValueError:
            # Not numeric (e.g. a keyword or an empty string): pass it through.
            return raw_gamma
class LocateForm(forms.Form):
    """Parameters for locating specific items in existing text units.

    Fields follow a ``<locator>_<suffix>`` naming scheme; ``__init__`` relies
    on the part before the first underscore to drop the fields of locators
    that are not available.
    """

    header = 'Locate specific terms in existing text units.'

    locate_all = checkbox_field(
        label="Locate all items / Reverse choice",
        label_class='main-label')

    geoentity_locate = locate_field("Geo Entities and Geo Aliases", parent_class='')
    geoentity_priority = child_field(
        label="Use first entity occurrence to resolve ambiguous entities",
        child_class='')
    geoentity_delete = child_field(
        label="Delete existing Geo Entity Usages and Geo Alias Usages",
        child_class='')

    date_locate = locate_field(label='Dates', parent_class='')
    date_strict = child_field(label="Strict", child_class='')
    date_delete = child_field("Date", child_class='')

    amount_locate = locate_field('Amounts')
    amount_delete = child_field("Amount")

    citation_locate = locate_field("Citations")
    citation_delete = child_field("Citation")

    copyright_locate = locate_field("Copyrights")
    copyright_delete = child_field("Copyright")

    court_locate = locate_field('Courts')
    court_delete = child_field('Court')

    currency_locate = locate_field('Currencies')
    currency_delete = child_field('Currency')

    duration_locate = locate_field('Date Durations')
    duration_delete = child_field('Date Duration')

    definition_locate = locate_field('Definitions')
    definition_delete = child_field('Definition')

    distance_locate = locate_field('Distances')
    distance_delete = child_field('Distance')

    party_locate = locate_field('Parties')
    party_delete = child_field('Parties and Party Usages')

    percent_locate = locate_field('Percents')
    percent_delete = child_field('Percent')

    ratio_locate = locate_field('Ratios')
    ratio_delete = child_field('Ratio')

    regulation_locate = locate_field('Regulations')
    regulation_delete = child_field('Regulation')

    term_locate = locate_field('Terms')
    term_delete = child_field('Term')

    trademark_locate = locate_field('Trademarks')
    trademark_delete = child_field('Trademark')

    url_locate = locate_field('Urls')
    url_delete = child_field('Url')

    # Required by default, so Django itself rejects an empty selection.
    parse = forms.MultipleChoiceField(
        widget=LTRCheckgroupWidget(),
        choices=(('sentence', 'Find in sentences'),
                 ('paragraph', 'Find in paragraphs')),
        label="Text units where to find terms")

    # Previous radio-button variant of the field above, kept for reference:
    # parse = LTRRadioField(
    #     choices=(('sentence', 'Parse Text Units with "sentence" types'),
    #              ('paragraph', 'Parse Text Units with "paragraph" type')),
    #     initial='sentence',
    #     required=False)

    project = forms.ModelChoiceField(
        queryset=Project.objects.order_by('-pk'),
        required=False)

    def __init__(self, *args, **kwargs):
        """Drop the fields of locators that are not currently available.

        A locator is available when its name appears in ``STANDARD_LOCATORS``
        or ``OPTIONAL_LOCATORS``. The 'parse', 'locate_all' and 'project'
        fields are always kept.
        """
        super().__init__(*args, **kwargs)
        # Imported here rather than at module level, as in the original code.
        from apps.extract.app_vars import STANDARD_LOCATORS, OPTIONAL_LOCATORS
        available_locators = set(STANDARD_LOCATORS.val) | set(
            OPTIONAL_LOCATORS.val)

        # Iterate over a snapshot because fields are deleted during the loop.
        for field in list(self.fields):
            if field in ('parse', 'locate_all', 'project'):
                continue
            locator_name = field.split('_')[0]
            if locator_name not in available_locators:
                del self.fields[field]

    def clean(self):
        """Require that at least one "locate" checkbox is ticked.

        This check used to live in an ``is_valid()`` override that added the
        error only after validity had been computed, so the form could report
        itself valid with no locator chosen, and every repeated call appended
        the error again. Doing it here lets the inherited ``is_valid()`` see
        the error and runs the check once per validation.

        The old override also returned False for an empty 'parse' selection;
        that case is already rejected by the required 'parse' field.

        :return: the cleaned data dictionary.
        """
        cleaned_data = super().clean()
        has_locate_chosen = any(
            value is True
            for name, value in cleaned_data.items()
            if 'locate' in name)
        if not has_locate_chosen:
            self.add_error('locate_all', 'Please choose a locator.')
        return cleaned_data
class CreateClassifierClassifyForm(forms.Form):
    """Parameters for classifying Text Units by training a new classifier.

    Algorithm-specific options are prefixed with an algorithm key
    (``svc_``, ``mnb_``, ``rfc_etc_``, ``lrcv_``).
    """

    header = 'Classify Text Units by creating a new Classifier.'

    CLASSIFIER_NAME_CHOICES = (
        ('LogisticRegressionCV', 'LogisticRegressionCV'),
        ('MultinomialNB', 'MultinomialNB'),
        ('ExtraTreesClassifier', 'ExtraTreesClassifier'),
        ('RandomForestClassifier', 'RandomForestClassifier'),
        ('SVC', 'SVC'),
    )

    classify_by = forms.ChoiceField(
        choices=[('terms', 'Terms'),
                 ('parties', 'Parties'),
                 ('entities', 'Geo Entities')],
        required=True,
        help_text='Classify using terms, parties or geo entities.')
    algorithm = forms.ChoiceField(
        choices=CLASSIFIER_NAME_CHOICES,
        required=True,
        initial='LogisticRegressionCV',
        help_text='Text Unit Classifier name')
    # Placeholder only: __init__ always replaces this field with one whose
    # choices come from the database, so no query is issued at import time.
    class_name = forms.ChoiceField(
        choices=[],
        required=True,
        help_text='Text Unit class name')
    sample_size = forms.IntegerField(
        min_value=1,
        required=False,
        help_text='Number of Documents to process. Leave blank to process all Documents.')
    # Confidence is a percentage limited to the 0..100 range.
    min_confidence = forms.IntegerField(
        min_value=0,
        max_value=100,
        initial=90,
        required=True,
        help_text='Store values with confidence greater than (%).')

    options = forms.BooleanField(**options_field_kwargs)

    # --- SVC options -------------------------------------------------------
    svc_c = forms.FloatField(
        label='C',
        min_value=0,
        initial=1.0,
        required=True,
        help_text='Penalty parameter C of the error term.')
    svc_kernel = forms.ChoiceField(
        label='kernel',
        choices=[('rbf', 'rbf'),
                 ('linear', 'linear'),
                 ('poly', 'poly'),
                 ('sigmoid', 'sigmoid'),
                 ('precomputed', 'precomputed')],
        required=True,
        initial='rbf',
        help_text='Specifies the kernel type to be used in the algorithm.')
    svc_gamma = forms.FloatField(
        label='gamma',
        min_value=0,
        required=False,
        help_text='Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. '
                  'If gamma is ‘auto’ then 1/n_features will be used instead.')

    # --- MultinomialNB options --------------------------------------------
    mnb_alpha = forms.FloatField(
        label='alpha',
        min_value=0,
        initial=1.0,
        required=True,
        help_text='Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).')

    # --- RandomForest / ExtraTrees options --------------------------------
    rfc_etc_n_estimators = forms.IntegerField(
        label='n_estimators',
        min_value=1,
        initial=10,
        required=True,
        help_text='The number of trees in the forest.')
    rfc_etc_criterion = forms.ChoiceField(
        label='criterion',
        choices=[('gini', 'gini'),
                 ('entropy', 'entropy')],
        required=True,
        initial='gini',
        help_text='The function to measure the quality of a split.')
    rfc_etc_max_features = forms.IntegerField(
        label='max_features',
        min_value=1,
        required=False,
        help_text='The number of features to consider when looking for the best split.'
                  ' Integer or blank for "auto".')
    rfc_etc_max_depth = forms.IntegerField(
        label='max_depth',
        min_value=1,
        required=False,
        help_text='The maximum depth of the tree.'
                  ' If None, then nodes are expanded until all leaves are pure'
                  ' or until all leaves contain less than min_samples_split samples.')
    rfc_etc_min_samples_split = forms.IntegerField(
        label='min_samples_split',
        min_value=1,
        initial=2,
        required=True,
        help_text='The minimum number of samples required to split an internal node.')
    rfc_etc_min_samples_leaf = forms.IntegerField(
        label='min_samples_leaf',
        min_value=1,
        initial=1,
        required=True,
        help_text='The minimum number of samples required to be at a leaf node.')

    # --- LogisticRegressionCV options -------------------------------------
    lrcv_cs = forms.IntegerField(
        label='cs',
        min_value=1,
        initial=10,
        required=True,
        help_text='Each of the values in Cs describes the inverse of regularization strength.')
    lrcv_fit_intercept = forms.BooleanField(
        label='fit_intercept',
        required=False,
        help_text='Specifies if a constant (a.k.a. bias or intercept)'
                  ' should be added to the decision function.')
    lrcv_multi_class = forms.ChoiceField(
        label='multi_class',
        choices=[('ovr', 'ovr'),
                 ('multinomial', 'multinomial')],
        required=True,
        initial='ovr',
        help_text='If the option chosen is ‘ovr’, then a binary problem is fit for each label. '
                  'Else the loss minimised is the multinomial loss fit across the '
                  'entire probability distribution. '
                  'Works only for the ‘newton-cg’, ‘sag’ and ‘lbfgs’ solver.')
    lrcv_solver = forms.ChoiceField(
        label='solver',
        choices=[('lbfgs', 'lbfgs'),
                 ('newton-cg', 'newton-cg'),
                 ('liblinear', 'liblinear'),
                 ('sag', 'sag')],
        required=True,
        initial='lbfgs',
        help_text='Algorithm to use in the optimization problem.')

    use_tfidf = checkbox_field(
        "Use TF-IDF to normalize data")
    delete_classifier = checkbox_field(
        "Delete existing Classifiers of class name specified above.")
    delete_suggestions = checkbox_field(
        "Delete ClassifierSuggestions of class name specified above.")

    def __init__(self, *args, **kwargs):
        """Replace ``class_name`` with a field listing the stored class names.

        The choices are the distinct ``class_name`` values of
        TextUnitClassification rows, queried on each instantiation.
        """
        super().__init__(*args, **kwargs)
        self.fields['class_name'] = forms.ChoiceField(
            choices=[(class_name, class_name) for class_name in
                     set(TextUnitClassification.objects.values_list('class_name', flat=True))],
            required=True,
            help_text='Text Unit class name')
def locate_field(label, parent_class='checkbox-parent'):
    """Build a top-level ("parent") locator checkbox field.

    :param label: text passed to ``checkbox_field`` as the label.
    :param parent_class: value passed to ``checkbox_field`` as ``input_class``.
    :return: whatever ``checkbox_field`` returns for these arguments.
    """
    field = checkbox_field(label, input_class=parent_class)
    return field
class LocateTermsForm(forms.Form):
    """Parameters for locating Terms in existing Text Units."""

    header = 'Locate Terms in existing Text Units.'

    # Ticked by default.
    delete = checkbox_field(
        "Delete existing Term Usages",
        initial=True)