Ejemplo n.º 1
0
class SimilarityByFeaturesForm(forms.Form):
    """Options for a similarity search based on extracted features.

    Collects what to compare (documents or text units), the minimum
    similarity percentage, which feature sources to use, the text unit type
    and the distance metric, plus an optional project restriction.
    """

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Identify similar Documents or Text Units by extracted features.'

    # Target selection. Both checkboxes carry the 'max-one-of' input class.
    search_similar_documents = checkbox_field(
        "Identify similar Documents.",
        initial=True,
        input_class='max-one-of',
    )
    search_similar_text_units = checkbox_field(
        "Identify similar Text Units.",
        input_class='max-one-of',
    )

    # Minimum similarity in percent, limited to 50..100, 75 by default.
    similarity_threshold = forms.IntegerField(
        required=True,
        initial=75,
        min_value=50,
        max_value=100,
        help_text=_("Min. Similarity Value 50-100%"),
    )
    use_tfidf = checkbox_field("Use TF-IDF to normalize data")
    delete = checkbox_field(
        "Delete existing Similarity objects.",
        initial=True,
    )

    # Optional project restriction; newest projects (highest pk) first.
    project = forms.ModelChoiceField(
        label='Restrict to project',
        required=False,
        queryset=Project.objects.order_by('-pk'),
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
    )

    # One choice per entry of DocumentFeatures.source_fields.
    feature_source = forms.MultipleChoiceField(
        required=True,
        initial='term',
        choices=[(source, source) for source in DocumentFeatures.source_fields],
        widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
        help_text='Cluster by terms, parties or other fields.',
    )
    unit_type = forms.ChoiceField(
        required=True,
        initial='sentence',
        choices=[
            ('sentence', 'sentence'),
            ('paragraph', 'paragraph'),
        ],
    )

    # One choice per entry of the module-level _METRICS collection.
    distance_type = forms.ChoiceField(
        required=True,
        initial='cosine',
        choices=[(metric, metric) for metric in _METRICS],
    )
Ejemplo n.º 2
0
class ChunkSimilarityForm(forms.Form):
    """Options for a similarity search over documents or text units.

    The caller picks a single search target, a similarity threshold and the
    way text is broken into terms (char n-grams, words or word 3-grams).
    """

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Identify similar Documents and/or Text Units.'

    # Exactly one target: documents (default) or text units.
    search_target = LTRRadioField(
        required=True,
        initial='document',
        choices=(
            ('document', 'Identify similar Documents'),
            ('textunit', 'Identify similar Text Units'),
        ),
    )

    # Minimum similarity in percent, limited to 50..100, 75 by default.
    similarity_threshold = forms.IntegerField(
        required=True,
        initial=75,
        min_value=50,
        max_value=100,
        help_text=_("Min. Similarity Value 50-100%"),
    )
    use_idf = checkbox_field("Use TF-IDF to normalize data", initial=True)
    ignore_case = checkbox_field("Ignore case", initial=True)
    delete = checkbox_field("Delete existing Similarity objects.", initial=True)

    # Optional project restriction.
    project = forms.ModelChoiceField(
        label='Restrict to project',
        required=False,
        queryset=Project.objects.all(),
    )

    # How the text is split into comparable terms; words by default.
    term_type = LTRRadioField(
        required=True,
        initial='WORDS',
        choices=(
            ('CHAR_NGRAMS', 'Compare text by char ngrams'),
            ('WORDS', 'Compare text by words'),
            ('WORD_3GRAMS', 'Compare texts by word 3-grams'),
        ),
    )

    # N-gram length, limited to 3..20, 6 by default.
    ngram_len = forms.IntegerField(
        required=True,
        initial=6,
        min_value=3,
        max_value=20,
        help_text='ngram length when using char ngrams',
    )
Ejemplo n.º 3
0
class PartySimilarityForm(forms.Form):
    """Options for detecting similar parties.

    Offers a case-sensitivity switch, the ratio function name to use and a
    minimum similarity percentage.
    """

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Identify similar Parties.'

    case_sensitive = checkbox_field('Case Sensitive', initial=True)

    # Name of the ratio function; 'token_set_ratio' by default.
    similarity_type = forms.ChoiceField(
        required=True,
        initial='token_set_ratio',
        choices=[
            ('token_set_ratio', 'token_set_ratio'),
            ('token_sort_ratio', 'token_sort_ratio'),
        ],
    )

    # Minimum similarity in percent, limited to 0..100, 90 by default.
    similarity_threshold = forms.IntegerField(
        required=True,
        initial=90,
        min_value=0,
        max_value=100,
        help_text=_("Min. Similarity Value 0-100%."),
    )
    delete = checkbox_field(
        "Delete existing PartySimilarity objects.",
        initial=True,
    )
Ejemplo n.º 4
0
class LocateEmployeesForm(forms.Form):
    """Options for locating employees in existing documents.

    The ``document_type`` multi-select is created per instance so that its
    choices reflect the document types stored at instantiation time.
    """

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Locate Employees in existing documents.'

    no_detect = checkbox_field(
        "All documents in DB are employment agreements (disable detection)")
    delete = checkbox_field(label=_("Delete existing Employees"), initial=True)

    def __init__(self, *args, **kwargs):
        """Add the ``document_type`` field and fix the field order."""
        super().__init__(*args, **kwargs)

        # Distinct document types; the empty order_by() clears any ordering
        # before DISTINCT is applied.
        known_types = (Document.objects
                       .order_by()
                       .values_list('document_type', flat=True)
                       .distinct())
        self.fields['document_type'] = forms.MultipleChoiceField(
            required=False,
            choices=[(doc_type, doc_type) for doc_type in known_types],
            widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
        )

        # Rebuild the mapping so that 'document_type' comes first; only the
        # three listed fields are kept.
        reordered = OrderedDict()
        for key in ('document_type', 'no_detect', 'delete'):
            reordered[key] = self.fields[key]
        self.fields = reordered
Ejemplo n.º 5
0
class BaseRunClassifierForm(forms.Form):
    """Shared options for running an existing classifier.

    ``field_order`` names a ``classifier`` field that is not declared here;
    it is presumably supplied by subclasses -- verify against them.
    """

    # Confidence threshold in percent, limited to 0..100, 90 by default.
    min_confidence = forms.IntegerField(
        required=True,
        initial=90,
        min_value=0,
        max_value=100,
        help_text='Store values with confidence greater than (%).',
    )
    delete_suggestions = checkbox_field(
        "Delete ClassifierSuggestions of Classifier specified above.")

    # Optional restriction to one or more projects, newest (highest pk) first.
    project = forms.ModelMultipleChoiceField(
        label='Restrict to project',
        required=False,
        queryset=Project.objects.all().order_by('-pk'),
        widget=forms.SelectMultiple(attrs={'class': 'chosen compact'}),
    )

    # Free-text alternative to the explicit project selection above.
    project_name_filter = forms.CharField(
        required=False,
        max_length=100,
        help_text='Project name filter as an alternative to "Project" option. '
                  'Use "%" as a wildcard symbol.',
    )

    field_order = [
        'classifier',
        'project',
        'project_name_filter',
        'min_confidence',
        'delete_suggestions',
    ]
Ejemplo n.º 6
0
class SimilarityForm(forms.Form):
    """Options for a similarity search over documents and/or text units."""

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Identify similar Documents and/or Text Units.'

    # Target selection. Both checkboxes carry the 'min-one-of' input class.
    search_similar_documents = checkbox_field(
        "Identify similar Documents.",
        initial=True,
        input_class='min-one-of',
    )
    search_similar_text_units = checkbox_field(
        "Identify similar Text Units.",
        input_class='min-one-of',
    )

    # Minimum similarity in percent, limited to 50..100, 75 by default.
    similarity_threshold = forms.IntegerField(
        required=True,
        initial=75,
        min_value=50,
        max_value=100,
        help_text=_("Min. Similarity Value 50-100%"),
    )
    use_idf = checkbox_field("Use TF-IDF to normalize data")
    delete = checkbox_field(
        "Delete existing Similarity objects.",
        initial=True,
    )
Ejemplo n.º 7
0
def child_field(delete_tip=None,
                label='Delete existing usages',
                child_class='checkbox-child'):
    """Build a small, indented checkbox that belongs to a parent option.

    :param delete_tip: when truthy, the caption becomes
        "Delete existing <delete_tip> Usages" and ``label`` is ignored.
    :param label: caption used when ``delete_tip`` is not given.
    :param child_class: value passed on as the checkbox ``input_class``.
    :return: whatever ``checkbox_field`` returns for these options.
    """
    caption = ("Delete existing %s Usages" % delete_tip) if delete_tip else label
    return checkbox_field(
        caption,
        label_class='checkbox-small level-1',
        input_class=child_class,
    )
Ejemplo n.º 8
0
class LoadDocumentsForm(forms.Form):
    """Options for parsing uploaded files into Documents and Text Units."""

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Parse documents to create Documents and Text Units.'

    # Optional target project, newest (highest pk) first.
    project = forms.ModelChoiceField(
        required=False,
        queryset=Project.objects.order_by('-pk'),
    )

    # Folder to load from; the help text interpolates the configured
    # documents directory once, when the class is defined.
    source_data = forms.CharField(
        required=True,
        max_length=1000,
        help_text='''
        Path to a folder with uploaded files relative to "/media/%s". For example, "new" or "/".<br />
        Create new folders and upload new documents if needed.
        ''' % settings.FILEBROWSER_DOCUMENTS_DIRECTORY,
    )
    source_type = forms.CharField(
        required=False,
        max_length=100,
    )
    document_type = forms.ModelChoiceField(
        required=False,
        queryset=DocumentType.objects.all(),
    )
    detect_contract = checkbox_field(
        "Detect if a document is contract",
        initial=True,
    )
    delete = checkbox_field("Delete existing Documents")
    run_standard_locators = checkbox_field(
        "Run Standard Locators",
        initial=False,
    )
class CleanProjectForm(forms.Form):
    """Options for cleaning a project: delete its content and optionally the project.

    After validation the selected project is exposed in ``cleaned_data`` as
    ``_project_id`` (its primary key) instead of the model instance.
    """

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Clean Project (delete project content or project itself as well.'
    _project = forms.ModelChoiceField(queryset=Project.objects.all(), required=True)
    delete = checkbox_field("Delete Project itself as well.", initial=True)

    def clean(self):
        """Replace the ``_project`` instance with ``_project_id`` in the cleaned data.

        ``clean()`` runs even when ``_project`` failed its own validation, in
        which case the key is absent from ``cleaned_data``. The substitution is
        skipped then so the regular field error is reported instead of a
        ``KeyError``.

        :return: the (possibly updated) cleaned data dictionary.
        """
        cleaned_data = super().clean()
        project = cleaned_data.pop('_project', None)
        if project is not None:
            cleaned_data['_project_id'] = project.pk
        return cleaned_data
Ejemplo n.º 10
0
class ProcessLeaseDocumentsForm(forms.Form):
    """Options for detecting and processing lease documents.

    The ``document_type`` multi-select is created per instance so that its
    choices reflect the document types stored at instantiation time.
    """

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Detect and Process Lease Documents'

    no_detect = checkbox_field(
        "All documents in DB are lease agreements (disable detection)")
    delete = checkbox_field("Delete existing lease documents data")

    def __init__(self, *args, **kwargs):
        """Add the ``document_type`` field and fix the field order."""
        super().__init__(*args, **kwargs)

        # Distinct document types; the empty order_by() clears any ordering
        # before DISTINCT is applied.
        known_types = (Document.objects
                       .order_by()
                       .values_list('document_type', flat=True)
                       .distinct())
        self.fields['document_type'] = forms.MultipleChoiceField(
            required=False,
            choices=[(doc_type, doc_type) for doc_type in known_types],
            widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
        )

        # Rebuild the mapping so that 'document_type' comes first; only the
        # three listed fields are kept.
        reordered = OrderedDict()
        for key in ('document_type', 'no_detect', 'delete'):
            reordered[key] = self.fields[key]
        self.fields = reordered

    def _post_clean(self):
        """Add the module-level ``MODULE_NAME`` to the cleaned data."""
        super()._post_clean()
        self.cleaned_data['module_name'] = MODULE_NAME
Ejemplo n.º 11
0
class SimilarityForm(forms.Form):
    """Options for a similarity search over documents and/or text units,
    optionally restricted to a single project.
    """

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Identify similar Documents and/or Text Units.'

    # Target selection. Both checkboxes carry the 'min-one-of' input class.
    search_similar_documents = checkbox_field(
        "Identify similar Documents.",
        initial=True,
        input_class='min-one-of',
    )
    search_similar_text_units = checkbox_field(
        "Identify similar Text Units.",
        input_class='min-one-of',
    )

    # Minimum similarity in percent, limited to 50..100, 75 by default.
    similarity_threshold = forms.IntegerField(
        required=True,
        initial=75,
        min_value=50,
        max_value=100,
        help_text=_("Min. Similarity Value 50-100%"),
    )
    use_idf = checkbox_field("Use TF-IDF to normalize data")
    delete = checkbox_field(
        "Delete existing Similarity objects.",
        initial=True,
    )

    # Optional project restriction; newest projects (highest pk) first.
    project = forms.ModelChoiceField(
        label='Restrict to project',
        required=False,
        queryset=Project.objects.order_by('-pk'),
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
    )
Ejemplo n.º 12
0
class LoadDocumentsForm(forms.Form):
    """Options for parsing uploaded files into Documents and Text Units."""

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Parse documents to create Documents and Text Units.'

    # Folder or file to load from; the help text interpolates the configured
    # file browser directory once, when the class is defined.
    source_path = forms.CharField(
        required=True,
        max_length=1000,
        help_text='''
        Relative path to a folder with uploaded files.\n
        You can choose any folder or file in "/media/%s" folder.\n
        For example, "new" or "/".\n
        Create new folders and upload new documents if needed.''' % settings.FILEBROWSER_DIRECTORY,
    )
    source_type = forms.CharField(required=True, max_length=100)
    document_type = forms.ModelChoiceField(
        required=False,
        queryset=DocumentType.objects.all(),
    )
    delete = checkbox_field("Delete existing Documents")
Ejemplo n.º 13
0
class BaseRunClassifierForm(forms.Form):
    """Shared options for running an existing classifier on one project.

    ``field_order`` names a ``classifier`` field that is not declared here;
    it is presumably supplied by subclasses -- verify against them.
    """

    # Confidence threshold in percent, limited to 0..100, 90 by default.
    min_confidence = forms.IntegerField(
        required=True,
        initial=90,
        min_value=0,
        max_value=100,
        help_text='Store values with confidence greater than (%).',
    )
    delete_suggestions = checkbox_field(
        "Delete ClassifierSuggestions of Classifier specified above.")

    # Mandatory project selection, newest (highest pk) first.
    project = forms.ModelChoiceField(
        label='Restrict to project',
        required=True,
        queryset=Project.objects.order_by('-pk'),
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
    )

    field_order = [
        'classifier',
        'project',
        'min_confidence',
        'delete_suggestions',
    ]
Ejemplo n.º 14
0
class ExistedClassifierClassifyForm(forms.Form):
    """Options for classifying text units with an already trained classifier."""

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Classify Text Units using an existing Classifier.'

    # The choices are given as a callable so the active classifiers are read
    # when a form instance is created. A plain list here would run the query
    # once at import time and never show classifiers activated later.
    classifier = forms.ChoiceField(
        choices=lambda: [
            (c.pk, c.name)
            for c in TextUnitClassifier.objects.filter(is_active=True)
        ],
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
        required=True)

    # Optional cap on the number of documents; must be at least 1 when given.
    sample_size = forms.IntegerField(
        min_value=1,
        required=False,
        help_text='Number of Documents to process. Leave blank to process all Documents.')

    # Confidence threshold in percent, limited to 0..100, 90 by default.
    min_confidence = forms.IntegerField(
        min_value=0,
        max_value=100,
        initial=90,
        required=True,
        help_text='Store values with confidence greater than (%).')
    delete_suggestions = checkbox_field(
        "Delete ClassifierSuggestions of Classifier specified above.")
Ejemplo n.º 15
0
class LocateForm(forms.Form):
    """Options for locating terms and entities in existing text units.

    Every locator contributes a ``<locator>_locate`` checkbox plus one or
    more ``<locator>_*`` child options. On instantiation, fields whose
    locator prefix is not among the configured locators are removed.
    """

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Locate specific terms in existing text units.'

    locate_all = checkbox_field(
        label="Locate all items / Reverse choice",
        label_class='main-label')

    geoentity_locate = locate_field("Geo Entities and Geo Aliases", parent_class='')
    geoentity_priority = child_field(
        label="Use first entity occurrence to resolve ambiguous entities",
        child_class='')
    geoentity_delete = child_field(
        label="Delete existing Geo Entity Usages and Geo Alias Usages",
        child_class='')

    date_locate = locate_field(label='Dates', parent_class='')
    date_strict = child_field(label="Strict", child_class='')
    date_delete = child_field("Date", child_class='')

    amount_locate = locate_field('Amounts')
    amount_delete = child_field("Amount")

    citation_locate = locate_field("Citations")
    citation_delete = child_field("Citation")

    copyright_locate = locate_field("Copyrights")
    copyright_delete = child_field("Copyright")

    court_locate = locate_field('Courts')
    court_delete = child_field('Court')

    currency_locate = locate_field('Currencies')
    currency_delete = child_field('Currency')

    duration_locate = locate_field('Date Durations')
    duration_delete = child_field('Date Duration')

    definition_locate = locate_field('Definitions')
    definition_delete = child_field('Definition')

    distance_locate = locate_field('Distances')
    distance_delete = child_field('Distance')

    party_locate = locate_field('Parties')
    party_delete = child_field('Parties and Party Usages')

    percent_locate = locate_field('Percents')
    percent_delete = child_field('Percent')

    ratio_locate = locate_field('Ratios')
    ratio_delete = child_field('Ratio')

    regulation_locate = locate_field('Regulations')
    regulation_delete = child_field('Regulation')

    term_locate = locate_field('Terms')
    term_delete = child_field('Term')

    trademark_locate = locate_field('Trademarks')
    trademark_delete = child_field('Trademark')

    url_locate = locate_field('Urls')
    url_delete = child_field('Url')

    # Which text unit types to parse; 'paragraphs' by default.
    parse = LTRRadioField(
        choices=(('paragraphs', 'Parse Text Units with "paragraph" type'),
                 ('sentences', 'Parse Text Units with both "paragraph" and "sentence" types')),
        help_text='Warning! Parsing both "paragraph" and "sentence" Text Unit types'
                  ' will take much more time',
        initial='paragraphs',
        required=False)

    def __init__(self, *args, **kwargs):
        """Drop the fields of locators that are not configured.

        A field is kept when it is ``parse`` or ``locate_all``, or when the
        part of its name before the first underscore is listed in
        ``settings.REQUIRED_LOCATORS`` or ``config.standard_optional_locators``.
        """
        super().__init__(*args, **kwargs)

        # The configured locators do not change while filtering, so collect
        # them once instead of rebuilding the list for every field.
        available_locators = (set(settings.REQUIRED_LOCATORS)
                              | set(config.standard_optional_locators))

        # Iterate over a snapshot of the names: fields are deleted in the loop.
        for field in list(self.fields):
            if field in ('parse', 'locate_all'):
                continue
            locator_name = field.split('_')[0]
            if locator_name not in available_locators:
                del self.fields[field]
Ejemplo n.º 16
0
class ClusterForm(forms.Form):
    """Options for clustering documents or text units.

    General options come first, followed by algorithm-specific parameters
    whose field names are prefixed with the algorithm they belong to
    (``kmeans_``, ``minibatchkmeans_``, ``birch_``, ``dbscan_``).
    Validation requires at least one of the two clustering targets.
    """

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Clustering Documents and/or Text Units by Terms, Entities or Parties.'

    # Clustering targets. Both checkboxes carry the 'max-one-of' input class.
    do_cluster_documents = checkbox_field("Cluster Documents",
                                          initial=True,
                                          input_class='max-one-of')
    do_cluster_text_units = checkbox_field("Cluster Text Units",
                                           input_class='max-one-of')

    # Mandatory project selection, newest (highest pk) first.
    project = forms.ModelChoiceField(
        queryset=Project.objects.order_by('-pk'),
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
        required=True,
        label='Restrict to project')

    # One choice per entry of DocumentFeatures.source_fields.
    cluster_by = forms.MultipleChoiceField(
        widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
        choices=[(i, i) for i in DocumentFeatures.source_fields],
        initial='term',
        required=True,
        help_text='Cluster by terms, parties or other fields.')

    # Clustering algorithm. The initial value must match one of the choice
    # keys exactly, otherwise it has no effect.
    using = forms.ChoiceField(
        label='Algorithm',
        choices=[
            ('minibatchkmeans', 'MiniBatchKMeans'),
            ('kmeans', 'KMeans'),
            ('birch', 'Birch'),
            ('dbscan', 'DBSCAN'),
            # ('LabelSpreading', 'LabelSpreading')
        ],
        required=True,
        initial='minibatchkmeans',
        help_text='Clustering algorithm model name.')
    n_clusters = forms.IntegerField(label='n_clusters',
                                    min_value=1,
                                    initial=3,
                                    required=True,
                                    help_text='Number of clusters.')
    name = forms.CharField(max_length=100, required=True)
    description = forms.CharField(max_length=200, required=False)

    # Built from the module-level options_field_kwargs; its configuration is
    # defined outside this class.
    options = forms.BooleanField(**options_field_kwargs)

    # --- KMeans parameters ---
    kmeans_max_iter = forms.IntegerField(
        label='max_iter',
        min_value=1,
        initial=100,
        required=True,
        help_text='Maximum number of iterations for a single run.')
    kmeans_n_init = forms.IntegerField(
        label='n_init',
        min_value=1,
        initial=10,
        required=True,
        help_text=
        'Number of time the k-means algorithm will be run with different centroid seeds. '
        'The final results will be the best output of n_init consecutive runs in '
        'terms of inertia.')

    # --- MiniBatchKMeans parameters ---
    minibatchkmeans_batch_size = forms.IntegerField(
        label='batch_size',
        min_value=1,
        initial=100,
        required=True,
        help_text='Size of the mini batches.')

    # --- Birch parameters ---
    birch_threshold = forms.FloatField(
        label='threshold',
        min_value=0,
        initial=0.5,
        required=True,
        help_text=
        'The radius of the subcluster obtained by merging a new sample and the closest '
        'subcluster should be lesser than the threshold.'
        ' Otherwise a new subcluster is started.')
    birch_branching_factor = forms.IntegerField(
        label='branching_factor',
        min_value=1,
        initial=50,
        required=True,
        help_text='Maximum number of CF subclusters in each node.')

    # --- DBSCAN parameters ---
    dbscan_eps = forms.FloatField(
        label='eps',
        min_value=0,
        initial=0.5,
        required=True,
        help_text=
        'The maximum distance between two samples for them to be considered '
        'as in the same neighborhood.')
    dbscan_leaf_size = forms.IntegerField(
        label='leaf_size',
        min_value=1,
        initial=30,
        required=True,
        help_text='Leaf size passed to BallTree or cKDTree. '
        'This can affect the speed of the construction and query, '
        'as well as the memory required to store the tree.')
    # Optional. The help text describes "p" itself; it previously repeated
    # the leaf_size description.
    dbscan_p = forms.FloatField(
        label='p',
        min_value=0,
        required=False,
        help_text='The power of the Minkowski metric to be used to calculate '
        'distance between points.')

    # ls_documents_property = forms.Field()
    # ls_text_units_property = forms.Field()
    # ls_max_iter = forms.IntegerField(
    #     label='max_iter',
    #     min_value=1,
    #     initial=5,
    #     required=True,
    #     help_text='Maximum number of iterations allowed.')

    # delete_type = checkbox_field(
    #     'Delete existed Clusters of the "Cluster By" and "Algorithm" specified above',
    #     input_class='max-one-of')
    # delete = checkbox_field("Delete all existed Clusters", input_class='max-one-of')

    # def __init__(self, *args, **kwargs):
    #     super().__init__(*args, **kwargs)
    #     if TextUnitProperty.objects.exists():
    #         choices = [(p, p) for p in sorted(
    #             set(TextUnitProperty.objects.values_list('key', flat=True)),
    #             key=lambda i: i.lower())]
    #         self.fields['ls_text_units_property'] = forms.ChoiceField(
    #             label='Text Unit Property Name',
    #             widget=forms.widgets.Select(attrs={'class': 'chosen'}),
    #             choices=choices,
    #             required=True,
    #             initial=choices[0][0])
    #     else:
    #         del self.fields['ls_text_units_property']
    #     if DocumentProperty.objects.exists():
    #         choices = [(p, p) for p in sorted(
    #             set(DocumentProperty.objects.values_list('key', flat=True)),
    #             key=lambda i: i.lower())]
    #         self.fields['ls_documents_property'] = forms.ChoiceField(
    #             label='Document Property Name',
    #             widget=forms.widgets.Select(attrs={'class': 'chosen'}),
    #             choices=choices,
    #             required=True,
    #             initial=choices[0][0])
    #     else:
    #         del self.fields['ls_documents_property']
    #     if not DocumentProperty.objects.exists() and not TextUnitProperty.objects.exists():
    #         self.fields['using'].choices = self.fields['using'].choices[:-1]

    def clean(self):
        """Require at least one clustering target.

        When neither checkbox is set, the same error is attached to both
        target fields.

        :return: the cleaned data dictionary.
        """
        cleaned_data = super().clean()
        do_cluster_documents = cleaned_data.get("do_cluster_documents")
        do_cluster_text_units = cleaned_data.get("do_cluster_text_units")
        if not (do_cluster_documents or do_cluster_text_units):
            message = 'Please choose either Documents or Text Units'
            self.add_error('do_cluster_documents', message)
            self.add_error('do_cluster_text_units', message)
        return cleaned_data
Ejemplo n.º 17
0
class BaseTrainClassifierForm(forms.Form):
    """Shared options for training a classifier.

    Algorithm-specific parameters are prefixed with the algorithm they
    configure (``svc_``, ``mnb_``, ``rfc_etc_``, ``lrcv_``); general options
    follow and are arranged through ``field_order``.

    ``__init__`` reads ``self.classification_db_model``, which is not defined
    in this class, so subclasses are expected to provide it.
    """

    # Built from the module-level options_field_kwargs; its configuration is
    # defined outside this class.
    options = forms.BooleanField(**options_field_kwargs)

    # --- SVC parameters ---
    svc_C = forms.FloatField(
        label='C',
        required=True,
        initial=1.0,
        min_value=0,
        help_text='Penalty parameter C of the error term.',
    )
    svc_kernel = forms.ChoiceField(
        label='kernel',
        required=True,
        initial='rbf',
        choices=[
            ('rbf', 'rbf'),
            ('linear', 'linear'),
            ('poly', 'poly'),
            ('sigmoid', 'sigmoid'),
            ('precomputed', 'precomputed'),
        ],
        help_text='Specifies the kernel type to be used in the algorithm.',
    )
    # Text input; clean_svc_gamma() converts numeric input to float.
    svc_gamma = forms.CharField(
        label='gamma',
        required=False,
        initial='scale',
        max_length=6,
        help_text="{'scale', 'auto'} or float, optional (default='scale')."
                  " Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.",
    )

    # --- MultinomialNB parameters ---
    mnb_alpha = forms.FloatField(
        label='alpha',
        required=True,
        initial=1.0,
        min_value=0,
        help_text='Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).',
    )

    # --- RandomForest / ExtraTrees parameters ---
    rfc_etc_n_estimators = forms.IntegerField(
        label='n_estimators',
        required=True,
        initial=10,
        min_value=1,
        help_text='The number of trees in the forest.',
    )
    rfc_etc_criterion = forms.ChoiceField(
        label='criterion',
        required=True,
        initial='gini',
        choices=[('gini', 'gini'), ('entropy', 'entropy')],
        help_text='The function to measure the quality of a split.',
    )
    rfc_etc_max_features = forms.IntegerField(
        label='max_features',
        required=False,
        min_value=1,
        help_text='The number of features to consider when looking for the best split.'
                  ' Integer or blank for "auto".',
    )
    rfc_etc_max_depth = forms.IntegerField(
        label='max_depth',
        required=False,
        min_value=1,
        help_text='The maximum depth of the tree.'
                  ' If None, then nodes are expanded until all leaves are pure'
                  ' or until all leaves contain less than min_samples_split samples.',
    )
    rfc_etc_min_samples_split = forms.IntegerField(
        label='min_samples_split',
        required=True,
        initial=2,
        min_value=1,
        help_text='The minimum number of samples required to split an internal node.',
    )
    rfc_etc_min_samples_leaf = forms.IntegerField(
        label='min_samples_leaf',
        required=True,
        initial=1,
        min_value=1,
        help_text='The minimum number of samples required to be at a leaf node.',
    )

    # --- LogisticRegressionCV parameters ---
    lrcv_Cs = forms.IntegerField(
        label='Cs',
        required=True,
        initial=10,
        min_value=1,
        help_text='Each of the values in Cs describes the inverse of regularization strength.',
    )
    lrcv_fit_intercept = forms.BooleanField(
        label='fit_intercept',
        required=False,
        help_text='Specifies if a constant (a.k.a. bias or intercept)'
                  ' should be added to the decision function.',
    )
    lrcv_multi_class = forms.ChoiceField(
        label='multi_class',
        required=True,
        initial='ovr',
        choices=[('ovr', 'ovr'), ('multinomial', 'multinomial')],
        help_text='If the option chosen is ‘ovr’, then a binary problem is fit for each label. '
                  'Else the loss minimised is the multinomial loss fit across the '
                  'entire probability distribution. '
                  'Works only for the ‘newton-cg’, ‘sag’ and ‘lbfgs’ solver.',
    )
    lrcv_solver = forms.ChoiceField(
        label='solver',
        required=True,
        initial='lbfgs',
        choices=[
            ('lbfgs', 'lbfgs'),
            ('newton-cg', 'newton-cg'),
            ('liblinear', 'liblinear'),
            ('sag', 'sag'),
        ],
        help_text='Algorithm to use in the optimization problem.',
    )

    # --- General options ---
    # Placeholder with no choices; __init__ replaces it per instance.
    class_name = forms.ChoiceField(
        required=True,
        choices=[],
        help_text='Classifier class name',
    )
    classifier_name = forms.CharField(
        required=True,
        max_length=100,
        help_text='Classifier name',
    )
    use_tfidf = checkbox_field("Use TF-IDF to normalize data")
    delete_classifier = checkbox_field(
        "Delete existing Classifiers of class name specified above.")

    # Optional restriction to one or more projects, newest (highest pk) first.
    project = forms.ModelMultipleChoiceField(
        label='Restrict to project',
        required=False,
        queryset=Project.objects.all().order_by('-pk'),
        widget=forms.SelectMultiple(attrs={'class': 'chosen compact'}),
    )
    # Free-text alternative to the explicit project selection above.
    project_name_filter = forms.CharField(
        required=False,
        max_length=100,
        help_text='Project name filter as an alternative to "Project" option. '
                  'Use "%" as a wildcard symbol.',
    )
    # One choice per entry of DocumentFeatures.source_fields.
    classify_by = forms.MultipleChoiceField(
        required=True,
        initial='term',
        choices=[(source, source) for source in DocumentFeatures.source_fields],
        widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
        help_text='Classify by terms, parties or other fields.',
    )
    algorithm = forms.ChoiceField(
        required=True,
        initial='RandomForestClassifier',
        choices=CLASSIFIER_NAME_CHOICES,
        help_text='Classifier algorithm name.',
    )
    metric_pos_label = forms.CharField(
        required=False,
        max_length=100,
        help_text='Positive label for "f1", "precision", "recall" accuracy metrics.',
    )
    unit_type = forms.ChoiceField(
        required=True,
        initial='sentence',
        choices=[
            ('sentence', 'sentence'),
            ('paragraph', 'paragraph'),
        ],
        help_text='Text Unit type.',
    )

    field_order = [
        'project',
        'project_name_filter',
        'unit_type',
        'algorithm',
        'use_tfidf',
        'class_name',
        'classify_by',
        'classifier_name',
        'metric_pos_label',
        'delete_classifier',
    ]

    def __init__(self, *args, **kwargs):
        """Replace ``class_name`` with a field listing the stored class names.

        The distinct ``class_name`` values are read from
        ``self.classification_db_model`` each time a form is instantiated.
        """
        super().__init__(*args, **kwargs)
        known_names = set(
            self.classification_db_model.objects.values_list('class_name',
                                                             flat=True))
        self.fields['class_name'] = forms.ChoiceField(
            help_text='Classification class name',
            required=True,
            choices=[(known, known) for known in known_names],
        )

    def clean_svc_gamma(self):
        """Return ``svc_gamma`` as a float when it parses as one.

        Input that ``float()`` rejects with ``ValueError`` (for example
        'scale', 'auto' or an empty string) is returned unchanged.
        """
        raw_gamma = self.cleaned_data['svc_gamma']
        try:
            return float(raw_gamma)
        except ValueError:
            return raw_gamma
Ejemplo n.º 18
0
class LocateForm(forms.Form):
    """Options for locating terms and entities in existing text units.

    Every locator contributes a ``<locator>_locate`` checkbox plus one or
    more ``<locator>_*`` child options. On instantiation, fields whose
    locator prefix is not among the configured locators are removed.
    Validation requires at least one selected locator and at least one text
    unit type in ``parse``.
    """

    # Title string for the form (presumably rendered by the task template
    # -- verify against the caller).
    header = 'Locate specific terms in existing text units.'

    locate_all = checkbox_field(label="Locate all items / Reverse choice",
                                label_class='main-label')

    geoentity_locate = locate_field("Geo Entities and Geo Aliases",
                                    parent_class='')
    geoentity_priority = child_field(
        label="Use first entity occurrence to resolve ambiguous entities",
        child_class='')
    geoentity_delete = child_field(
        label="Delete existing Geo Entity Usages and Geo Alias Usages",
        child_class='')

    date_locate = locate_field(label='Dates', parent_class='')
    date_strict = child_field(label="Strict", child_class='')
    date_delete = child_field("Date", child_class='')

    amount_locate = locate_field('Amounts')
    amount_delete = child_field("Amount")

    citation_locate = locate_field("Citations")
    citation_delete = child_field("Citation")

    copyright_locate = locate_field("Copyrights")
    copyright_delete = child_field("Copyright")

    court_locate = locate_field('Courts')
    court_delete = child_field('Court')

    currency_locate = locate_field('Currencies')
    currency_delete = child_field('Currency')

    duration_locate = locate_field('Date Durations')
    duration_delete = child_field('Date Duration')

    definition_locate = locate_field('Definitions')
    definition_delete = child_field('Definition')

    distance_locate = locate_field('Distances')
    distance_delete = child_field('Distance')

    party_locate = locate_field('Parties')
    party_delete = child_field('Parties and Party Usages')

    percent_locate = locate_field('Percents')
    percent_delete = child_field('Percent')

    ratio_locate = locate_field('Ratios')
    ratio_delete = child_field('Ratio')

    regulation_locate = locate_field('Regulations')
    regulation_delete = child_field('Regulation')

    term_locate = locate_field('Terms')
    term_delete = child_field('Term')

    trademark_locate = locate_field('Trademarks')
    trademark_delete = child_field('Trademark')

    url_locate = locate_field('Urls')
    url_delete = child_field('Url')

    # Text unit types to search in; several may be selected.
    parse = forms.MultipleChoiceField(widget=LTRCheckgroupWidget(),
                                      choices=(('sentence',
                                                'Find in sentences'),
                                               ('paragraph',
                                                'Find in paragraphs')),
                                      label="Text units where to find terms")
    # Previous single-choice variant, kept for reference:
    # parse = LTRRadioField(
    #     choices=(('sentence', 'Parse Text Units with "sentence" types'),
    #              ('paragraph', 'Parse Text Units with "paragraph" type')),
    #     initial='sentence',
    #     required=False)

    # Optional target project, newest (highest pk) first.
    project = forms.ModelChoiceField(queryset=Project.objects.order_by('-pk'),
                                     required=False)

    def __init__(self, *args, **kwargs):
        """Drop the fields of locators that are not configured.

        A field is kept when it is ``parse``, ``locate_all`` or ``project``,
        or when the part of its name before the first underscore is listed in
        ``STANDARD_LOCATORS`` or ``OPTIONAL_LOCATORS``.
        """
        super().__init__(*args, **kwargs)

        # Imported here rather than at module level (presumably to avoid a
        # circular import -- verify before moving it).
        from apps.extract.app_vars import STANDARD_LOCATORS, OPTIONAL_LOCATORS
        available_locators = set(STANDARD_LOCATORS.val) | set(
            OPTIONAL_LOCATORS.val)

        # Iterate over a snapshot of the names: fields are deleted in the loop.
        for field in list(self.fields.keys()):
            if field in ['parse', 'locate_all', 'project']:
                continue
            field_name = field.split('_')[0]
            if field_name not in available_locators:
                del self.fields[field]

    def clean(self):
        """Require at least one selected "locate" option.

        The check lives in ``clean()`` so the error is recorded during
        validation itself: ``is_valid()`` then reports the form as invalid,
        and repeated ``is_valid()`` calls do not append duplicate messages.

        :return: the cleaned data dictionary.
        """
        cleaned_data = super().clean()
        if cleaned_data is None:
            cleaned_data = self.cleaned_data

        has_locate_chosen = any(
            'locate' in name and value is True
            for name, value in cleaned_data.items())
        if not has_locate_chosen:
            self.add_error('locate_all', 'Please choose a locator.')
        return cleaned_data

    def is_valid(self):
        """Return True when the form validates and ``parse`` is not empty.

        ``cleaned_data`` is only read after the base validation succeeded, so
        unbound forms simply yield ``False``.
        """
        if not super(LocateForm, self).is_valid():
            return False

        # check at least one "parse" choice is selected
        return bool(self.cleaned_data.get('parse'))
Ejemplo n.º 19
0
class CreateClassifierClassifyForm(forms.Form):
    """Form with the settings for creating a Classifier and classifying Text Units.

    Besides the general settings (feature source, algorithm, class name,
    sample size, confidence threshold) the form carries per-algorithm
    hyper-parameters. Judging by the field-name prefixes these appear to map
    to the algorithms in ``CLASSIFIER_NAME_CHOICES``: ``svc_`` (SVC),
    ``mnb_`` (MultinomialNB), ``rfc_etc_`` (RandomForestClassifier /
    ExtraTreesClassifier) and ``lrcv_`` (LogisticRegressionCV).

    The ``class_name`` choices are read from the existing
    ``TextUnitClassification`` rows each time the form is instantiated.
    """
    header = 'Classify Text Units by creating a new Classifier.'
    CLASSIFIER_NAME_CHOICES = (
        ('LogisticRegressionCV', 'LogisticRegressionCV'),
        ('MultinomialNB', 'MultinomialNB'),
        ('ExtraTreesClassifier', 'ExtraTreesClassifier'),
        ('RandomForestClassifier', 'RandomForestClassifier'),
        ('SVC', 'SVC'),
    )

    # --- General settings ---
    classify_by = forms.ChoiceField(
        choices=[('terms', 'Terms'),
                 ('parties', 'Parties'),
                 ('entities', 'Geo Entities')],
        required=True,
        help_text='Classify using terms, parties or geo entities.')
    algorithm = forms.ChoiceField(
        choices=CLASSIFIER_NAME_CHOICES,
        required=True,
        initial='LogisticRegressionCV',
        help_text='Text Unit Classifier name')
    # Declared here only to fix the field's position in the form. The choices
    # are filled in by __init__, so no database query runs when this module
    # is imported.
    class_name = forms.ChoiceField(
        choices=(),
        required=True,
        help_text='Text Unit class name')
    sample_size = forms.IntegerField(
        min_value=1,
        required=False,
        help_text='Number of Documents to process. Leave blank to process all Documents.')
    min_confidence = forms.IntegerField(
        min_value=0,
        max_value=100,
        initial=90,
        required=True,
        help_text='Store values with confidence greater than (%).')
    options = forms.BooleanField(**options_field_kwargs)

    # --- svc_* parameters ---
    svc_c = forms.FloatField(
        label='C',
        min_value=0,
        initial=1.0,
        required=True,
        help_text='Penalty parameter C of the error term.')
    svc_kernel = forms.ChoiceField(
        label='kernel',
        choices=[('rbf', 'rbf'),
                 ('linear', 'linear'),
                 ('poly', 'poly'),
                 ('sigmoid', 'sigmoid'),
                 ('precomputed', 'precomputed')],
        required=True,
        initial='rbf',
        help_text='Specifies the kernel type to be used in the algorithm.')
    svc_gamma = forms.FloatField(
        label='gamma',
        min_value=0,
        required=False,
        help_text='Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. '
                  'If gamma is ‘auto’ then 1/n_features will be used instead.')

    # --- mnb_* parameters ---
    mnb_alpha = forms.FloatField(
        label='alpha',
        min_value=0,
        initial=1.0,
        required=True,
        help_text='Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).')

    # --- rfc_etc_* parameters ---
    rfc_etc_n_estimators = forms.IntegerField(
        label='n_estimators',
        min_value=1,
        initial=10,
        required=True,
        help_text='The number of trees in the forest.')
    rfc_etc_criterion = forms.ChoiceField(
        label='criterion',
        choices=[('gini', 'gini'),
                 ('entropy', 'entropy')],
        required=True,
        initial='gini',
        help_text='The function to measure the quality of a split.')
    rfc_etc_max_features = forms.IntegerField(
        label='max_features',
        min_value=1,
        required=False,
        help_text='The number of features to consider when looking for the best split.'
                  ' Integer or blank for "auto".')
    rfc_etc_max_depth = forms.IntegerField(
        label='max_depth',
        min_value=1,
        required=False,
        help_text='The maximum depth of the tree.'
                  ' If None, then nodes are expanded until all leaves are pure'
                  ' or until all leaves contain less than min_samples_split samples.')
    rfc_etc_min_samples_split = forms.IntegerField(
        label='min_samples_split',
        min_value=1,
        initial=2,
        required=True,
        help_text='The minimum number of samples required to split an internal node.')
    rfc_etc_min_samples_leaf = forms.IntegerField(
        label='min_samples_leaf',
        min_value=1,
        initial=1,
        required=True,
        help_text='The minimum number of samples required to be at a leaf node.')

    # --- lrcv_* parameters ---
    lrcv_cs = forms.IntegerField(
        label='cs',
        min_value=1,
        initial=10,
        required=True,
        help_text='Each of the values in Cs describes the inverse of regularization strength.')
    lrcv_fit_intercept = forms.BooleanField(
        label='fit_intercept',
        required=False,
        help_text='Specifies if a constant (a.k.a. bias or intercept)'
                  ' should be added to the decision function.')
    lrcv_multi_class = forms.ChoiceField(
        label='multi_class',
        choices=[('ovr', 'ovr'),
                 ('multinomial', 'multinomial')],
        required=True,
        initial='ovr',
        help_text='If the option chosen is ‘ovr’, then a binary problem is fit for each label. '
                  'Else the loss minimised is the multinomial loss fit across the '
                  'entire probability distribution. '
                  'Works only for the ‘newton-cg’, ‘sag’ and ‘lbfgs’ solver.')
    lrcv_solver = forms.ChoiceField(
        label='solver',
        choices=[('lbfgs', 'lbfgs'),
                 ('newton-cg', 'newton-cg'),
                 ('liblinear', 'liblinear'),
                 ('sag', 'sag')],
        required=True,
        initial='lbfgs',
        help_text='Algorithm to use in the optimization problem.')

    # --- Data preparation and clean-up flags ---
    use_tfidf = checkbox_field(
        "Use TF-IDF to normalize data")
    delete_classifier = checkbox_field(
        "Delete existing Classifiers of class name specified above.")
    delete_suggestions = checkbox_field(
        "Delete ClassifierSuggestions of class name specified above.")

    def __init__(self, *args, **kwargs):
        """Build the form and load the current ``class_name`` choices.

        The distinct class names are queried on every instantiation so the
        form always offers the classes that exist at that moment. They are
        de-duplicated and sorted by the database, which gives a stable
        option order.
        """
        super().__init__(*args, **kwargs)
        # order_by() replaces any default model ordering, so DISTINCT is
        # applied to the ``class_name`` column alone.
        class_names = (TextUnitClassification.objects
                       .order_by('class_name')
                       .values_list('class_name', flat=True)
                       .distinct())
        # ``self.fields`` holds per-instance copies, so this does not leak
        # choices into the class-level declaration.
        self.fields['class_name'].choices = [
            (class_name, class_name) for class_name in class_names]
Ejemplo n.º 20
0
def locate_field(label, parent_class='checkbox-parent'):
    """Return a checkbox field for ``label`` created by ``checkbox_field``.

    ``parent_class`` is passed on as the ``input_class`` argument of
    ``checkbox_field``; it defaults to ``'checkbox-parent'``. What
    ``checkbox_field`` does with that value (presumably a CSS class on the
    rendered input) is defined by that helper -- confirm there.
    """
    return checkbox_field(label, input_class=parent_class)
Ejemplo n.º 21
0
class LocateTermsForm(forms.Form):
    header = 'Locate Terms in existing Text Units.'
    delete = checkbox_field("Delete existing Term Usages", initial=True)