Beispiel #1
0
    def set_attributes(self, schema_file: Union[str, Path] = '', encoding: str = None) -> Set:
        """
        Register the attributes listed in a schema configuration file as Span extensions.

        Only files whose name ends with "conf" are parsed. Rows inside the
        ``[attributes]`` section are read; the first whitespace-delimited token of
        each row is taken as the attribute name, e.g.::

            [attributes]
            Negation        Arg:<EVENT>
            Confidence        Arg:<EVENT>, Value:Possible|Likely|Certain

        The current version SpaCy doesn't differentiate attributes for different annotation types.
        Thus, any attributes extended here will be applied to all Spans.

        @param schema_file: initiate Span attributes using the schema configuration file
        @param encoding: text encoding used to read the schema file
        @return: a set of attribute names that were newly registered by this call
        """
        schema_file = self.check_file_validity(schema_file, False)
        attr_names = set()
        if schema_file is None or not schema_file.name.endswith("conf"):
            return attr_names

        in_attributes = False
        for raw_row in schema_file.read_text(encoding=encoding).split("\n"):
            # Strip so that indented rows and CRLF line endings are handled uniformly.
            row = raw_row.strip()
            if not row or row.startswith('#'):
                continue
            if row.startswith('['):
                # Any section header switches parsing on ([attributes]) or off (all others).
                in_attributes = row.startswith('[attributes]')
                continue
            if not in_attributes:
                continue
            # The name is separated from its arguments by whitespace; accept tabs or any
            # number of spaces rather than relying on a fixed run of eight spaces.
            name = row.split(None, 1)[0]
            if name not in attr_names and not Span.has_extension(name):
                Span.set_extension(name, default=None)
                attr_names.add(name)
        self.schema_set = True
        return attr_names
Beispiel #2
0
    def __init__(self, nlp: Language = None, support_overlap: bool = False,
                 log_level: int = logging.WARNING, encoding: str = None, doc_name_depth: int = 0,
                 schema_file: Union[str, Path] = '', store_anno_string: bool = False,
                 **kwargs):
        """
        Register schema-driven Span attributes, then hand all settings to the parent initializer.

        @param nlp: Spacy Language model
        @param support_overlap: whether overlapped annotations need to be supported
        @param log_level: logging level configuration
        @param encoding: txt encoding
        @param doc_name_depth: depth of parent directories to add into doc_name
                default is 0: only use file name
                1: use 1 level parent directory name + file name
                -1: use full absolute path
                if you are dealing with multiple directories, this is helpful to
                locate the original files
        @param schema_file: initiate Span attributes using eHOST schema configuration file
        @param store_anno_string: whether to read the annotated string from annotations to
                double check the parsed Span's correctness
        @param kwargs: other parameters, forwarded unchanged to the parent initializer
        """
        # schema_set is reset first; set_attributes flips it once a schema has been processed.
        self.schema_set = False
        self.attr_names = self.set_attributes(schema_file=schema_file, encoding=encoding)

        # Register the "span_txt" extension only on request and only if it is not there yet.
        if store_anno_string and not Span.has_extension("span_txt"):
            Span.set_extension("span_txt", default="")

        super().__init__(
            nlp=nlp,
            support_overlap=support_overlap,
            log_level=log_level,
            encoding=encoding,
            doc_name_depth=doc_name_depth,
            schema_file=schema_file,
            store_anno_string=store_anno_string,
            **kwargs
        )
    def __call__(self, spacy_span: Span, describer=None):
        """
        Build an issue from a spaCy Span; a convenience wrapper around make_issue.

        The span's text and character offsets are passed through. The score and
        suggestions are read from the span's ``_.score`` / ``_.suggestions``
        extensions when those extensions are registered; otherwise 0 and an empty
        list are used. When ``describer`` is given (and truthy) it is called with
        the span to produce the description; otherwise ``self.description`` is used.

        usage example:

        ```python
        from spacy.tokens import Span
        from app.factor import SpacyFactor


        SOV = SpacyFactor(
            "subject_object_verb_spacing",
            "Keep the subject, verb, and object of a sentence close together to help the reader understand the sentence."
        )

        Span.set_extension("score", default=0)
        Span.set_extension("suggestions", default=[])

        doc = nlp("Holders of the Class A and Class B-1 certificates will be entitled to receive on each Payment Date, to the extent monies are available therefor (but not more than the Class A Certificate Balance or Class B-1 Certificate Balance then outstanding), a distribution.")
        score = analyze(doc)
        if score is not None:
            span = Span(doc, 0, len(doc))  # or whichever TOKENS are the issue (don't have to worry about character indexes)
            span._.score = score
            span._.suggestions = get_suggestions(doc)
            issues = SOV(span)
        ```
        """
        span_text = spacy_span.text
        char_start = spacy_span.start_char
        char_end = spacy_span.end_char

        # Fall back to neutral values when the custom extensions are not registered.
        if spacy_span.has_extension("score"):
            issue_score = spacy_span._.score
        else:
            issue_score = 0

        if spacy_span.has_extension("suggestions"):
            issue_suggestions = spacy_span._.suggestions
        else:
            issue_suggestions = []

        issue_description = describer(spacy_span) if describer else self.description

        return make_issue(span_text, char_start, char_end,
                          issue_type=self.issue_type,
                          score=issue_score,
                          description=issue_description,
                          suggestions=issue_suggestions)
Beispiel #4
0
    def __init__(self,
                 nlp: Language = None,
                 support_overlap: bool = False,
                 log_level: int = logging.WARNING,
                 encoding: str = None,
                 doc_name_depth: int = 0,
                 schema_file: Union[str, Path] = '',
                 store_anno_string: bool = False,
                 use_adjudication: bool = False,
                 **kwargs):
        """
        Register schema-driven Span attributes, then hand all settings to the parent initializer.

        @param nlp: a SpaCy language model
        @param support_overlap: whether the reader needs to support reading overlapped annotations.
            Because SpaCy's Doc.ents does not allow overlapped Spans, to support overlapping, Spans
            need to be stored somewhere else----Doc._.concepts
        @param log_level: set the logger's logging level. To debug, set to logging.DEBUG
        @param encoding: txt encoding
        @param doc_name_depth: depth of parent directories to add into doc_name
                default is 0: only use file name
                1: use 1 level parent directory name + file name
                -1: use full absolute path
                if you are dealing with multiple directories, this is helpful to
                locate the original files
        @param schema_file: initiate Span attributes using eHOST schema configuration file
        @param store_anno_string: whether to read the annotated string from annotations to
                double check the parsed Span's correctness
        @param use_adjudication: whether to read annotations from the adjudication folder
        @param kwargs: other parameters, forwarded unchanged to the parent initializer
        """
        # schema_set is reset first; set_attributes flips it once a schema has been processed.
        self.schema_set = False
        self.attr_names = self.set_attributes(schema_file=schema_file, encoding=encoding)

        # Register the "span_txt" extension only on request and only if it is not there yet.
        if store_anno_string and not Span.has_extension("span_txt"):
            Span.set_extension("span_txt", default="")

        super().__init__(nlp=nlp, support_overlap=support_overlap, log_level=log_level,
                         encoding=encoding, doc_name_depth=doc_name_depth,
                         schema_file=schema_file, store_anno_string=store_anno_string,
                         use_adjudication=use_adjudication, **kwargs)
Beispiel #5
0
    def __init__(self,
                 first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
                 last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):
        """
        Store the extension names used by this component and register its extensions.

        The token, span and doc extension names are copied from the class-level
        constants. Each extension is registered only if it is not registered yet:
        the Token extension defaults to ``self.ANOT_NONE``, the Span extension is
        computed by ``self.is_full_name_getter``, and the Doc extension defaults
        to an empty list.

        @param first_name_extension_name: stored as ``self.first_name_extension_name``
            (presumably the extension written by the first-name matcher; confirm against usage)
        @param last_name_extension_name: stored as ``self.last_name_extension_name``
            (presumably the extension written by the last-name matcher; confirm against usage)
        """
        token_ext = self.token_extension_name = self.TOKEN_EXTENSION_NAME
        span_ext = self.span_extension_name = self.SPAN_EXTENSION_NAME
        doc_ext = self.doc_extension_name = self.DOC_EXTENSION_NAME
        self.first_name_extension_name = first_name_extension_name
        self.last_name_extension_name = last_name_extension_name

        # Extensions are registered on the classes; skip any that already exist.
        token_registered = Token.has_extension(token_ext)
        if token_registered is False or not token_registered:
            Token.set_extension(token_ext, default=self.ANOT_NONE)

        span_registered = Span.has_extension(span_ext)
        if not span_registered:
            Span.set_extension(span_ext, getter=self.is_full_name_getter)

        doc_registered = Doc.has_extension(doc_ext)
        if not doc_registered:
            Doc.set_extension(doc_ext, default=[])
Beispiel #6
0
    def set_attributes(self,
                       schema_file: Union[str, Path] = '',
                       encoding: str = None) -> Set:
        """
        Register the attributes defined in an XML schema file as Span extensions.

        Every ``attributeDef`` element in the parsed file contributes one attribute:
        the text of its first child (with spaces replaced by underscores) is the
        attribute name and the text of its third child is the default value.

        The current version SpaCy doesn't differentiate attributes for different annotation types.
        Thus, any attributes extended here will be applied to all Spans.

        @param schema_file: initiate Span attributes using eHOST schema configuration file
        @param encoding: text encoding (not used by this implementation)
        @return: a set of attribute names that were newly registered by this call
        """
        registered = set()
        schema_file = self.check_file_validity(schema_file, False)
        if schema_file is None:
            return registered

        tree = etree.parse(str(schema_file.absolute()))
        for definition in tree.iter("attributeDef"):
            # Positional lookup: child 0 holds the name, child 2 the default value.
            # NOTE(review): relies on the child order in the schema file; confirm.
            attr_name = definition[0].text.replace(' ', '_')
            attr_default = definition[2].text
            already_known = attr_name in registered or Span.has_extension(attr_name)
            if already_known:
                continue
            Span.set_extension(attr_name, default=attr_default)
            registered.add(attr_name)
        self.schema_set = True
        return registered