Ejemplo n.º 1
0
    def handle(self, *args, **options):
        """Run a blocking spaCy NER socket server.

        Loads the custom model, registers the Span/Doc extensions used by the
        getters on ``Command``, then serves forever: each request payload is
        piped through the model and the entities are sent back as JSON,
        terminated by the ``--end--`` marker.
        """
        # parser/tagger/textcat are not needed for NER only; disabling them
        # speeds up loading and pipe() throughput.
        spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER, disable=['parser', 'tagger', 'textcat'])
        Span.set_extension('is_phone', getter=Command.is_phone_getter, force=True)
        Span.set_extension('line_number', getter=Command.line_number_getter, force=True)
        Doc.set_extension('lines', getter=Command.get_lines, setter=Command.set_lines)
        # NOTE(review): default=list() shares one list across Docs; safe here
        # only because doc._.lines is always reassigned below before use.
        Doc.set_extension('_lines', default=list())

        logger.debug("Loaded spacy server")
        main_socks, read_socks, write_socks = socket_bind('', settings.SPACY_PORT)
        while True:
            readable, writeable, exceptions = select(read_socks, write_socks, [])
            for sockobj in readable:
                if sockobj in main_socks:
                    # New client on a listening socket: accept and track it.
                    new_sock, address = sockobj.accept()
                    logger.debug('Connect: %s - %s', address, id(new_sock))
                    read_socks.append(new_sock)
                else:
                    try:
                        entities = []
                        data = recv_end(sockobj)
                        if not data:
                            # Peer closed the connection; stop tracking it.
                            sockobj.close()
                            read_socks.remove(sockobj)
                        else:
                            for doc in spacy_model.pipe([data]):
                                # Newline offsets let line_number_getter map a
                                # character position back to a line.
                                doc._.lines = [x.start() for x in re.finditer(r'\n', doc.text)]
                                for ent in doc.ents:
                                    current_entity = self.get_ent(ent)
                                    if current_entity:
                                        entities.append(current_entity)

                            sockobj.sendall(json.dumps(entities).encode('utf8') + '--end--'.encode('utf8'))
                    except Exception:
                        # Keep the server loop alive on per-connection errors,
                        # but record them instead of silently swallowing
                        # (the old bare `except: pass` also ate SystemExit /
                        # KeyboardInterrupt).
                        logger.exception("Error while handling spaCy request")
Ejemplo n.º 2
0
    def set_attributes(self, schema_file: Union[str, Path] = '', encoding: str = None) -> Set:
        """Register Span extension attributes from an eHOST ``.conf`` schema.

        The current version of spaCy doesn't differentiate attributes for
        different annotation types, so every attribute registered here
        applies to all Spans.
        @param schema_file: eHOST schema configuration file to read
        @param encoding: text encoding used to read the file
        @return: the set of attribute names that were registered
        """
        schema_file = self.check_file_validity(schema_file, False)
        registered = set()
        in_attr_section = False
        if schema_file is not None and schema_file.name.endswith("conf"):
            for line in schema_file.read_text(encoding=encoding).split("\n"):
                # Skip blank lines and comments.
                if len(line.strip()) == 0 or line[0] == '#':
                    continue
                if line.startswith(r'[attributes]'):
                    in_attr_section = True
                    continue
                elif line[0] == '[':
                    # Any other section header ends the attributes section.
                    in_attr_section = False
                if in_attr_section:
                    # Rows look like:
                    # [attributes]
                    # Negation        Arg:<EVENT>
                    # Confidence        Arg:<EVENT>, Value:Possible|Likely|Certain
                    attr_name = line.split('        ')[0]
                    if attr_name not in registered and not Span.has_extension(attr_name):
                        Span.set_extension(attr_name, default=None)
                        registered.add(attr_name)
            self.schema_set = True
        return registered
Ejemplo n.º 3
0
    def __init__(self, nlp: Language = None, support_overlap: bool = False,
                 log_level: int = logging.WARNING, encoding: str = None, doc_name_depth: int = 0,
                 schema_file: Union[str, Path] = '', store_anno_string: bool = False,
                 **kwargs):
        """Initialize the reader and register schema-defined Span attributes.

        @param nlp: Spacy Language model
        @param support_overlap: whether overlapped annotations need to be supported
        @param log_level: logging level configuration
        @param encoding: txt encoding
        @param doc_name_depth: depth of parent directories to add into doc_name
                default is 0: only use file name
                1: use 1 level parent directory name + file name
                -1: use full absolute path
                if you are dealing with multiple directories, this is helpful
                to locate the original files
        @param schema_file: initiate Span attributes using eHOST schema configuration file
        @param store_anno_string: whether to read the annotated string from annotations to double check the parsed Span's correctness
        @param kwargs: other parameters passed through to the base reader
        """
        self.schema_set = False
        # Register schema attributes before the base class processes any docs.
        self.attr_names = self.set_attributes(schema_file=schema_file, encoding=encoding)
        # "span_txt" stores the raw annotated text so parsed offsets can be
        # verified later; merged nested ifs and dropped the dead trailing
        # `pass` from the original.
        if store_anno_string and not Span.has_extension("span_txt"):
            Span.set_extension("span_txt", default="")
        super().__init__(nlp=nlp, support_overlap=support_overlap,
                         log_level=log_level, encoding=encoding, doc_name_depth=doc_name_depth,
                         schema_file=schema_file, store_anno_string=store_anno_string, **kwargs)
Ejemplo n.º 4
0
def add_span_extensions():
    """Register the Doc- and Span-level custom attributes used by this
    pipeline, all initialised to None."""
    for doc_attr in ("relations", "entities"):
        Doc.set_extension(doc_attr, default=None)
    span_attrs = (
        'entity_type', 'entity_id', 'foodon', 'hansard', 'hansardClosest',
        'hansardParent', 'snomedct', 'synonyms',
    )
    for span_attr in span_attrs:
        Span.set_extension(span_attr, default=None)
Ejemplo n.º 5
0
 def __init__(self, language: str = "es"):
     """
     Load the sentiment lexicons and register the custom spaCy attributes
     used by this component.
     :param language: language of the annotation (defaults to Spanish)
     """
     # Language-specific word lists; file names suggest polarity words,
     # intensity boosters and negation cues — confirm against load_dict.
     self.__sentiment_words = load_dict(language, "sentiment_words.csv")
     self.__boosters = load_dict(language, "boosters.csv")
     self.__negations = load_dict(language, "negations.csv")
     # Per-span/per-token weights; force=True so constructing the component
     # twice does not raise on re-registration.
     Span.set_extension("sentiment_weight", default=0.0, force=True)
     Token.set_extension("sentiment_weight", default=0.0, force=True)
     Token.set_extension("negation_weight", default=1.0, force=True)
     Token.set_extension("booster_weight", default=0.0, force=True)
Ejemplo n.º 6
0
    def __init__(self,
                 nlp: Language = None,
                 support_overlap: bool = False,
                 log_level: int = logging.WARNING,
                 encoding: str = None,
                 doc_name_depth: int = 0,
                 schema_file: Union[str, Path] = '',
                 store_anno_string: bool = False,
                 use_adjudication: bool = False,
                 **kwargs):
        """Initialize the eHOST reader and register schema-defined Span attributes.

        @param nlp: a SpaCy language model
        @param support_overlap: whether the EhostDocReader needs to support reading overlapped annotations.
            Because SpaCy's Doc.ents does not allow overlapped Spans, to support overlapping, Spans need to be stored
            somewhere else----Doc._.concepts
        @param log_level: set the logger's logging level. To debug, set to logging.DEBUG
        @param encoding: txt encoding
        @param doc_name_depth: depth of parent directories to add into doc_name
                default is 0: only use file name
                1: use 1 level parent directory name + file name
                -1: use full absolute path
                if you are dealing with multiple directories, this is helpful
                to locate the original files
        @param schema_file: initiate Span attributes using eHOST schema configuration file
        @param store_anno_string: whether to read the annotated string from annotations to double check the parsed Span's correctness
        @param use_adjudication: if annotations should be read from the adjudication folder
        @param kwargs: other parameters passed through to the base reader
        """
        self.schema_set = False
        # Register schema attributes before the base class processes any docs.
        self.attr_names = self.set_attributes(schema_file=schema_file,
                                              encoding=encoding)
        # "span_txt" stores the raw annotated text so parsed offsets can be
        # verified later; merged nested ifs and dropped the dead trailing
        # `pass` from the original.
        if store_anno_string and not Span.has_extension("span_txt"):
            Span.set_extension("span_txt", default="")
        super().__init__(nlp=nlp,
                         support_overlap=support_overlap,
                         log_level=log_level,
                         encoding=encoding,
                         doc_name_depth=doc_name_depth,
                         schema_file=schema_file,
                         store_anno_string=store_anno_string,
                         use_adjudication=use_adjudication,
                         **kwargs)
Ejemplo n.º 7
0
    def __init__(self,
                 first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
                 last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):
        """Remember the extension names this component works with and make
        sure the Token/Span/Doc extensions exist."""
        self.token_extension_name = self.TOKEN_EXTENSION_NAME
        self.span_extension_name = self.SPAN_EXTENSION_NAME
        self.doc_extension_name = self.DOC_EXTENSION_NAME
        self.first_name_extension_name = first_name_extension_name
        self.last_name_extension_name = last_name_extension_name

        # Register each extension only if absent: a second registration
        # without force=True would raise.
        if not Token.has_extension(self.token_extension_name):
            Token.set_extension(self.token_extension_name, default=self.ANOT_NONE)
        if not Span.has_extension(self.span_extension_name):
            Span.set_extension(self.span_extension_name, getter=self.is_full_name_getter)
        if not Doc.has_extension(self.doc_extension_name):
            Doc.set_extension(self.doc_extension_name, default=[])
Ejemplo n.º 8
0
    def __init__(self, links, **kwargs):
        """Set up the spider: seed start_urls, load the spaCy model, register
        custom extensions, and open a socket to the spaCy server.

        @param links: start URL(s); appended to self.start_urls as-is
        @param kwargs: forwarded to the base spider's __init__
        """
        self.start_urls.append(links)

        # Imported here rather than at module level — presumably so the
        # module can be imported without spaCy installed; confirm intent.
        import spacy
        from spacy.tokens.doc import Doc
        from spacy.tokens.span import Span

        self.spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER)
        Span.set_extension('line_number',
                           getter=TagLinkSpider.line_number_getter,
                           force=True)
        Doc.set_extension('lines',
                          getter=TagLinkSpider.get_lines,
                          setter=TagLinkSpider.set_lines)
        # NOTE(review): default=list() shares a single list object across all
        # Docs — safe only if '_lines' is always reassigned before mutation.
        Doc.set_extension('_lines', default=list())

        # Persistent connection to the spaCy server.
        # NOTE(review): '' as host — presumably localhost; verify against the
        # connect() helper.
        self.soc_spacy = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.soc_spacy.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        connect(self.soc_spacy, '', settings.SPACY_PORT)

        super().__init__(**kwargs)
Ejemplo n.º 9
0
    def set_attributes(self,
                       schema_file: Union[str, Path] = '',
                       encoding: str = None) -> Set:
        """Register Span extension attributes from an eHOST XML schema file.

        The current version of spaCy doesn't differentiate attributes for
        different annotation types, so every attribute registered here
        applies to all Spans.
        @param schema_file: eHOST schema configuration file to read
        @param encoding: text encoding (kept for interface parity)
        @return: the set of attribute names that were registered
        """
        schema_file = self.check_file_validity(schema_file, False)
        registered = set()
        if schema_file is not None:
            tree = etree.parse(str(schema_file.absolute()))
            for attr_def in tree.iter("attributeDef"):
                # Child 0 carries the attribute name, child 2 its default.
                attr_name = attr_def[0].text.replace(' ', '_')
                default = attr_def[2].text
                if attr_name not in registered and not Span.has_extension(attr_name):
                    Span.set_extension(attr_name, default=default)
                    registered.add(attr_name)
            self.schema_set = True
        return registered
Ejemplo n.º 10
0
def enable_spacy_extensions():
    """Enables custom extensions for spaCy for dealing with citations."""
    Token.set_extension('is_in_text_citation', default=False, force=True)
    # Citation-stripped views of a Span, each computed by its getter.
    for attr_name, getter in (
            ('tokens_without_citations', get_span_tokens_without_citations),
            ('text_without_citations', get_span_text_without_citations),
            ('text_with_ws_without_citations', get_span_text_with_ws_wo_cites),
    ):
        Span.set_extension(attr_name, getter=getter, force=True)
Ejemplo n.º 11
0
import ahocorasick
import spacy
import textspan
from spacy.tokens import Doc
from spacy.tokens.span import Span
from spacy.util import filter_spans
from typing_extensions import Literal

from camphr.utils import SerializationMixin, get_doc_char_span

# Sometimes the matched text differs from the original text, because
# `PatternSearcher` can match on the `lemma`. This Span extension holds
# the text as it was matched.
PATTERN_MATCH_AS = "pattern_match_as"
Span.set_extension(PATTERN_MATCH_AS, default=None, force=True)


@spacy.component("pattern_searcher")
class PatternSearcher(SerializationMixin):
    serialization_fields = [
        "model",
        "label_type",
        "custom_label",
        "custom_label_map",
        "destructive",
        "lemma",
        "lower",
        "cfg",
        "normalizer",
    ]
Ejemplo n.º 12
0

def doc_findall(doc: Doc, pattern, flags=0) -> List[Span]:
    """Return every regex match of *pattern* in *doc* as a list of Spans,
    delegating to the ``finditer`` Doc extension."""
    matches = doc._.finditer(pattern, flags=flags)
    return list(matches)


Doc.set_extension('findall', method=doc_findall)

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Span extensions for standard regex functions 'finditer' and 'findall'
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #


def span_finditer(span: Span, pattern, flags=0):
    """Yield a doc-level Span for each regex match of *pattern* inside *span*.

    Regex offsets are relative to ``span.text``, so they are shifted by the
    character index of the span's first token before being mapped back to a
    Span via the ``idxs2span`` Doc extension.
    """
    offset = span[0].idx
    for match in re.finditer(pattern=pattern, string=span.text, flags=flags):
        m_start, m_end = match.span()
        yield span.doc._.idxs2span(m_start + offset, m_end + offset)


Span.set_extension('finditer', method=span_finditer)


def span_findall(span, pattern, flags=0):
    """Return every regex match of *pattern* inside *span* as a list,
    materialising the ``finditer`` Span extension."""
    matches = span._.finditer(pattern, flags=flags)
    return list(matches)


# Expose span_findall as Span._.findall(pattern, flags=0).
Span.set_extension('findall', method=span_findall)
Ejemplo n.º 13
0
    span_attn_getter,
    span_nctokens_getter,
    span_wp2ncid_getter,
    span_wp2tokid_getter,
    span_wp_getter,
    span_wp_slice_getter,
)

# Doc-level alignment maps between wordpieces (wp), tokens (tokid) and
# "nc" units (presumably noun chunks — confirm against the getter module),
# computed lazily by the imported getters.
Doc.set_extension("wp2ncid", getter=doc_wp2ncid_getter)
Doc.set_extension("nctokens", getter=doc_nctokens_getter)
Doc.set_extension("tokid2nc", getter=doc_tokid2nc_getter)
Doc.set_extension("wp2tokid", getter=doc_wp2tokid_getter)
Doc.set_extension("tokid2ncid", getter=doc_tokid2ncid_getter)
Doc.set_extension("tokid2wp", getter=doc_tokid2wp_getter)

# Span-level views over the same alignment data, plus wordpiece slices and
# attention access.
Span.set_extension("wp_slice", getter=span_wp_slice_getter)
Span.set_extension("wp2tokid", getter=span_wp2tokid_getter)
Span.set_extension("attention", getter=span_attn_getter)
Span.set_extension("wordpieces", getter=span_wp_getter)
Span.set_extension("wp2ncid", getter=span_wp2ncid_getter)
Span.set_extension("nctokens", getter=span_nctokens_getter)


def load_danish(spacy_model: str = "da_core_news_sm",
                transformer: str = "Maltehb/danish-bert-botxo"):
    nlp = spacy.load(spacy_model)

    if transformer:
        # add transformer
        # Construction via add_pipe with custom config
        config = {