def handle(self, *args, **options):
    """Run a long-lived spaCy NER socket server.

    Loads the custom spaCy model once, registers the Span/Doc extensions the
    getters need, then serves forever on ``settings.SPACY_PORT`` using
    ``select()``-based multiplexing: accepts new connections, reads a request,
    and replies with a JSON entity list terminated by the ``--end--`` sentinel.
    """
    spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER,
                             disable=['parser', 'tagger', 'textcat'])
    Span.set_extension('is_phone', getter=Command.is_phone_getter, force=True)
    Span.set_extension('line_number', getter=Command.line_number_getter, force=True)
    Doc.set_extension('lines', getter=Command.get_lines, setter=Command.set_lines)
    Doc.set_extension('_lines', default=list())
    logger.debug("Loaded spacy server")
    main_socks, read_socks, write_socks = socket_bind('', settings.SPACY_PORT)
    while True:
        readable, writeable, exceptions = select(read_socks, write_socks, [])
        for sockobj in readable:
            if sockobj in main_socks:
                # New client connecting on a listening socket.
                new_sock, address = sockobj.accept()
                logger.debug('Connect: %s - %s', address, id(new_sock))
                read_socks.append(new_sock)
            else:
                try:
                    entities = []
                    data = recv_end(sockobj)
                    if not data:
                        # Peer closed the connection: drop it from the select set.
                        sockobj.close()
                        read_socks.remove(sockobj)
                    else:
                        for doc in spacy_model.pipe([data]):
                            # Cache newline offsets so line_number_getter can map
                            # entity character offsets back to line numbers.
                            doc._.lines = [x.start() for x in re.finditer('\n', doc.text)]
                            for ent in doc.ents:
                                current_entity = self.get_ent(ent)
                                # FIX: was `entities.append(...) if current_entity else None`,
                                # an expression statement abused for control flow.
                                if current_entity:
                                    entities.append(current_entity)
                        sockobj.sendall(json.dumps(entities).encode('utf8')
                                        + '--end--'.encode('utf8'))
                except Exception:
                    # FIX: was a bare `except: pass`, which silently swallowed every
                    # error (including KeyboardInterrupt/SystemExit) and hid bugs.
                    # Log with traceback and keep serving other clients.
                    logger.exception('Error handling client socket %s', id(sockobj))
def set_attributes(self, schema_file: Union[str, Path] = '', encoding: Union[str, None] = None) -> Set:
    """
    The current version SpaCy doesn't differentiate attributes for different annotation types.
    Thus, any attributes extended here will be applied to all Spans.
    @param schema_file: initiate Span attributes using eHOST schema configuration file
    @param encoding: text encoding
    @return: a set of attribute names
    """
    schema_file = self.check_file_validity(schema_file, False)
    attr_names = set()
    attr_conf_start = False
    # check_file_validity may return None; only brat-style ".conf" files are parsed here.
    if schema_file is not None and schema_file.name.endswith("conf"):
        for row in schema_file.read_text(encoding=encoding).split("\n"):
            # Skip blank lines and comment lines.
            if len(row.strip()) == 0 or row[0] == '#':
                continue
            if row.startswith(r'[attributes]'):
                # Entered the [attributes] section; subsequent rows each define
                # one attribute until the next "[...]" section header.
                attr_conf_start = True
                continue
            elif row[0] == '[':
                # Any other section header ends the attributes section.
                attr_conf_start = False
            if attr_conf_start:
                # [attributes]
                # Negation	Arg:<EVENT>
                # Confidence	Arg:<EVENT>, Value:Possible|Likely|Certain
                # Only the attribute name (first token) is used; the default is None.
                name = row.split(' ')[0]
                default_value = None
                if name not in attr_names and not Span.has_extension(name):
                    Span.set_extension(name, default=default_value)
                attr_names.add(name)
        self.schema_set = True
    return attr_names
def __init__(self, nlp: Language = None, support_overlap: bool = False,
             log_level: int = logging.WARNING, encoding: str = None,
             doc_name_depth: int = 0, schema_file: Union[str, Path] = '',
             store_anno_string: bool = False, **kwargs):
    """
    @param nlp: Spacy Language model
    @param support_overlap: whether need to support overlapped annotations
    @param log_level: logging level configuration
    @param encoding: txt encoding
    @param doc_name_depth: depth of parent directories to add into doc_name
           default is 0: only use file name
           1: use 1 level parent directory name + file name
           -1: use full absolution path
           if you are dealing with multiple directories,this is helpful to
           locate the original files
    @param schema_file: initiate Span attributes using eHOST schema configuration file
    @param store_anno_string: whether read annotated string from annotations to double
           check parsed Span's correction
    @param kwargs: other parameters
    """
    self.schema_set = False
    # Register schema-defined attributes before the base reader initializes.
    self.attr_names = self.set_attributes(schema_file=schema_file, encoding=encoding)
    if store_anno_string:
        # "span_txt" keeps each annotation's raw text so parsed Span offsets
        # can be cross-checked later.
        if not Span.has_extension("span_txt"):
            Span.set_extension("span_txt", default="")
    super().__init__(nlp=nlp, support_overlap=support_overlap,
                     log_level=log_level, encoding=encoding,
                     doc_name_depth=doc_name_depth, schema_file=schema_file,
                     store_anno_string=store_anno_string, **kwargs)
    # FIX: removed a redundant trailing `pass` (dead statement).
def add_span_extensions():
    """Register the Doc and Span custom extensions used for entity annotation."""
    Doc.set_extension("relations", default=None)
    Doc.set_extension("entities", default=None)
    # Per-span annotation slots; all default to None until a pipeline fills them.
    span_attrs = (
        'entity_type',
        'entity_id',
        'foodon',
        'hansard',
        'hansardClosest',
        'hansardParent',
        'snomedct',
        'synonyms',
    )
    for attr in span_attrs:
        Span.set_extension(attr, default=None)
def __init__(self, language: str = "es"):
    """
    Init method
    :param language: language of the annotation
    """
    self.__sentiment_words = load_dict(language, "sentiment_words.csv")
    self.__boosters = load_dict(language, "boosters.csv")
    self.__negations = load_dict(language, "negations.csv")
    # Register the per-span / per-token weight extensions (force=True so
    # re-instantiating the component never raises on re-registration).
    registrations = (
        (Span, "sentiment_weight", 0.0),
        (Token, "sentiment_weight", 0.0),
        (Token, "negation_weight", 1.0),
        (Token, "booster_weight", 0.0),
    )
    for target, attr_name, default in registrations:
        target.set_extension(attr_name, default=default, force=True)
def __init__(self, nlp: Language = None, support_overlap: bool = False,
             log_level: int = logging.WARNING, encoding: str = None,
             doc_name_depth: int = 0, schema_file: Union[str, Path] = '',
             store_anno_string: bool = False, use_adjudication: bool = False,
             **kwargs):
    """
    @param nlp: a SpaCy language model
    @param support_overlap: if the EhostDocReader need to support reading from
           overlapped annotations. Because SpaCy's Doc.ents does not allows overlapped
           Spans, to support overlapping, Spans need to be stored somewhere
           else----Doc._.concepts
    @param log_level: set the logger's logging level. TO debug, set to logging.DEBUG
    @param encoding: txt encoding
    @param doc_name_depth: depth of parent directories to add into doc_name
           default is 0: only use file name
           1: use 1 level parent directory name + file name
           -1: use full absolution path
           if you are dealing with multiple directories,this is helpful to
           locate the original files
    @param schema_file: initiate Span attributes using eHOST schema configuration file
    @param store_anno_string: whether read annotated string from annotations to double
           check parsed Span's correction
    @param use_adjudication: if read annotations from adjudication folder
    @param kwargs: other parameters
    """
    self.schema_set = False
    # Register schema-defined attributes before the base reader initializes.
    self.attr_names = self.set_attributes(schema_file=schema_file, encoding=encoding)
    if store_anno_string:
        # "span_txt" keeps each annotation's raw text so parsed Span offsets
        # can be cross-checked later.
        if not Span.has_extension("span_txt"):
            Span.set_extension("span_txt", default="")
    super().__init__(nlp=nlp, support_overlap=support_overlap,
                     log_level=log_level, encoding=encoding,
                     doc_name_depth=doc_name_depth, schema_file=schema_file,
                     store_anno_string=store_anno_string,
                     use_adjudication=use_adjudication, **kwargs)
    # FIX: removed a redundant trailing `pass` (dead statement).
def __init__(self, first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
             last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):
    """Store extension names and register the Token/Span/Doc extensions once."""
    self.token_extension_name = self.TOKEN_EXTENSION_NAME
    self.span_extension_name = self.SPAN_EXTENSION_NAME
    self.doc_extension_name = self.DOC_EXTENSION_NAME
    self.first_name_extension_name = first_name_extension_name
    self.last_name_extension_name = last_name_extension_name
    # Register each extension only if absent — re-registering without
    # force=True would raise in spaCy.
    registrations = (
        (Token, self.token_extension_name, {"default": self.ANOT_NONE}),
        (Span, self.span_extension_name, {"getter": self.is_full_name_getter}),
        (Doc, self.doc_extension_name, {"default": []}),
    )
    for klass, ext_name, ext_kwargs in registrations:
        if not klass.has_extension(ext_name):
            klass.set_extension(ext_name, **ext_kwargs)
def __init__(self, links, **kwargs):
    # Queue the given link(s) for crawling.
    # NOTE(review): appends `links` as a single element — if callers pass a list,
    # presumably extend() was intended; confirm against call sites.
    self.start_urls.append(links)

    # Imported lazily so the module can be imported without spaCy installed.
    import spacy
    from spacy.tokens.doc import Doc
    from spacy.tokens.span import Span

    self.spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER)
    Span.set_extension('line_number', getter=TagLinkSpider.line_number_getter, force=True)
    Doc.set_extension('lines', getter=TagLinkSpider.get_lines, setter=TagLinkSpider.set_lines)
    Doc.set_extension('_lines', default=list())

    # Open the TCP connection to the spaCy tagging server before the spider starts.
    self.soc_spacy = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.soc_spacy.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    connect(self.soc_spacy, '', settings.SPACY_PORT)

    super().__init__(**kwargs)
def set_attributes(self, schema_file: Union[str, Path] = '', encoding: Union[str, None] = None) -> Set:
    """
    The current version SpaCy doesn't differentiate attributes for different annotation types.
    Thus, any attributes extended here will be applied to all Spans.
    @param schema_file: initiate Span attributes using eHOST schema configuration file
    @param encoding: text encoding (not used by the XML parser in this implementation)
    @return: a set of attribute names
    """
    schema_file = self.check_file_validity(schema_file, False)
    attr_names = set()
    # check_file_validity may return None when the schema file is absent/invalid.
    if schema_file is not None:
        # eHOST schemas are XML; each <attributeDef> element defines one attribute.
        root = etree.parse(str(schema_file.absolute()))
        for attr_def in root.iter("attributeDef"):
            # Child 0 holds the attribute name, child 2 its default value.
            # NOTE(review): fixed child positions assumed from the eHOST format — verify.
            name = attr_def[0].text.replace(' ', '_')
            default_value = attr_def[2].text
            if name not in attr_names and not Span.has_extension(name):
                Span.set_extension(name, default=default_value)
            attr_names.add(name)
        self.schema_set = True
    return attr_names
def enable_spacy_extensions():
    """Enables custom extensions for spaCy for dealing with citations."""
    Token.set_extension('is_in_text_citation', default=False, force=True)
    # Span-level views that strip citation tokens/text; all computed via getters.
    span_getters = {
        'tokens_without_citations': get_span_tokens_without_citations,
        'text_without_citations': get_span_text_without_citations,
        'text_with_ws_without_citations': get_span_text_with_ws_wo_cites,
    }
    for ext_name, getter_fn in span_getters.items():
        Span.set_extension(ext_name, getter=getter_fn, force=True)
import ahocorasick import spacy import textspan from spacy.tokens import Doc from spacy.tokens.span import Span from spacy.util import filter_spans from typing_extensions import Literal from camphr.utils import SerializationMixin, get_doc_char_span # Sometimes matched text is different from original text # since `PatternSearcher` can match the `lemma`. # This extension holds the matched text. PATTERN_MATCH_AS = "pattern_match_as" Span.set_extension(PATTERN_MATCH_AS, default=None, force=True) @spacy.component("pattern_searcher") class PatternSearcher(SerializationMixin): serialization_fields = [ "model", "label_type", "custom_label", "custom_label_map", "destructive", "lemma", "lower", "cfg", "normalizer", ]
def doc_findall(doc: Doc, pattern, flags=0) -> List[Span]:
    """Return every regex match in the Doc as a list of Spans."""
    return list(doc._.finditer(pattern, flags=flags))


Doc.set_extension('findall', method=doc_findall)


# ---------------------------------------------------------------------
# Span extensions for standard regex functions 'finditer' and 'findall'
# ---------------------------------------------------------------------

def span_finditer(span: Span, pattern, flags=0):
    """Yield a Span for each regex match found inside this span's text."""
    # Character offset of the span's first token within the parent Doc.
    offset = span[0].idx
    for match in re.finditer(pattern=pattern, string=span.text, flags=flags):
        m_start, m_end = match.span()
        # Translate match offsets from span-local to doc-level indices.
        yield span.doc._.idxs2span(m_start + offset, m_end + offset)


Span.set_extension('finditer', method=span_finditer)


def span_findall(span, pattern, flags=0):
    """Return every regex match inside the span as a list of Spans."""
    return list(span._.finditer(pattern, flags=flags))


Span.set_extension('findall', method=span_findall)
span_attn_getter, span_nctokens_getter, span_wp2ncid_getter, span_wp2tokid_getter, span_wp_getter, span_wp_slice_getter, ) Doc.set_extension("wp2ncid", getter=doc_wp2ncid_getter) Doc.set_extension("nctokens", getter=doc_nctokens_getter) Doc.set_extension("tokid2nc", getter=doc_tokid2nc_getter) Doc.set_extension("wp2tokid", getter=doc_wp2tokid_getter) Doc.set_extension("tokid2ncid", getter=doc_tokid2ncid_getter) Doc.set_extension("tokid2wp", getter=doc_tokid2wp_getter) Span.set_extension("wp_slice", getter=span_wp_slice_getter) Span.set_extension("wp2tokid", getter=span_wp2tokid_getter) Span.set_extension("attention", getter=span_attn_getter) Span.set_extension("wordpieces", getter=span_wp_getter) Span.set_extension("wp2ncid", getter=span_wp2ncid_getter) Span.set_extension("nctokens", getter=span_nctokens_getter) def load_danish(spacy_model: str = "da_core_news_sm", transformer: str = "Maltehb/danish-bert-botxo"): nlp = spacy.load(spacy_model) if transformer: # add transformer # Construction via add_pipe with custom config config = {