def handle(self, *args, **options):
    spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER,
                             disable=['parser', 'tagger', 'textcat'])
    Span.set_extension('is_phone', getter=Command.is_phone_getter, force=True)
    Span.set_extension('line_number', getter=Command.line_number_getter, force=True)
    Doc.set_extension('lines', getter=Command.get_lines, setter=Command.set_lines)
    Doc.set_extension('_lines', default=list())
    logger.debug("Loaded spacy server")

    main_socks, read_socks, write_socks = socket_bind('', settings.SPACY_PORT)
    while True:
        readable, writeable, exceptions = select(read_socks, write_socks, [])
        for sockobj in readable:
            if sockobj in main_socks:
                new_sock, address = sockobj.accept()
                logger.debug('Connect: %s - %s', address, id(new_sock))
                read_socks.append(new_sock)
            else:
                try:
                    entities = []
                    data = recv_end(sockobj)
                    if not data:
                        sockobj.close()
                        read_socks.remove(sockobj)
                    else:
                        for doc in spacy_model.pipe([data]):
                            doc._.lines = [x.start() for x in re.finditer('\n', doc.text)]
                            for ent in doc.ents:
                                current_entity = self.get_ent(ent)
                                if current_entity:
                                    entities.append(current_entity)
                        sockobj.sendall(json.dumps(entities).encode('utf8')
                                        + '--end--'.encode('utf8'))
                except:
                    # Swallow per-connection errors so one bad request does not kill the server loop.
                    pass
def set_attributes(self, schema_file: Union[str, Path] = '', encoding: str = None) -> Set:
    """
    The current version of SpaCy doesn't differentiate attributes for different annotation types.
    Thus, any attributes extended here will be applied to all Spans.
    @param schema_file: initiate Span attributes using eHOST schema configuration file
    @param encoding: text encoding
    @return: a set of attribute names
    """
    schema_file = self.check_file_validity(schema_file, False)
    attr_names = set()
    attr_conf_start = False
    if schema_file is not None and schema_file.name.endswith("conf"):
        for row in schema_file.read_text(encoding=encoding).split("\n"):
            if len(row.strip()) == 0 or row[0] == '#':
                continue
            if row.startswith(r'[attributes]'):
                attr_conf_start = True
                continue
            elif row[0] == '[':
                attr_conf_start = False
            if attr_conf_start:
                # [attributes]
                # Negation      Arg:<EVENT>
                # Confidence    Arg:<EVENT>, Value:Possible|Likely|Certain
                name = row.split(' ')[0]
                default_value = None
                if name not in attr_names and not Span.has_extension(name):
                    Span.set_extension(name, default=default_value)
                attr_names.add(name)
        self.schema_set = True
    return attr_names
def __init__(self, nlp: Language = None, support_overlap: bool = False,
             log_level: int = logging.WARNING, encoding: str = None,
             doc_name_depth: int = 0, schema_file: Union[str, Path] = '',
             store_anno_string: bool = False, **kwargs):
    """
    @param nlp: a SpaCy Language model
    @param support_overlap: whether overlapped annotations need to be supported
    @param log_level: logging level configuration
    @param encoding: txt encoding
    @param doc_name_depth: depth of parent directories to add into doc_name
            default is 0: only use file name
            1: use 1 level parent directory name + file name
            -1: use full absolute path
            if you are dealing with multiple directories, this is helpful to locate the original files
    @param schema_file: initiate Span attributes using eHOST schema configuration file
    @param store_anno_string: whether to read the annotated string from annotations to double-check the parsed Span's correctness
    @param kwargs: other parameters
    """
    self.schema_set = False
    self.attr_names = self.set_attributes(schema_file=schema_file, encoding=encoding)
    if store_anno_string:
        if not Span.has_extension("span_txt"):
            Span.set_extension("span_txt", default="")
    super().__init__(nlp=nlp, support_overlap=support_overlap,
                     log_level=log_level, encoding=encoding,
                     doc_name_depth=doc_name_depth, schema_file=schema_file,
                     store_anno_string=store_anno_string, **kwargs)
def add_span_extensions():
    Doc.set_extension("relations", default=None)
    Doc.set_extension("entities", default=None)
    for span_extension in [
            'entity_type', 'entity_id', 'foodon', 'hansard', 'hansardClosest',
            'hansardParent', 'snomedct', 'synonyms']:
        Span.set_extension(span_extension, default=None)
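# Usage sketch for the registrations above (illustrative only; `nlp`, the text and the
# attribute values are assumptions, not part of the original source). Extensions are
# registered once per process, then read and written through the underscore namespace.
import spacy
from spacy.tokens import Span

add_span_extensions()
nlp = spacy.blank("en")
doc = nlp("Cheddar is a hard cheese")
food = Span(doc, 0, 1)
food._.entity_type = "FOOD"      # write a registered Span attribute
doc._.entities = [food]          # store spans on the Doc-level extension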
def one_verb(i, sent, doc):
    front = ""
    if (i > 0):
        front = Span(doc, sent.start, i + sent.start)
    end = Span(doc, sent.start + i + 1, sent.end - 1)
    newfront = ""
    for k in front:
        if k.ent_type_ == "":
            newfront += k.lower_ + " "
        else:
            newfront += k.text + " "
    quest1 = ""
    quest2 = ""
    if sent[i].tag_ == "VBD" and sent[i].lemma_ != "be":
        if find_subject(sent) != "UNKNOWN":
            quest1 = "did" + " " + find_subject(sent) + " " + sent[i].lemma_
            quest2 = "did" + " " + str(newfront[:-1]) + " " + sent[i].lemma_
    elif sent[i].tag_ == "VBZ" and sent[i].lemma_ != "be":
        if find_subject(sent) != "UNKNOWN":
            quest1 = "does" + " " + find_subject(sent) + " " + sent[i].lemma_
            quest2 = "does" + " " + str(newfront[:-1]) + " " + sent[i].lemma_
    elif sent[i].tag_ == "VBP" or sent[i].tag_ == "VB":
        if find_subject(sent) != "UNKNOWN":
            quest1 = "do" + " " + find_subject(sent) + " " + sent[i].lemma_
            quest2 = "do" + " " + str(newfront[:-1]) + " " + sent[i].lemma_
    elif sent[i].tag_ == "VBZ" and sent[i].lemma_ == "be":
        if find_subject(sent) != "UNKNOWN":
            quest1 = sent[i].orth_ + " " + find_subject(sent)
            quest2 = sent[i].orth_ + " " + str(newfront[:-1])
    elif sent[i].tag_ == "VBD" and sent[i].lemma_ == "be":
        if find_subject(sent) != "UNKNOWN":
            quest1 = "was" + " " + find_subject(sent)
            quest2 = "was" + " " + str(newfront[:-1])
    elif sent[i].tag_ == "VBN" and sent[i].lemma_ == "be":
        if find_subject(sent) != "UNKNOWN":
            quest1 = sent[i].orth_ + " " + find_subject(sent) + " " + sent[i].lemma_
            quest2 = sent[i].orth_ + " " + str(newfront[:-1]) + " " + sent[i].lemma_
    elif sent[i].tag_ == "VBN" and sent[i].lemma_ == "have":
        if find_subject(sent) != "UNKNOWN":
            quest1 = sent[i].orth_ + " " + find_subject(sent) + " " + sent[i].lemma_
            quest2 = sent[i].orth_ + " " + str(newfront[:-1]) + " " + sent[i].lemma_
    quest1 += " " + str(end) + "?"
    quest2 += " " + str(end) + "?"
    return quest1, quest2
def __init__(self, language: str = "es"):
    """
    Init method
    :param language: language of the annotation
    """
    self.__sentiment_words = load_dict(language, "sentiment_words.csv")
    self.__boosters = load_dict(language, "boosters.csv")
    self.__negations = load_dict(language, "negations.csv")

    Span.set_extension("sentiment_weight", default=0.0, force=True)
    Token.set_extension("sentiment_weight", default=0.0, force=True)
    Token.set_extension("negation_weight", default=1.0, force=True)
    Token.set_extension("booster_weight", default=0.0, force=True)
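# Illustrative sketch (not from the original project): once __init__ has registered the
# extensions, downstream components can accumulate per-token weights and roll them up onto
# a Span. `sentiment_lexicon` below is a hypothetical lookup, not part of the source.
# for token in doc:
#     if token.lower_ in sentiment_lexicon:
#         token._.sentiment_weight = sentiment_lexicon[token.lower_]
# span = doc[0:3]
# span._.sentiment_weight = sum(t._.sentiment_weight * t._.negation_weight for t in span)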
def test_ensure_print_span_characteristics_wont_fail():
    """Test that the interface between the two methods isn't destroyed if refactored."""
    nlp = English()
    spans_key = "sc"

    pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")]
    ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
    eg = Example(pred, ref)
    examples = [eg]
    data = _compile_gold(examples, ["spancat"], nlp, True)
    span_characteristics = _get_span_characteristics(
        examples=examples, compiled_gold=data, spans_key=spans_key
    )
    _print_span_characteristics(span_characteristics)
def __init__(self, nlp: Language = None, support_overlap: bool = False,
             log_level: int = logging.WARNING, encoding: str = None,
             doc_name_depth: int = 0, schema_file: Union[str, Path] = '',
             store_anno_string: bool = False, use_adjudication: bool = False, **kwargs):
    """
    @param nlp: a SpaCy language model
    @param support_overlap: whether the EhostDocReader needs to support reading overlapped annotations.
            Because SpaCy's Doc.ents does not allow overlapped Spans, to support overlapping,
            Spans need to be stored somewhere else: Doc._.concepts
    @param log_level: set the logger's logging level. To debug, set to logging.DEBUG
    @param encoding: txt encoding
    @param doc_name_depth: depth of parent directories to add into doc_name
            default is 0: only use file name
            1: use 1 level parent directory name + file name
            -1: use full absolute path
            if you are dealing with multiple directories, this is helpful to locate the original files
    @param schema_file: initiate Span attributes using eHOST schema configuration file
    @param store_anno_string: whether to read the annotated string from annotations to double-check the parsed Span's correctness
    @param use_adjudication: whether to read annotations from the adjudication folder
    @param kwargs: other parameters
    """
    self.schema_set = False
    self.attr_names = self.set_attributes(schema_file=schema_file, encoding=encoding)
    if store_anno_string:
        if not Span.has_extension("span_txt"):
            Span.set_extension("span_txt", default="")
    super().__init__(nlp=nlp, support_overlap=support_overlap,
                     log_level=log_level, encoding=encoding,
                     doc_name_depth=doc_name_depth, schema_file=schema_file,
                     store_anno_string=store_anno_string,
                     use_adjudication=use_adjudication, **kwargs)
def __call__(self, spacy_span: Span, describer=None):
    """
    Convenient wrapper around make_issue if you are using spaCy.

    Usage example:
    ```python
    from spacy.tokens import Span
    from app.factor import SpacyFactor

    SOV = SpacyFactor(
        "subject_object_verb_spacing",
        "Keep the subject, verb, and object of a sentence close together to help the reader understand the sentence."
    )

    Span.set_extension("score", default=0)
    Span.set_extension("suggestions", default=[])

    doc = nlp("Holders of the Class A and Class B-1 certificates will be entitled to receive on each Payment Date, to the extent monies are available therefor (but not more than the Class A Certificate Balance or Class B-1 Certificate Balance then outstanding), a distribution.")
    score = analyze(doc)
    if score is not None:
        span = Span(doc, 0, len(doc))  # or whichever TOKENS are the issue (don't have to worry about character indexes)
        span._.score = score
        span._.suggestions = get_suggestions(doc)
        issues = SOV(span)
    ```
    """
    text, start, end = spacy_span.text, spacy_span.start_char, spacy_span.end_char
    score = spacy_span._.score if spacy_span.has_extension("score") else 0
    suggestions = (spacy_span._.suggestions
                   if spacy_span.has_extension("suggestions") else [])
    if describer:
        description = describer(spacy_span)
    else:
        description = self.description
    return make_issue(
        text,
        start,
        end,
        issue_type=self.issue_type,
        score=score,
        description=description,
        suggestions=suggestions,
    )
def __init__(self, first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
             last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):

    self.token_extension_name = self.TOKEN_EXTENSION_NAME
    self.span_extension_name = self.SPAN_EXTENSION_NAME
    self.doc_extension_name = self.DOC_EXTENSION_NAME
    self.first_name_extension_name = first_name_extension_name
    self.last_name_extension_name = last_name_extension_name

    if not Token.has_extension(self.token_extension_name):
        Token.set_extension(self.token_extension_name, default=self.ANOT_NONE)
    if not Span.has_extension(self.span_extension_name):
        Span.set_extension(self.span_extension_name, getter=self.is_full_name_getter)
    if not Doc.has_extension(self.doc_extension_name):
        Doc.set_extension(self.doc_extension_name, default=[])
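# Illustrative usage (the extension names come from the constants above; the matcher
# instance, pipeline and text are assumptions): after the matcher has processed a Doc,
# the Doc-level extension holds the detected full-name spans and the Span-level getter
# reports whether an arbitrary span qualifies as a full name.
# doc = matcher(nlp("Contact Anna Maria Smith tomorrow."))
# for name_span in doc._.get(matcher.doc_extension_name):
#     print(name_span.text, name_span._.get(matcher.span_extension_name))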
def to_and_from(doc):
    new_ents = []
    for ent in doc.ents:
        # Only relabel if it's a station and not the first token
        if ent.label_ == "STN" and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text.lower() in ("to", "at"):
                new_ent = Span(doc, ent.start, ent.end, label="ARR")
                new_ents.append(new_ent)
            elif prev_token.text.lower() == "from":
                new_ent = Span(doc, ent.start, ent.end, label="DEP")
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
        else:
            new_ents.append(ent)
    doc.ents = new_ents
    return doc
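# Registration sketch (assumes a spaCy v3 pipeline whose NER produces "STN" spans; the
# component name and example text are illustrative, not from the original source).
from spacy.language import Language

@Language.component("to_and_from")
def _to_and_from_component(doc):
    return to_and_from(doc)

# nlp.add_pipe("to_and_from", after="ner")
# "from Leeds to York" -> the Leeds span is relabelled DEP, the York span ARR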
def test_get_span_characteristics_return_value():
    nlp = English()
    spans_key = "sc"

    pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")]
    ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
    eg = Example(pred, ref)
    examples = [eg]
    data = _compile_gold(examples, ["spancat"], nlp, True)
    span_characteristics = _get_span_characteristics(
        examples=examples, compiled_gold=data, spans_key=spans_key
    )

    assert {"sd", "bd", "lengths"}.issubset(span_characteristics.keys())
    assert span_characteristics["min_length"] == 1
    assert span_characteristics["max_length"] == 3
def add_new_matching_span(self, document, span):
    token_end_index, token_start_index = self.calculate_span_indices(document, span)
    length = span[1] - span[0]
    if self.config.minimum_length is not None and self.config.minimum_length > length:
        return
    if self.config.maximum_length is not None and self.config.maximum_length < length:
        return
    spacy_span = Span(document, token_start_index, token_end_index, self.config.label)
    document.ents = document.ents + (spacy_span,)
def __init__(self, links, **kwargs):
    self.start_urls.append(links)

    import spacy
    from spacy.tokens.doc import Doc
    from spacy.tokens.span import Span

    self.spacy_model = spacy.load(settings.SPACY_CUSTOMN_MODEL_FOLDER)
    Span.set_extension('line_number', getter=TagLinkSpider.line_number_getter, force=True)
    Doc.set_extension('lines', getter=TagLinkSpider.get_lines, setter=TagLinkSpider.set_lines)
    Doc.set_extension('_lines', default=list())

    self.soc_spacy = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.soc_spacy.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    connect(self.soc_spacy, '', settings.SPACY_PORT)

    super().__init__(**kwargs)
def time_detection(doc):
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == "CARDINAL":
            # Token immediately after the entity (the one absorbed into the new TIME span)
            next_token = doc[ent.end]
            if next_token.text.lower() in ("pm", "p.m", "p.m.", "am", "a.m", "a.m."):
                new_ent = Span(doc, ent.start, ent.end + 1, label="TIME")
                new_ents.append(new_ent)
            else:
                new_ents.append(ent)
        else:
            new_ents.append(ent)
    doc.ents = new_ents
    return doc
def set_attributes(self, schema_file: Union[str, Path] = '', encoding: str = None) -> Set:
    """
    The current version of SpaCy doesn't differentiate attributes for different annotation types.
    Thus, any attributes extended here will be applied to all Spans.
    @param schema_file: initiate Span attributes using eHOST schema configuration file
    @param encoding: text encoding
    @return: a set of attribute names
    """
    schema_file = self.check_file_validity(schema_file, False)
    attr_names = set()
    if schema_file is not None:
        root = etree.parse(str(schema_file.absolute()))
        for attr_def in root.iter("attributeDef"):
            name = attr_def[0].text.replace(' ', '_')
            default_value = attr_def[2].text
            if name not in attr_names and not Span.has_extension(name):
                Span.set_extension(name, default=default_value)
            attr_names.add(name)
        self.schema_set = True
    return attr_names
def enable_spacy_extensions():
    """Enables custom extensions for spaCy for dealing with citations."""
    Token.set_extension('is_in_text_citation', default=False, force=True)
    Span.set_extension('tokens_without_citations',
                       getter=get_span_tokens_without_citations, force=True)
    Span.set_extension('text_without_citations',
                       getter=get_span_text_without_citations, force=True)
    Span.set_extension('text_with_ws_without_citations',
                       getter=get_span_text_with_ws_wo_cites, force=True)
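# Minimal usage sketch (assumes the getter functions above are defined elsewhere in the
# module and that an `nlp` pipeline plus a citation matcher have already set
# Token._.is_in_text_citation; the example text is an assumption):
# enable_spacy_extensions()
# doc = nlp("The court held otherwise. Smith v. Jones, 123 U.S. 456 (1990).")
# for sent in doc.sents:
#     print(sent._.text_without_citations)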
def test_debug_data_compile_gold_for_spans():
    nlp = English()
    spans_key = "sc"

    pred = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    pred.spans[spans_key] = [Span(pred, 3, 6, "ORG"), Span(pred, 5, 6, "GPE")]
    ref = Doc(nlp.vocab, words=["Welcome", "to", "the", "Bank", "of", "China", "."])
    ref.spans[spans_key] = [Span(ref, 3, 6, "ORG"), Span(ref, 5, 6, "GPE")]
    eg = Example(pred, ref)

    data = _compile_gold([eg], ["spancat"], nlp, True)

    assert data["spancat"][spans_key] == Counter({"ORG": 1, "GPE": 1})
    assert data["spans_length"][spans_key] == {"ORG": [3], "GPE": [1]}
    assert data["spans_per_type"][spans_key] == {
        "ORG": [Span(ref, 3, 6, "ORG")],
        "GPE": [Span(ref, 5, 6, "GPE")],
    }
    assert data["sb_per_type"][spans_key] == {
        "ORG": {"start": [ref[2:3]], "end": [ref[6:7]]},
        "GPE": {"start": [ref[4:5]], "end": [ref[6:7]]},
    }
def doc_findall(doc: Doc, pattern, flags=0) -> List[Span]:
    return list(doc._.finditer(pattern, flags=flags))


Doc.set_extension('findall', method=doc_findall)


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Span extensions for standard regex functions 'finditer' and 'findall'
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

def span_finditer(span: Span, pattern, flags=0):
    span_start_idx = span[0].idx
    for m in re.finditer(pattern=pattern, string=span.text, flags=flags):
        start, end = m.span()
        start += span_start_idx
        end += span_start_idx
        yield span.doc._.idxs2span(start, end)


Span.set_extension('finditer', method=span_finditer)


def span_findall(span, pattern, flags=0):
    return list(span._.finditer(pattern, flags=flags))


Span.set_extension('findall', method=span_findall)
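# Minimal usage sketch (assumes the Doc-level 'finditer' and 'idxs2span' extensions defined
# alongside this module are registered first; the text and pattern are assumptions):
# doc = nlp("Call 555-1234 or 555-9876.")
# for span in doc._.findall(r"\d{3}-\d{4}"):
#     print(span.text, span.start_char, span.end_char)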
def process_without_overlaps(self, doc: Doc, sorted_spans: _OrderedDictItemsView, classes: OrderedDict,
                             attributes: OrderedDict, relations: OrderedDict) -> Doc:
    """:arg a SpaCy Doc, can be overwritten by the subclass as needed.
    This function will add spans to doc.ents (defined by SpaCy as default), which doesn't allow
    overlapped annotations.
    @param doc: initiated SpaCy Doc
    @param sorted_spans: a sorted OrderedDict Items ( spans[entity_id] = (start, end, span_text))
    @param classes: an OrderedDict to map an entity id to [entity label, [attr_ids]]
    @param attributes: an OrderedDict to map an attribute id to (attribute_name, attribute_value)
    @param relations: an OrderedDict to map a relation_id to (label, (relation_component_ids))
    @return: annotated Doc
    """
    existing_entities = list(doc.ents)
    new_entities = list()
    # token_left_bound = 0
    token_right_bound = len(doc) - 1
    token_start = -1
    token_end = -1
    for id, span_tuple in sorted_spans:
        # Because SpaCy uses token offsets instead of char offsets to define Spans, we need to match them;
        # binary search is used here to speed up.
        if self.store_anno_string:
            start, end, span_txt = span_tuple
        else:
            start, end = span_tuple
        if start < doc[0].idx:
            # If the annotation falls before the 1st SpaCy token, adjust the span to the 1st token
            token_start = 0
            token_end = 1
        elif token_start >= token_right_bound:
            # If the annotation falls after the last SpaCy token, adjust the span to the last token
            token_start = token_right_bound - 1
            token_end = token_right_bound
        else:
            token_start = self.find_start_token(start, token_start, token_right_bound, doc)
            if end >= doc[-1].idx + doc[-1].__len__():
                token_end = token_right_bound + 1
            else:
                token_end = self.find_end_token(end, token_start, token_right_bound, doc)
        if token_start < 0 or token_start >= token_right_bound or token_end < 0 or token_end > token_right_bound:
            raise ValueError(
                "It is likely your annotations overlapped, which process_without_overlaps doesn't support. "
                "You will need to initiate the EhostDocReader with 'support_overlap=True' in the arguments")
        if token_start >= 0 and token_end > 0:
            span = Span(doc, token_start, token_end, label=classes[id][0])
            for attr_id in classes[id][1]:
                if attr_id not in attributes:
                    continue
                attr_name = attributes[attr_id][0]
                attr_value = attributes[attr_id][1]
                setattr(span._, attr_name, attr_value)
            if self.store_anno_string and span_txt is not None:
                setattr(span._, "span_txt", span_txt)
            new_entities.append(span)
            token_start = token_end
        else:
            raise OverflowError(
                'The span of the annotation: {}[{}:{}] is out of document boundary.'
                .format(classes[id][0], start, end))
    doc.ents = existing_entities + new_entities
    return doc
    span_attn_getter,
    span_nctokens_getter,
    span_wp2ncid_getter,
    span_wp2tokid_getter,
    span_wp_getter,
    span_wp_slice_getter,
)

Doc.set_extension("wp2ncid", getter=doc_wp2ncid_getter)
Doc.set_extension("nctokens", getter=doc_nctokens_getter)
Doc.set_extension("tokid2nc", getter=doc_tokid2nc_getter)
Doc.set_extension("wp2tokid", getter=doc_wp2tokid_getter)
Doc.set_extension("tokid2ncid", getter=doc_tokid2ncid_getter)
Doc.set_extension("tokid2wp", getter=doc_tokid2wp_getter)

Span.set_extension("wp_slice", getter=span_wp_slice_getter)
Span.set_extension("wp2tokid", getter=span_wp2tokid_getter)
Span.set_extension("attention", getter=span_attn_getter)
Span.set_extension("wordpieces", getter=span_wp_getter)
Span.set_extension("wp2ncid", getter=span_wp2ncid_getter)
Span.set_extension("nctokens", getter=span_nctokens_getter)


def load_danish(spacy_model: str = "da_core_news_sm",
                transformer: str = "Maltehb/danish-bert-botxo"):
    nlp = spacy.load(spacy_model)

    if transformer:
        # add transformer
        # Construction via add_pipe with custom config
        config = {
def f(doc: Doc) -> Doc:
    doc.ents += (Span(doc, 3, 5, "foo"),)
    return doc
def _pseudofy_side(self, rel: brat_data.Relation, sentence: Span, k: int,
                   do_left=True) -> SentenceGenerator:
    rel = _make_contig_rel(rel)
    if not rel:
        return
    # _make_contig_rel does make a deep copy, but no interesting changes have been made yet
    logging.info(f'Original instance: {rel}')

    ent, other_ent = (rel.arg1, rel.arg2) if do_left else (rel.arg2, rel.arg1)

    text = str(sentence)
    span_start = sentence.start_char
    start, end = adjust_spans(ent, -span_start)
    adjust_spans(other_ent, -span_start)

    try:
        original_pos = [t.pos_ for t in sentence.char_span(start, end)]
    except TypeError:
        # The char span doesn't line up with any tokens,
        # thus we can't figure out if the prediction is the right POS
        logging.info('Instance rejected; the spans given do not align with tokens according to the spaCy model')
        return None

    masked_sentence = text[:start] + '[MASK]' + text[end:]
    tokenized_sentence = self.bert_tokenizer.tokenize(masked_sentence)
    indexed_tokens = self.bert_tokenizer.convert_tokens_to_ids(tokenized_sentence)
    token_tensor = torch.tensor(indexed_tokens)
    mask_tensor = torch.tensor([token != '[MASK]' for token in tokenized_sentence], dtype=torch.float)

    if len(token_tensor) > 512:
        # This is the token limit we report on, but the limit depends on the BERT model
        return None

    with torch.no_grad():
        result = self.bert(token_tensor.unsqueeze(0), mask_tensor.unsqueeze(0), masked_lm_labels=None)

    result = result[0].squeeze(0)
    scores = torch.softmax(result, dim=-1)
    mask_index = tokenized_sentence.index('[MASK]')

    topk_scores, topk_indices = torch.topk(scores[mask_index, :], k, sorted=True)
    topk_tokens = self.bert_tokenizer.convert_ids_to_tokens(topk_indices)

    for token, score in zip(topk_tokens, topk_scores):
        new_sent = text[:start] + token + text[end:]
        new_doc = self.nlp(new_sent)

        new_span = new_doc.char_span(start, start + len(token))
        if new_span is None:
            continue
        pos_match = [t.pos_ for t in new_span] == original_pos

        this_rel = deepcopy(rel)
        ent, other_ent = (this_rel.arg1, this_rel.arg2) if do_left else (this_rel.arg2, this_rel.arg1)

        ent.spans = [(start, start + len(token))]
        if ent.start < other_ent.start:
            # If the entity being changed comes before the one not being changed, the spans of the other must
            # also be adjusted; it is not guaranteed that `rel.arg1` always comes before `rel.arg2`
            new_offset = len(token) - len(ent.mention)
            adjust_spans(other_ent, new_offset)
        ent.mention = token

        new_ps = PseudoSentence(this_rel, new_sent, float(score), pos_match)
        logging.info(f'New instance: {new_ps}')
        yield new_ps
def process_support_overlaps(self, doc: Doc, sorted_spans: _OrderedDictItemsView, classes: OrderedDict,
                             attributes: OrderedDict, relations: OrderedDict) -> Doc:
    """:arg a SpaCy Doc, can be overwritten by the subclass as needed.
    This function will add spans to doc._.concepts (defined in the 'read' function above), which allows
    overlapped annotations.
    @param doc: initiated SpaCy Doc
    @param sorted_spans: a sorted OrderedDict Items ( spans[entity_id] = (start, end, span_text))
    @param classes: an OrderedDict to map an entity id to [entity label, [attr_ids]]
    @param attributes: an OrderedDict to map an attribute id to (attribute_name, attribute_value)
    @param relations: an OrderedDict to map a relation_id to (label, (relation_component_ids))
    @return: annotated Doc
    """
    existing_concepts: dict = doc._.concepts
    # token_left_bound = 0
    previous_abs_end = 0
    token_right_bound = len(doc) - 1
    token_start = -1
    token_end = -1
    for id, span_tuple in sorted_spans:
        # Because SpaCy uses token offsets instead of char offsets to define Spans, we need to match them;
        # binary search is used here to speed up.
        if self.store_anno_string:
            start, end, span_txt = span_tuple
        else:
            start, end = span_tuple
        if start < doc[0].idx:
            # If the annotation falls before the 1st SpaCy token, adjust the span to the 1st token
            token_start = 0
            token_end = 1
        elif token_start >= token_right_bound:
            # If the annotation falls after the last SpaCy token, adjust the span to the last token
            self.logger.debug("token_start {} >= token_right_bound {}".format(token_start, token_right_bound))
            token_start = token_right_bound
            token_end = token_right_bound + 1
        else:
            # if start < previous_abs_end:
            #     self.logger.debug("To find {} between token_start - 1({}[{}]) and token_right_bound({}[{}])"
            #                       .format(start, token_start - 1, doc[token_start - 1].idx,
            #                               token_right_bound, doc[token_right_bound].idx), )
            #     token_start = self.find_start_token(start, token_start - 1 if token_start > 0 else 0,
            #                                         token_right_bound, doc)
            #     self.logger.debug('\tfind token_start={}[{}]'.format(token_start, doc[token_start].idx))
            # else:
            self.logger.debug(
                "To find {} between token_start ({}[{}]) and token_right_bound({}[{}])"
                .format(start, token_start, doc[token_start].idx,
                        token_right_bound, doc[token_right_bound].idx))
            token_start = self.find_start_token(start, token_start, token_right_bound, doc)
            self.logger.debug("\tfind start token {}('{}')".format(token_start, doc[token_start]))
            if end >= doc[-1].idx + doc[-1].__len__():
                self.logger.debug("end ({}) >= doc[-1].idx ({}) + doc[-1].__len__() ({})"
                                  .format(end, doc[-1].idx, doc[-1].__len__()))
                token_end = token_right_bound + 1
            else:
                self.logger.debug(
                    "To find token_end starts from {} between token_start ({}[{}]) and token_right_bound({}[{}])"
                    .format(end, token_start, doc[token_start].idx,
                            token_right_bound, doc[token_right_bound].idx))
                token_end = self.find_end_token(end, token_start, token_right_bound, doc)
                self.logger.debug("\tFind end token {}('{}')".format(token_end, doc[token_end]))
        if token_start >= 0 and token_end > 0:
            span = Span(doc, token_start, token_end, label=classes[id][0])
            if self.logger.isEnabledFor(logging.DEBUG):
                import re
                if re.sub(r'\s+', ' ', span._.span_txt) != re.sub(r'\s+', ' ', str(span)):
                    self.logger.debug('{}[{}:{}]\n\t{}<>\n\t{}<>'.format(
                        classes[id][0], token_start, token_end,
                        re.sub(r'\s+', ' ', span._.span_txt),
                        re.sub(r'\s+', ' ', str(span))))
            for attr_id in classes[id][1]:
                if attr_id not in attributes:
                    continue
                attr_name = attributes[attr_id][0]
                attr_value = attributes[attr_id][1]
                setattr(span._, attr_name, attr_value)
            if self.store_anno_string and span_txt is not None:
                setattr(span._, "span_txt", span_txt)
            if classes[id][0] not in existing_concepts:
                existing_concepts[classes[id][0]] = list()
            existing_concepts[classes[id][0]].append(span)
            # token_start = token_end
            previous_abs_end = token_start
        else:
            raise OverflowError(
                'The span of the annotation: {}[{}:{}] is out of document boundary.'
                .format(classes[id][0], start, end))
    return doc
#print("*) PRINT Spacy NER Formatted string - 02") #for i in range(0,len(ner_ita_spacy_doc.ents)): # ner_ita_phrase_01_ner_formatted = ner_ita_phrase_01_ner_formatted.replace(ner_ita_spacy_doc.ents[i].text, ner_ita_spacy_doc.ents[i].text+"("+ner_ita_spacy_doc.ents[i].label_+")") print("*) Spacy ENTITY CORRECTION - ITA") ''' Obtaing hash of labels with spacy ''' money_tag = ner_ita_spacy_doc.vocab.strings["MONEY"] print(money_tag) print("*) Original phrase") print(ner_ita_phrase_01) print(ner_ita_spacy_doc.ents) ## IMPO: SPACY DEFINE AN ADHOC NER ENTITY - MONEY it_money_ner = Span(ner_ita_spacy_doc, 14, 18, label=money_tag) print(it_money_ner) ner_ita_spacy_doc.ents = list(ner_ita_spacy_doc.ents) + [it_money_ner] print(ner_ita_spacy_doc.ents) ner_ita_phrase_01_ner_formatted = ner_ita_phrase_01 #for i in range(0,len(ner_ita_spacy_doc.ents)): # ner_ita_phrase_01_ner_formatted = ner_ita_phrase_01_ner_formatted.replace(ner_ita_spacy_doc.ents[i].text, ner_ita_spacy_doc.ents[i].text+"("+ner_ita_spacy_doc.ents[i].label_+")") ner_ita_phrase_01_ner_formatted = ner_ita_phrase_01 for i in range(0, len(ner_ita_spacy_doc.ents)): ner_ita_phrase_01_ner_formatted = ner_ita_phrase_01_ner_formatted.replace( ner_ita_spacy_doc.ents[i].text, ner_ita_spacy_doc.ents[i].text + "(" + ner_ita_spacy_doc.ents[i].label_ + ")") print("*) PRINT Spacy NER Formatted string - 01 - + new ENTITY - MONEY") print(ner_ita_phrase_01_ner_formatted)
import ahocorasick
import spacy
import textspan
from spacy.tokens import Doc
from spacy.tokens.span import Span
from spacy.util import filter_spans
from typing_extensions import Literal

from camphr.utils import SerializationMixin, get_doc_char_span

# Sometimes the matched text is different from the original text
# since `PatternSearcher` can match on the `lemma`.
# This extension holds the matched text.
PATTERN_MATCH_AS = "pattern_match_as"
Span.set_extension(PATTERN_MATCH_AS, default=None, force=True)


@spacy.component("pattern_searcher")
class PatternSearcher(SerializationMixin):
    serialization_fields = [
        "model",
        "label_type",
        "custom_label",
        "custom_label_map",
        "destructive",
        "lemma",
        "lower",
        "cfg",
        "normalizer",
    ]
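# Illustrative sketch (not from camphr's docs): once PatternSearcher has stored the matched
# surface form, it can be read back through the extension constant. The span indices and
# text are assumptions.
# match_span = doc[2:4]                    # a Span produced by PatternSearcher
# print(match_span._.pattern_match_as)     # original matched text, or None if unset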
def __call__(self, doc):
    # ToDo: Could use refactoring for readability and structure, as it's too complex now
    full_name_spans = []
    min_span_size = 2
    max_span_size = 2
    is_first_char_capped_list = [doc[i].text[0].isupper() for i in range(len(doc))]

    i = 0
    while i < len(doc) - 1:
        is_first_name_this = doc[i]._.get(self.first_name_extension_name)
        is_last_name_next = doc[i + 1]._.get(self.last_name_extension_name)
        both_capped = is_first_char_capped_list[i] and is_first_char_capped_list[i + 1]
        if is_first_name_this and is_last_name_next and both_capped:
            doc[i]._.set(self.token_extension_name, self.ANOT_INIT)
            doc[i + 1]._.set(self.token_extension_name, self.ANOT_OTHER)
            new_span = Span(doc, i, i + 1, label=self.SPAN_LABEL)
            span_start = i
            span_end = i + 1

            # look-back for more first names
            first_first_name = i
            while first_first_name - 1 >= 0 and doc[first_first_name - 1]._.get(self.first_name_extension_name) \
                    and is_first_char_capped_list[first_first_name - 1]:
                first_first_name -= 1
            span_start = first_first_name
            if first_first_name != i:
                # name starts earlier
                doc[i]._.set(self.token_extension_name, self.ANOT_OTHER)
                doc[first_first_name]._.set(self.token_extension_name, self.ANOT_INIT)
                for j in range(first_first_name + 1, i):
                    doc[first_first_name]._.set(self.token_extension_name, self.ANOT_OTHER)

            # Check if considering the last name as a first name would still yield a valid match
            # when adding the following word
            look_ahead_counter = 1
            while (i + 1 + look_ahead_counter) < len(doc) \
                    and doc[i + look_ahead_counter]._.get(self.first_name_extension_name) \
                    and doc[i + 1 + look_ahead_counter]._.get(self.last_name_extension_name) \
                    and is_first_char_capped_list[i + 1 + look_ahead_counter]:
                doc[i + 1 + look_ahead_counter]._.set(self.token_extension_name, self.ANOT_OTHER)
                look_ahead_counter += 1
                span_end += 1

            # Check if it can be extended with more last names
            while (span_end + 1 < len(doc)
                   and doc[span_end + 1]._.get(self.last_name_extension_name)
                   and is_first_char_capped_list[span_end + 1]):
                doc[span_end + 1]._.set(self.token_extension_name, self.ANOT_OTHER)
                span_end += 1

            full_name_spans.append(new_span)
            i = span_end + 1
        else:
            i += 1

    doc._.set(self.doc_extension_name, full_name_spans)
    return doc
def __init__(self, txt_path):
    super(ASK, self).__init__()
    f_path = txt_path
    with open(f_path, 'r') as f:
        txt = f.readlines()
    txt = [x.strip() for x in txt]
    sentence_regex = "(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
    regex = re.compile(sentence_regex)
    txt = [removeParentheses(x) for x in txt if regex.search(x)]
    txt = ''.join(str(elem) for elem in txt)
    self.txt = txt

    nlp = spacy.load('en')
    doc = nlp(unicode(txt))
    questions = []
    words = []
    for word in doc:
        if word.pos_ == "NOUN" or word.pos_ == "VERB":
            words.append(word.lemma_)
    count = Counter(words)

    for sent in doc.sents:
        rel = 0
        for word in sent:
            rel += count.get(word.lemma_, 0)
        rel = rel / float(len(sent))

        if sent[0].text == "This":
            remain = Span(doc, sent.start + 1, sent.end - 1)
            questions.append(("What " + remain.text + "?", rel + 10))

        # What questions
        if (sent.root.lemma_ == "be"):
            for r in sent.root.rights:
                join = ""
                first = True
                for w in r.subtree:
                    if first and w.ent_type_ == "":
                        join += ' ' + w.lower_
                    else:
                        first = False
                        join += ' ' + w.text
                if join[-1] == ",":
                    join = join[:-2]
                what_question_1 = "What " + sent.root.text + " " + join + "?"
                questions.append((what_question_1, rel))
                break
            for r in sent.root.lefts:
                join = ""
                first = True
                for w in r.subtree:
                    if first and w.ent_type_ == "":
                        join += ' ' + w.lower_
                    else:
                        first = False
                        join += ' ' + w.text
                if join[-1] == ",":
                    join = join[:-2]
                what_question_2 = "What " + sent.root.text + " " + join + "?"
                questions.append((what_question_2, rel))

        # Who questions
        subject = ["he", "she"]
        for i in range(0, len(sent) - 1):
            if sent[i].dep_ == "nsubj" and sent[i].ent_type_ == "PERSON" or sent[i].text in subject:
                if i > 0:
                    start = Span(doc, sent.start, i + sent.start - 1)
                else:
                    start = Span(doc, sent.start, sent.start)
                i = i + 1
                while i < len(sent) - 1:
                    if sent[i].ent_type_ == "PERSON":
                        i = i + 1
                    elif sent[i].dep_ == "nsubj":
                        i = i + 1
                    else:
                        break
                end = Span(doc, i + sent.start, sent.end - 1)
                if (len(start) == 0):
                    who_question = "Who " + end.text + "?"
                    questions.append((who_question, rel))
                else:
                    who_question = "Who " + end.text + "?"
                    questions.append((who_question, rel))
                break

        # When questions
        # Only works when the original sentence is "In ____, blah blah."
        for i in range(0, len(sent) - 1):
            if (sent[i].ent_type_ == "DATE" and sent[i].pos_ != "ADJ"):
                hi = Span(doc, sent.start, i + sent.start)
                head = sent[i].head
                while i < len(sent) - 1 and (sent[i].ent_type_ == "DATE" or sent[i].pos_ == "PUNCT"):
                    i = i + 1
                end = Span(doc, i + sent.start, sent.end - 1)
                verb = sent[i]
                for t in sent.root.lefts:
                    verb = t
                if verb.lemma_ == "be":
                    final = "When was "
                else:
                    final = "When did "
                if len(end) > 0 and (end[0].pos_ == "NOUN" or end[0].pos_ == "DET"):
                    for token in end:
                        if verb.lemma_ == "be" and token.lemma_ == "be":
                            final = final
                        elif verb.lemma_ != "be" and token == sent.root:
                            final = final + sent.root.lemma_ + " "
                        else:
                            final = final + token.orth_ + " "
                    when_question_1 = final[:-1] + "?"
                    questions.append((when_question_1, rel))
                break

        for i in range(0, len(sent) - 1):
            if (sent[i].ent_type_ == "DATE"):
                if (i > 0):
                    front = Span(doc, sent.start, i + sent.start - 1)
                else:
                    front = []
                valid = False
                while i < len(sent) - 1 and (sent[i].ent_type_ == "DATE" or sent[i].pos_ == "PUNCT"):
                    i = i + 1
                    valid = True
                if valid:
                    end = Span(doc, sent.start + i, sent.end - 1)
                    verb = sent[i]
                    for t in sent.root.lefts:
                        verb = t
                    if verb.lemma_ == "be":
                        final = "When was "
                    else:
                        final = "When did "
                    if len(end) > 0 and (end[0].pos_ == "NOUN" or end[0].pos_ == "DET"):
                        for token in front:
                            if verb.lemma_ == "be" and token.lemma_ == "be":
                                final = final
                            elif verb.lemma_ != "be" and token == sent.root:
                                final = final + sent.root.lemma_ + " "
                            elif token == front[0] and token.ent_type_ == "":
                                final = final + token.lower_ + " "
                            else:
                                final = final + token.orth_ + " "
                        for token in end:
                            if verb.lemma_ == "be" and token.lemma_ == "be":
                                final = final
                            elif verb.lemma_ != "be" and token == sent.root:
                                final = final + sent.root.lemma_ + " "
                            else:
                                final = final + token.orth_ + " "
                        when_question_2 = final[:-1] + "?"
                        questions.append((when_question_2, rel))
                break

        # Where questions
        wheretag = ["GPE", "LOC", "FACILTY", "ORG"]
        for i in range(0, len(sent) - 1):
            if (sent[i].ent_type_ in wheretag) and (sent[i - 1].ent_type_ not in wheretag) and (sent[i - 1].tag_ == "IN"):
                oneloc = " " + str(sent[i - 1]) + " " + str(sent[i])
                j = i
                while j < len(sent) - 1 and sent[j + 1].ent_type_ in wheretag:
                    oneloc += " " + str(sent[j + 1])
                    j = j + 1
                i = j
                where_question1 = ""
                where_question2 = ""
                for k in range(0, len(sent) - 1):
                    if sent[k] == sent.root and (sent[k - 1].pos_ == 'VERB'):
                        where_question1 = "Where" + " " + two_verbs(k, sent, doc)[0]
                        where_question2 = "Where" + " " + two_verbs(k, sent, doc)[1]
                    elif sent[k] == sent.root:
                        where_question1 = "Where" + " " + one_verb(k, sent, doc)[0]
                        where_question2 = "Where" + " " + one_verb(k, sent, doc)[1]
                where_question1 = where_question1.replace(oneloc, "")
                where_question2 = where_question2.replace(oneloc, "")
                questions.append((where_question1, rel))
                questions.append((where_question2, rel))
                break

        # DO / DID / DOES / HAVE / WAS / IS questions
        for i in range(0, len(sent) - 1):
            vquest1 = ""
            vquest2 = ""
            if sent[i] == sent.root and (sent[i - 1].pos_ == 'VERB'):
                vquest1 = (two_verbs(i, sent, doc)[0])
                vquest1 = vquest1[0].capitalize() + vquest1[1:]
                vquest2 = (two_verbs(i, sent, doc)[1])
                vquest2 = vquest2[0].capitalize() + vquest2[1:]
                questions.append((vquest1, rel))
                questions.append((vquest2, rel))
                break
            elif sent[i] == sent.root:
                vquest1 = (one_verb(i, sent, doc)[0])
                vquest1 = vquest1[0].capitalize() + vquest1[1:]
                vquest2 = (one_verb(i, sent, doc)[1])
                vquest2 = vquest2[0].capitalize() + vquest2[1:]
                questions.append((vquest1, rel))
                questions.append((vquest2, rel))
                break

    pronouns = ["he", "she", "his", "her"]
    questions = sorted(questions, key=lambda x: x[1], reverse=True)
    goodQuestions = []
    goodPronouns = []
    for q in questions:
        if (q[0].count(' ') > 3):
            sentence = q[0]
            try:
                matches = tool.check(sentence)
                if len(matches) == 0:
                    words = q[0].split()
                    if not any(p == w for w in words for p in pronouns):
                        goodQuestions.append(sentence)
                    else:
                        goodPronouns.append(sentence)
            except:
                words = q[0].split()
                if not any(p == w for w in words for p in pronouns):
                    goodQuestions.append(sentence)
                else:
                    goodPronouns.append(sentence)

    count = 0
    for i in range(0, 4):
        for j in range(0, len(goodQuestions) / 4):
            if count < qNum:
                count = count + 1
                print goodQuestions[j * 4 + i]
    for i in range(0, 4):
        for j in range(0, len(goodPronouns) / 4):
            if count < qNum:
                count = count + 1
                print goodPronouns[j * 4 + i]
def __init__(self, txt_path):
    super(NER, self).__init__()
    cur_path = os.path.dirname(__file__)
    rel_path = '../data/' + txt_path
    f_path = os.path.join(cur_path, rel_path)
    with open(f_path, 'r') as f:
        txt = f.readlines()
    txt = [x.strip() for x in txt]
    sentence_regex = "(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
    regex = re.compile(sentence_regex)
    txt = [x for x in txt if regex.search(x)]
    txt = ''.join(str(elem) for elem in txt)
    self.txt = txt

    nlp = spacy.load('en')
    doc = nlp(unicode(txt))
    questions = []
    when = []
    words = []
    for word in doc:
        if word.pos_ == "NOUN" or word.pos_ == "VERB":
            words.append(word.lemma_)
    count = Counter(words)

    for sent in doc.sents:
        rel = 0
        for word in sent:
            rel += count.get(word.lemma_, 0)
        rel = rel / float(len(sent))

        # What questions
        if (sent.root.lemma_ == "be"):
            for r in sent.root.rights:
                join = ' '.join(w.text for w in r.subtree)
                if join[-1] == ",":
                    join = join[:-2]
                what_question_1 = "What " + sent.root.text + " " + join + "?"
                questions.append((format_question(what_question_1), rel + gscore(what_question_1)))
                break
            for r in sent.root.lefts:
                join = ' '.join(w.text for w in r.subtree)
                if join[-1] == ",":
                    join = join[:-2]
                what_question_2 = "What " + sent.root.text + " " + join + "?"
                questions.append((format_question(what_question_2), rel + gscore(what_question_2)))

        # Who questions
        subject = ["he", "she"]
        for i in range(0, len(sent) - 1):
            if sent[i].dep_ == "nsubj" and sent[i].ent_type_ == "PERSON" or sent[i].text in subject:
                if i > 0:
                    start = Span(doc, sent.start, i + sent.start - 1)
                else:
                    start = Span(doc, sent.start, sent.start)
                i = i + 1
                while i < len(sent) - 1:
                    if sent[i].ent_type_ == "PERSON":
                        i = i + 1
                    elif sent[i].dep_ == "nsubj":
                        i = i + 1
                    else:
                        break
                end = Span(doc, i + sent.start, sent.end - 1)
                if (len(start) == 0):
                    # print "Who " + end.text + "?"
                    who_question = "Who " + end.text + "?"
                    questions.append((format_question(who_question), rel + gscore(who_question)))
                else:
                    # print start.text + " who " + end.text + "?"
                    who_question = start.text + " who " + end.text + "?"
                    questions.append((format_question(who_question), rel + gscore(who_question)))
                break

        # When questions
        # Only works when the original sentence is "In ____, blah blah."
        for i in range(0, len(sent) - 1):
            if (sent[i].ent_type_ == "DATE" and sent[i].pos_ != "ADJ"):
                hi = Span(doc, sent.start, i + sent.start)
                head = sent[i].head
                while i < len(sent) - 1 and (sent[i].ent_type_ == "DATE" or sent[i].pos_ == "PUNCT"):
                    i = i + 1
                end = Span(doc, i + sent.start, sent.end - 1)
                verb = sent[i]
                for t in sent.root.lefts:
                    verb = t
                if verb.lemma_ == "be":
                    final = "When was "
                else:
                    final = "When did "
                for token in end:
                    if verb.lemma_ == "be" and token.lemma_ == "be":
                        final = final
                    elif verb.lemma_ != "be" and token == sent.root:
                        final = final + sent.root.lemma_ + " "
                    else:
                        final = final + token.orth_ + " "
                # print final[:-1] + "?"
                when_question_1 = final[:-1] + "?"
                questions.append((when_question_1, rel + gscore(when_question_1)))
                when.append((when_question_1, rel + gscore(when_question_1)))
                break

        for i in range(0, len(sent) - 1):
            if (sent[i].ent_type_ == "DATE"):
                if (i > 0):
                    front = Span(doc, sent.start, i + sent.start - 1)
                else:
                    front = []
                valid = False
                while i < len(sent) - 1 and (sent[i].ent_type_ == "DATE" or sent[i].pos_ == "PUNCT"):
                    i = i + 1
                    valid = True
                if valid:
                    end = Span(doc, sent.start + i, sent.end - 1)
                    verb = sent[i]
                    for t in sent.root.lefts:
                        verb = t
                    if verb.lemma_ == "be":
                        final = "When was "
                    else:
                        final = "When did "
                    for token in front:
                        if token.ent_type_ == "":
                            final = final + token.lower_ + " "
                        else:
                            final = final + token.text + " "
                    for token in end:
                        if verb.lemma_ == "be" and token.lemma_ == "be":
                            final = final
                        elif verb.lemma_ != "be" and token == sent.root:
                            final = final + sent.root.lemma_ + " "
                        else:
                            final = final + token.orth_ + " "
                    # print final[:-1] + "?"
                    when_question_2 = final[:-1] + "?"
                    questions.append((when_question_2, rel + gscore(when_question_2)))
                    when.append((when_question_2, rel + gscore(when_question_2)))
                break

        # Where questions
        for i in range(0, len(sent) - 1):
            # if (sent[i].ent_type_ == "GPE" and sent[i-1].ent_type_ != "GPE"):
            if (sent[i].ent_type_ == "GPE" and sent[i - 1].ent_type_ != "GPE" and sent[i - 1].tag_ == "IN"):
                # print sent
                oneloc = str(sent[i])
                j = i
                while j < len(sent) - 1 and sent[j + 1].ent_type_ == "GPE":
                    oneloc += " " + str(sent[j + 1])
                    j = j + 1
                # possible_locations.append(oneloc)
                i = j
                for r in sent.root.rights:
                    if sent.root.lemma_ == "be":
                        final = "Where was "
                    elif sent.root.tag_ == "VB":
                        final = "Where do "
                    elif sent.root.tag_ == "VBP":
                        final = "Where have "
                    else:
                        final = "Where did "
                    subject = ''
                    for l in sent.root.lefts:
                        if l.right_edge.dep_ == 'nsubj':
                            subject = str(l.right_edge)
                            break
                        else:
                            subject = ' '.join(w.text for w in l.subtree)
                    final += subject + " " + sent.root.lemma_ + " " + ' '.join(w.text for w in r.subtree) + "?"
                    break
                final = final.replace(oneloc, "")
                where_question = final[:-1] + "?"
                questions.append((where_question, rel + gscore(where_question)))
                # print ""
                break

        # DO / DID / DOES / HAVE questions
        if (sent.root.pos_ == "VERB"):
            # print sent.root.tag_
            v_question = ""
            for r in sent.root.rights:
                if sent.root.tag_ == 'VB':
                    v_question = 'Do'
                elif sent.root.tag_ == 'VBD':
                    v_question = 'Did'
                elif sent.root.tag_ == 'VBZ':
                    v_question = 'Does'
                elif sent.root.tag_ == 'VBP':
                    v_question = 'Have'
                elif sent.root.tag_ == 'VBN':
                    break
                else:
                    break
                subject = ''
                for l in sent.root.lefts:
                    if l.right_edge.dep_ == 'nsubj':
                        subject = str(l.right_edge)
                        break
                    elif v_question == 'Have':
                        subject = ' '.join(w.text for w in l.subtree)
                        break
                    else:
                        subject = ' '.join(w.text for w in l.subtree)
                v_question += " " + subject + " " + sent.root.lemma_ + " " + ' '.join(w.text for w in r.subtree) + '?'
                break
            # print v_question
            questions.append((v_question, rel + gscore(v_question)))
            # print ""

        # Is / Was / Were questions
        if (sent.root.lemma_ == "be"):
            for r in sent.root.rights:
                if sent.root.text == '\'s':
                    isquest1 = 'Is'
                else:
                    isquest1 = sent.root.text.capitalize()
                for l in sent.root.lefts:
                    isquest1 += " " + ' '.join(w.text for w in l.subtree)
                isquest1 += ' ' + ' '.join(w.text for w in r.subtree) + '?'
                break
            # print isquest1
            questions.append((isquest1, rel + gscore(isquest1)))
            # print ""

    questions = sorted(questions, key=lambda x: x[1], reverse=True)
    goodQuestions = []
    for q in questions:
        if q[0].count(' ') > 3:
            sentence = q[0]
            matches = tool.check(sentence)
            if len(matches) == 0:
                goodQuestions.append(q[0])

    for i in range(0, qNum):
        if (i * 4) < len(goodQuestions):
            print goodQuestions[i * 4]