def pysbd_sentencizer(doc: Doc) -> Doc:
    """Adds sentence boundaries to a Doc.

    Intended to be used as a pipe in a spaCy pipeline. Uses
    https://github.com/nipunsadvilkar/pySBD to get proper sentences and
    their respective char_spans.

    Handles special cases:
    New lines cannot be end of sentence tokens.
    New lines that separate sentences will be added to the beginning of the
    next sentence.

    @param doc: the spaCy document to be annotated with sentence boundaries
    @return: the same document, with `is_sent_start` set on every token
    """
    segmenter = pysbd.Segmenter(language="en", clean=False, char_span=True)
    sents_char_spans: List[TextSpan] = segmenter.segment(doc.text)
    char_spans = [
        doc.char_span(sent_span.start, sent_span.end)
        for sent_span in sents_char_spans
    ]
    # Sentences whose offsets do not line up with token boundaries come back
    # as None and are ignored. A set keeps the per-token lookup below O(1).
    start_token_char_offsets = {
        span[0].idx for span in char_spans if span is not None
    }
    for token in doc:
        prev_token = token.nbor(-1) if token.i != 0 else None
        if token.idx in start_token_char_offsets:
            # pySBD proposed a sentence start here; veto it when the previous
            # token is a known abbreviation.
            if prev_token and prev_token.text in ABBREVIATIONS:
                token.is_sent_start = False
            else:
                token.is_sent_start = True
        # a previous (non-initial) token holding at least two newline chars
        # also starts a new sentence
        elif prev_token and prev_token.i != 0 and prev_token.text.count("\n") >= 2:
            token.is_sent_start = True
        else:
            token.is_sent_start = False
    return doc
def convert_file(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    """Convert a CSV annotation file into a serialized spaCy DocBin.

    The first line of the input is treated as a header and skipped. For every
    other line, columns 0 and 1 are read as the sentence and its token list
    and handed to `line_to_dict`; only records whose "answer" is "accept"
    are converted.

    Args:
        input_path: Existing CSV file to read.
        output_path: Destination of the serialized DocBin.
    """
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    # Only the input needs an open handle: DocBin.to_disk writes the output
    # file itself, so no separate write handle is opened for it.
    with open(input_path, "r") as in_f:
        for line_no, line in enumerate(tqdm(in_f)):
            if line_no == 0:
                # header row
                continue
            sentence, tokens = pd.read_csv(
                StringIO(line), header=None, usecols=[0, 1]
            ).values[0]
            # SECURITY NOTE(review): eval() executes arbitrary code from the
            # input file. Only run this on trusted data; if the column always
            # holds a plain Python literal, ast.literal_eval is the safe
            # replacement.
            tokens = eval(tokens)
            eg = line_to_dict(sentence, tokens)
            if eg["answer"] != "accept":
                continue
            token_texts = [token["text"] for token in eg["tokens"]]
            words, spaces = get_words_and_spaces(token_texts, eg["text"])
            doc = Doc(nlp.vocab, words=words, spaces=spaces)
            ents = []
            for s in eg.get("spans", []):
                span = doc.char_span(s["start"], s["end"], label=s["label"])
                if span is None:
                    # char_span returns None when the offsets do not fall on
                    # token boundaries; doc.ents cannot hold None.
                    print(
                        f"Skipping span [{s['start']}, {s['end']}, {s['label']}]: "
                        "character offsets do not align with token boundaries"
                    )
                else:
                    ents.append(span)
            doc.ents = ents
            doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path}")
def _mk_spacy_doc(tokens, entities):
    """Build a Doc from `tokens` and attach `entities` as named entities.

    Every token is followed by a space. Each entity is a mapping with
    character offsets under "start"/"end" and its label under "entity".
    """
    vocab = spacy.blank("en").vocab
    trailing_spaces = [True] * len(tokens)
    doc = Doc(vocab, words=tokens, spaces=trailing_spaces)
    for entity in entities:
        new_span = doc.char_span(
            entity["start"], entity["end"], label=entity["entity"]
        )
        # Extend the entity tuple one span at a time.
        doc.ents = [*doc.ents, new_span]
    return doc
def get_doc_char_span(
    doc: Doc, i: int, j: int, destructive: bool = True, **kwargs
) -> Optional[Span]:
    """Get Span from Doc with char position, similar to doc.char_span.

    Args:
        i: The index of the first character of the span
        j: The index of the first character after the span
        destructive: If True, tokens in [i,j) will be split so that a span
            can be returned.
        kwargs: passed to Doc.char_span
    """
    found = doc.char_span(i, j, **kwargs)
    if found or not destructive:
        return found
    # No usable span on the current tokenization: split the tokens around
    # [i, j) and look the span up again.
    destruct_token(doc, i, j)
    return doc.char_span(i, j, **kwargs)
def get_doc_char_span(
    doc: Doc,
    i: int,
    j: int,
    destructive: bool = True,
    covering: bool = False,
    **kwargs
) -> Optional[Span]:
    """Get Span from Doc with char position, similar to doc.char_span.

    Args:
        i: The index of the first character of the span
        j: The index of the first character after the span
        destructive: If True, tokens in [i,j) will be split so that a span
            can be returned.
        covering: If True, [i,j) will be adjusted to match the existing token
            boundaries. It takes precedence over `destructive`.
        kwargs: passed to Doc.char_span
    """
    found = doc.char_span(i, j, **kwargs)
    if found:
        return found
    if covering:
        # First fallback: widen the range to existing token boundaries.
        found = _get_covering_span(doc, i, j, **kwargs)
        if found:
            return found
    if destructive:
        # Last resort: split tokens around [i, j) and look the span up again.
        destruct_token(doc, i, j)
        return doc.char_span(i, j, **kwargs)
    return found
def _mk_spacy_doc(tokens, entities):
    """Build a Doc from `tokens` and attach the non-overlapping `entities`.

    Every token is followed by a space. Each entity is a mapping with
    character offsets under "start"/"end" and its label under "entity".

    Returns:
        A `(doc, warn)` tuple; `warn` is True when at least one entity was
        dropped because it duplicated or overlapped an earlier one.
    """
    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=tokens, spaces=[True for _ in tokens])
    # This is a checking mechanism. Rasa allows for overlapping entities,
    # spaCy does not: assigning overlapping spans to doc.ents raises. The
    # first entity covering a stretch of text wins, later ones are dropped.
    taken = []
    spans = []
    warn = False
    for ent in entities:
        start, end = ent["start"], ent["end"]
        clashes = (start, end) in taken or any(
            start < taken_end and taken_start < end
            for taken_start, taken_end in taken
        )
        if clashes:
            warn = True
            continue
        spans.append(doc.char_span(start, end, label=ent["entity"]))
        taken.append((start, end))
    # Assign once instead of rebuilding the entity tuple for every span.
    doc.ents = spans
    return doc, warn
def find_spans(self, doc: Doc) -> Iterable[Tuple[int, int, str]]:
    """Runs the parser on the spacy document, and convert the result to labels.

    The text handed to the parser is a sanitised copy of `doc.text`, so the
    character ranges it reports are mapped back to positions in `doc.text`
    before being turned into token spans.

    Yields:
        `(start, end, label)` tuples, where start/end are token indices.
    """
    # The current version of Snips has a bug that makes it crash with some rare
    # Turkish characters, or mentions of "billion years". Typographic quotes
    # and dashes are replaced one-for-one, and characters that cannot be
    # encoded as ISO-8859-15 are dropped. origins[k] is the index in doc.text
    # of the k-th character that survives.
    replacements = {"’": "'", "”": "\"", "“": "\"", "—": "-"}
    chars = []
    origins = []
    for index, char in enumerate(doc.text):
        char = replacements.get(char, char)
        if not char.encode("iso-8859-15", "ignore"):
            continue
        chars.append(char)
        origins.append(index)
    text = "".join(chars)

    # Append ".0" to the number in "<number> [bm]illion ... years". The two
    # inserted characters are attributed to the last digit of that number so
    # that every parser offset still maps onto a real character.
    pattern = ("(\\d+) ([bm]illion(?: (?:\\d+|one|two|three|four|five|six|seven"
               + "|eight|nine|ten))? years?)")
    pieces = []
    aligned = []
    cursor = 0
    for found in re.finditer(pattern, text):
        cut = found.end(1)
        pieces.append(text[cursor:cut] + ".0")
        aligned.extend(origins[cursor:cut])
        aligned.extend([origins[cut - 1]] * 2)
        cursor = cut
    pieces.append(text[cursor:])
    aligned.extend(origins[cursor:])
    text = "".join(pieces)

    results = self.parser.parse(text)
    for result in results:
        first = result["range"]["start"]
        last = result["range"]["end"] - 1
        if not 0 <= first <= last < len(aligned):
            continue
        # Translate the range from the sanitised text back to doc.text.
        span = doc.char_span(aligned[first], aligned[last] + 1)
        if span is None or span.text.lower() in {"now"} or span.text in {"may"}:
            continue
        label = None
        if (result["entity_kind"] == "snips/number"
                and span.text.lower() not in {"one", "some", "few", "many", "several"}):
            label = "CARDINAL"
        elif (result["entity_kind"] == "snips/ordinal"
              and span.text.lower() not in {"first", "second", "the first", "the second"}):
            label = "ORDINAL"
        elif result["entity_kind"] == "snips/temperature":
            label = "QUANTITY"
        elif result["entity_kind"] == "snips/amountOfMoney":
            label = "MONEY"
        elif result["entity_kind"] == "snips/percentage":
            label = "PERCENT"
        elif result["entity_kind"] in {"snips/date", "snips/datePeriod", "snips/datetime"}:
            label = "DATE"
        elif result["entity_kind"] in {"snips/time", "snips/timePeriod"}:
            label = "TIME"
        if label:
            yield span.start, span.end, label
def __call__(self, text: str) -> Doc:
    """Tokenize `text` into a Doc using `self.detailed_tokens`.

    Tag, lemma and the extension attribute named by `self.key_fstring` are
    copied from the detailed tokens; afterwards every `RE_URL` match that
    aligns with token boundaries is merged into a single token.
    """
    detailed = self.detailed_tokens(text)
    surfaces = [item.surface for item in detailed]
    trailing = [item.space for item in detailed]
    doc = Doc(self.vocab, words=surfaces, spaces=trailing)

    for tok, item in zip(doc, detailed):
        tok.tag_ = item.pos
        # A lemma of "*" is replaced by the token's own text.
        tok.lemma_ = tok.text if item.lemma == "*" else item.lemma
        tok._.set(self.key_fstring, item.fstring)

    with doc.retokenize() as retokenizer:
        for url in RE_URL.finditer(doc.text):
            url_span = doc.char_span(*url.span())
            if url_span:
                retokenizer.merge(url_span)

    doc.is_tagged = True
    return doc
def convert_file(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    """Convert a JSONL annotation file into a serialized spaCy DocBin.

    Only records whose "answer" is "accept" are converted. Each record
    supplies its tokens, its text and optionally a list of entity spans given
    as character offsets.

    Args:
        input_path: Existing JSONL file to read.
        output_path: Destination of the serialized DocBin.
    """
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in tqdm(srsly.read_jsonl(input_path)):
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        ents = []
        for s in eg.get("spans", []):
            span = doc.char_span(s["start"], s["end"], label=s["label"])
            if span is None:
                # char_span returns None when the offsets do not fall on
                # token boundaries; doc.ents cannot hold None.
                print(
                    f"Skipping span [{s['start']}, {s['end']}, {s['label']}]: "
                    "character offsets do not align with token boundaries"
                )
            else:
                ents.append(span)
        doc.ents = ents
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
def main(input_path: Path = typer.Argument(..., exists=True, dir_okay=False)):
    """Convert JSONL annotations into train and dev DocBin files.

    The dev set size and the shuffle seed are read from `params.yaml` in the
    current working directory (keys `train.corpora.dev_size` and
    `train.corpora.shuffle_seed`). The two outputs are written next to the
    input, with its suffix replaced by `.train.spacy` and `.dev.spacy`.

    Args:
        input_path: Existing JSONL file with the annotations.
    """
    print("Read params.yaml...")
    with open("params.yaml", "r") as fd:
        params = yaml.safe_load(fd)
    dev_size = params["train"]["corpora"]["dev_size"]
    shuffle_seed = params["train"]["corpora"]["shuffle_seed"]
    print(f"...read dev_size={dev_size}, shuffle_seed={shuffle_seed}")

    print("Read annotations...")
    corpus = list(srsly.read_jsonl(input_path))
    print(f"...read {len(corpus)} texts")

    print("Convert into documents...")
    docs = []
    nlp = spacy.blank("en")
    for eg in corpus:
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        ents = []
        for s in eg.get("spans", []):
            span = doc.char_span(s["start"], s["end"], label=s["label"])
            if span is None:
                # char_span returns None when the offsets do not fall on
                # token boundaries; doc.ents cannot hold None.
                print(
                    f"Skipping span [{s['start']}, {s['end']}, {s['label']}]: "
                    "character offsets do not align with token boundaries"
                )
            else:
                ents.append(span)
        doc.ents = ents
        docs.append(doc)
    print(f"...converted {len(docs)} documents")

    print("Split into train and dev...")
    train, dev = train_test_split(
        docs, test_size=dev_size, random_state=shuffle_seed, shuffle=True
    )
    print(f"...split into {len(train)} train and {len(dev)} dev documents")

    print("Write serialized documents...")
    for split, data in [("train", train), ("dev", dev)]:
        output_path = input_path.with_suffix(f".{split}.spacy")
        doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"], docs=data)
        doc_bin.to_disk(output_path)
        print(f"...wrote {output_path}")
# Entity labels that a verb ("V") span may be paired with when VERBS_TO_OTHER
# is enabled.
_VERB_ARGUMENT_LABELS = ("Z", "TOOL", "ATTR", "TEMP", "DAUER", "ZEITP", "PRÄP")


def _within_token_distance(x1, x2):
    """Return True if span starts x1 (head) and x2 (child) are close enough to pair."""
    diff = x1 - x2
    if DIFF_FRONT_BACK:
        # Asymmetric window: the child may start up to BACK tokens before the
        # head (or on it), or up to FRONT tokens after it.
        return 0 <= diff <= BACK or -FRONT <= diff < 0
    return abs(diff) <= TOKEN_LENGTH


def _candidate_pairs(span_starts, ents_dict):
    """Yield every (head, child) pair of span start tokens that may carry a relation."""
    for x1 in span_starts:
        if VERBS_TO_OTHER and ents_dict[x1][0] != "V":
            continue
        for x2 in span_starts:
            if VERBS_TO_OTHER and ents_dict[x2][0] not in _VERB_ARGUMENT_LABELS:
                continue
            if _within_token_distance(x1, x2):
                yield x1, x2


def _detailed_label(label, child_type):
    """Refine ARG0/ARG1/ARG by the child's entity type; other labels pass through."""
    if label == "ARG0":
        table = MAP_LABELS_ARG0 if child_type in ("Z", "TOOL") else MAP_LABELS_ARG
        return table[child_type]
    if label == "ARG1":
        table = MAP_LABELS_ARG1 if child_type in ("Z", "TOOL") else MAP_LABELS_ARG
        return table[child_type]
    if label == "ARG":
        if child_type == "Z":
            return "Arg0Z"
        if child_type == "TOOL":
            return "Arg1Tool"
        return MAP_LABELS_ARG[child_type]
    return label


def _negative_labels():
    """Return the labels every candidate pair must carry (0.0 when not annotated)."""
    if DETAILED_ARGS:
        return (
            list(MAP_LABELS_ARG0.values())
            + list(MAP_LABELS_ARG1.values())
            + list(MAP_LABELS_ARG.values())
        )
    return list(MAP_LABELS_STANDARD.values())


def main(json_loc: Path, train_file: Path, dev_file: Path, test_file: Path,
         test_split=0.189, train_split=0.709):
    """Creating the corpus from the Prodigy annotations.

    Every accepted example (one recipe) becomes a Doc whose entities come
    from the annotated spans and whose `doc._.rel` maps each candidate
    (head start token, child start token) pair to a {label: 0.0 | 1.0} dict.
    Which pairs are candidates and how labels are mapped is controlled by the
    module-level settings VERBS_TO_OTHER, DIFF_FRONT_BACK and DETAILED_ARGS,
    which are expected to be booleans. Documents with at least one positive
    relation are assigned, in file order, to train until its quota is full,
    then to test, then to dev.

    Args:
        json_loc: JSONL file with the Prodigy annotations.
        train_file: Output path of the training DocBin.
        dev_file: Output path of the dev DocBin.
        test_file: Output path of the test DocBin.
        test_split: Fraction of the accepted examples that goes to test.
        train_split: Fraction of the accepted examples that goes to train.
    """
    Doc.set_extension("rel", default={})
    vocab = Vocab()

    docs = {"train": [], "dev": [], "test": []}
    ids = {"train": set(), "dev": set(), "test": set()}
    count_all = {"train": 0, "dev": 0, "test": 0}
    count_pos = {"train": 0, "dev": 0, "test": 0}
    long_rel_count = 0  # annotated relations whose pair is not a candidate
    error_count_rel = 0  # relations labelled something other than ARG0/ARG1/ARG

    # Read the file once; the number of accepted examples fixes the quotas.
    with json_loc.open("r", encoding="utf8") as jsonfile:
        examples = [json.loads(line) for line in jsonfile]
    accepted = [example for example in examples if example["answer"] == "accept"]
    length_training_data = len(accepted)
    msg.info(f"Number of accepted recipes: {length_training_data}")
    train_quota = round(train_split * length_training_data)
    test_quota = round(test_split * length_training_data)
    negative_labels = _negative_labels()

    for example in accepted:
        neg = 0
        pos = 0
        try:
            # Parse the tokens: "ws" says whether a token is followed by whitespace
            words = [t["text"] for t in example["tokens"]]
            spaces = [t["ws"] for t in example["tokens"]]
            doc = Doc(vocab, words=words, spaces=spaces)

            # Parse the entities
            entities = []
            span_starts = set()
            span_end_to_start = {}
            ents_dict = {}
            for span in example["spans"]:
                entity = doc.char_span(
                    span["start"], span["end"], label=span["label"]
                )
                span_end_to_start[span["token_end"]] = span["token_start"]
                entities.append(entity)
                span_starts.add(span["token_start"])
                ents_dict[span["token_start"]] = (
                    span["label"], span["token_start"]
                )
            doc.ents = entities

            # Parse the relations: one (initially empty) dict per candidate pair
            rels = {
                pair: {} for pair in _candidate_pairs(span_starts, ents_dict)
            }
            for relation in example["relations"]:
                # the 'head' and 'child' annotations refer to the end token in
                # the span, but we want the first token
                start = span_end_to_start[relation["head"]]
                end = span_end_to_start[relation["child"]]
                label = relation["label"]
                if DETAILED_ARGS:
                    if label not in ("ARG0", "ARG1", "ARG"):
                        error_count_rel += 1
                    label = _detailed_label(label, ents_dict[end][0])
                else:
                    label = MAP_LABELS_STANDARD[label]

                # Positive relations are being added
                pair_labels = rels.get((start, end))
                if pair_labels is None:
                    # annotated, but not a valid pair (too far apart or not
                    # starting from a verb)
                    long_rel_count += 1
                elif label not in pair_labels:
                    pair_labels[label] = 1.0
                    pos += 1

            # The annotation is complete, so fill in zero's where the data is
            # missing
            for pair_labels in rels.values():
                for label in negative_labels:
                    if label not in pair_labels:
                        neg += 1
                        pair_labels[label] = 0.0
            doc._.rel = rels

            # only keeping documents with at least 1 positive case
            if pos > 0:
                recipe_id = example["_input_hash"]
                if len(docs["train"]) < train_quota:
                    split = "train"
                elif len(docs["test"]) < test_quota:
                    split = "test"
                else:
                    split = "dev"
                ids[split].add(recipe_id)
                docs[split].append(doc)
                count_pos[split] += pos
                count_all[split] += pos + neg
        except KeyError as e:
            msg.fail(
                f"Skipping doc because of key error: {e} in {example.get('_input_hash')}"
            )

    msg.info(
        f"{long_rel_count} relations have been cut because tokens are too far apart."
    )
    if error_count_rel:
        msg.info(
            f"{error_count_rel} relations had a label other than ARG0, ARG1 or ARG."
        )

    outputs = (
        ("train", train_file, "training"),
        ("dev", dev_file, "dev"),
        ("test", test_file, "test"),
    )
    for split, path, noun in outputs:
        docbin = DocBin(docs=docs[split], store_user_data=True)
        docbin.to_disk(path)
        msg.info(
            f"{len(docs[split])} {noun} recipes from {len(ids[split])} unique recipes, "
            f"{count_pos[split]}/{count_all[split]} pos instances."
        )
def __call__(self, text):
    """Convert a Stanza Doc to a spaCy Doc.

    Runs `self.snlp` on the text, aligns the Stanza tokens with the text
    (adding whitespace tokens where needed) and copies POS, tag, morphology,
    dependency, head, lemma and, when alignment allows, entity annotations.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    # Empty and whitespace-only input never reaches the Stanza pipeline.
    if not text:
        return Doc(self.vocab)
    elif text.isspace():
        return Doc(self.vocab, words=[text], spaces=[False])

    snlp_doc = self.snlp(text)
    text = snlp_doc.text
    snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc)
    words = []
    spaces = []
    pos = []
    tags = []
    morphs = []
    deps = []
    heads = []
    lemmas = []
    offset = 0
    token_texts = [t.text for t in snlp_tokens]
    is_aligned = True
    try:
        words, spaces = self.get_words_and_spaces(token_texts, text)
    except ValueError:
        # Fall back to the bare Stanza tokens, each followed by a space.
        words = token_texts
        spaces = [True] * len(words)
        is_aligned = False
        warnings.warn(
            "Due to multiword token expansion or an alignment "
            "issue, the original text has been replaced by space-separated "
            "expanded tokens.",
            stacklevel=4,
        )
    # `offset` is decremented for every inserted whitespace word, so that
    # `i + offset` indexes the Stanza token matching word `i`.
    offset = 0
    for i, word in enumerate(words):
        if word.isspace() and (
            i + offset >= len(snlp_tokens) or word != snlp_tokens[i + offset].text
        ):
            # insert a space token
            pos.append("SPACE")
            tags.append("_SP")
            morphs.append("")
            deps.append("")
            lemmas.append(word)

            # increment any heads left of this position that point beyond
            # this position to the right (already present in heads)
            for j in range(0, len(heads)):
                if j + heads[j] >= i:
                    heads[j] += 1

            # decrement any heads right of this position that point beyond
            # this position to the left (yet to be added from snlp_heads)
            for j in range(i + offset, len(snlp_heads)):
                if j + snlp_heads[j] < i + offset:
                    snlp_heads[j] -= 1

            # initial space tokens are attached to the following token,
            # otherwise attach to the preceding token
            if i == 0:
                heads.append(1)
            else:
                heads.append(-1)

            offset -= 1
        else:
            token = snlp_tokens[i + offset]
            assert word == token.text

            pos.append(token.upos or "")
            tags.append(token.xpos or token.feats or "")
            morphs.append(token.feats or "")
            deps.append(token.deprel or "")
            heads.append(snlp_heads[i + offset])
            lemmas.append(token.lemma or "")

    doc = Doc(
        self.vocab,
        words=words,
        spaces=spaces,
        pos=pos,
        tags=tags,
        morphs=morphs,
        lemmas=lemmas,
        deps=deps,
        # heads are kept relative to their own token above; adding the token
        # index turns them into absolute positions here
        heads=[head + i for i, head in enumerate(heads)],
    )
    ents = []
    for ent in snlp_doc.entities:
        ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
        ents.append(ent_span)
    # Entities are set only if the text is aligned and every entity produced
    # a usable span; otherwise none of them are set.
    if not is_aligned or not all(ents):
        warnings.warn(
            f"Can't set named entities because of multi-word token "
            f"expansion or because the character offsets don't map to "
            f"valid tokens produced by the Stanza tokenizer:\n"
            f"Words: {words}\n"
            f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
            stacklevel=4,
        )
    else:
        doc.ents = ents

    # Register the vector hooks only when `self.svecs` is set.
    if self.svecs is not None:
        doc.user_token_hooks["vector"] = self.token_vector
        doc.user_token_hooks["has_vector"] = self.token_has_vector
    return doc
def match(
    self,
    doc: Doc,
    regex_str: str,
    partial: bool = True,
    predef: bool = False,
) -> List[Tuple[int, int]]:
    """Returns all the regex matches within doc.

    Matches on the character level and then maps matches back to tokens.
    If a character cannot be mapped back to a token it means
    it is a space tokens are split on, which happens when regex matches
    produce leading or trailing whitespace.
    Confirm your regex pattern will not do this to avoid this issue.

    To utilize regex flags, use inline flags.

    Args:
        doc: Doc object to search over.
        regex_str: A string to be compiled to regex,
            or the key name of a predefined regex pattern.
        partial: Whether partial matches should be extended
            to existing span boundaries in doc or not, i.e.
            the regex only matches part of a token or span.
            Default is True.
        predef: Whether regex should be interpreted
            as a key to a predefined regex pattern or not.
            Default is False.
            The included regexes are:
            "dates"
            "times"
            "phones"
            "phones_with_exts"
            "links"
            "emails"
            "ips"
            "ipv6s"
            "prices"
            "hex_colors"
            "credit_cards"
            "btc_addresses"
            "street_addresses"
            "zip_codes"
            "po_boxes"
            "ssn_number".

    Returns:
        A list of span start index and end index pairs as tuples.

    Raises:
        TypeError: If regex_str is not a string.

    Example:
        >>> import spacy
        >>> from spaczz.regex import RegexSearcher
        >>> nlp = spacy.blank("en")
        >>> searcher = RegexSearcher()
        >>> doc = nlp.make_doc("My phone number is (555) 555-5555.")
        >>> searcher.match(doc, "phones", predef=True)
        [(4, 10)]
    """
    if isinstance(regex_str, str):
        compiled_regex = self._config.parse_regex(regex_str, predef)
    else:
        raise TypeError(f"regex_str must be a str, not {type(regex_str)}.")
    matches = []
    chars_to_tokens = map_chars_to_tokens(doc)
    for match in compiled_regex.finditer(doc.text):
        start, end = match.span()
        span = doc.char_span(start, end)
        if span:
            matches.append(span)
        elif partial:
            start_token = chars_to_tokens.get(start)
            end_token = chars_to_tokens.get(end)
            # Compare against None explicitly: token index 0 is a valid
            # position, and a truthiness test would drop every partial match
            # that begins in the first token.
            if start_token is not None and end_token is not None:
                span = Span(doc, start_token, end_token + 1)
                matches.append(span)
    return [(found.start, found.end) for found in matches]
def __call__(self, text):
    """Convert a Stanza Doc to a spaCy Doc.

    Runs `self.snlp` on the text, aligns the Stanza tokens with the text
    (adding whitespace tokens where needed) and copies POS, tag, dependency,
    head, lemma and, when alignment allows, entity annotations.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    # Empty and whitespace-only input never reaches the Stanza pipeline.
    if not text:
        return Doc(self.vocab)
    elif text.isspace():
        return Doc(self.vocab, words=[text], spaces=[False])

    snlp_doc = self.snlp(text)
    text = snlp_doc.text
    snlp_tokens, snlp_heads = self.get_tokens_with_heads(snlp_doc)
    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    heads = []
    lemmas = []
    token_texts = [t.text for t in snlp_tokens]
    is_aligned = True
    try:
        words, spaces = self.get_words_and_spaces(token_texts, text)
    except ValueError:
        # Fall back to the bare Stanza tokens, each followed by a space.
        words = token_texts
        spaces = [True] * len(words)
        is_aligned = False
        warnings.warn(
            "Due to multiword token expansion or an alignment "
            "issue, the original text has been replaced by space-separated "
            "expanded tokens.",
            stacklevel=4,
        )
    # `offset` is decremented for every inserted whitespace word, so that
    # `i + offset` indexes the Stanza token matching word `i`.
    offset = 0
    for i, word in enumerate(words):
        # A whitespace word with no Stanza token left to compare against
        # (e.g. trailing whitespace) is a space token as well; without the
        # bounds check the lookup below raises IndexError.
        if word.isspace() and (
            i + offset >= len(snlp_tokens) or word != snlp_tokens[i + offset].text
        ):
            # insert a space token
            pos.append(self.vocab.strings.add("SPACE"))
            tags.append(self.vocab.strings.add("_SP"))
            deps.append(self.vocab.strings.add(""))
            lemmas.append(self.vocab.strings.add(word))

            # increment any heads left of this position that point beyond
            # this position to the right (already present in heads)
            for j in range(0, len(heads)):
                if j + heads[j] >= i:
                    heads[j] += 1

            # decrement any heads right of this position that point beyond
            # this position to the left (yet to be added from snlp_heads)
            for j in range(i + offset, len(snlp_heads)):
                if j + snlp_heads[j] < i + offset:
                    snlp_heads[j] -= 1

            # initial space tokens are attached to the following token,
            # otherwise attach to the preceding token
            if i == 0:
                heads.append(1)
            else:
                heads.append(-1)

            offset -= 1
        else:
            token = snlp_tokens[i + offset]
            assert word == token.text

            # Make sure all strings are in the vocabulary
            pos.append(self.vocab.strings.add(token.upos or ""))
            tags.append(self.vocab.strings.add(token.xpos or token.feats or ""))
            deps.append(self.vocab.strings.add(token.deprel or ""))
            heads.append(snlp_heads[i + offset])
            lemmas.append(self.vocab.strings.add(token.lemma or ""))

    attrs = [POS, TAG, DEP, HEAD]
    array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    ents = []
    for ent in snlp_doc.entities:
        ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
        ents.append(ent_span)
    # Entities are set only if the text is aligned and every entity produced
    # a usable span; otherwise none of them are set.
    if not is_aligned or not all(ents):
        warnings.warn(
            f"Can't set named entities because of multi-word token "
            f"expansion or because the character offsets don't map to "
            f"valid tokens produced by the Stanza tokenizer:\n"
            f"Words: {words}\n"
            f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
            stacklevel=4,
        )
    else:
        doc.ents = ents
    # Overwrite lemmas separately to prevent them from being overwritten by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array([LEMMA], lemma_array)
    if any(pos) or any(tags):
        doc.is_tagged = True
    if any(deps) or any(heads):
        doc.is_parsed = True
    return doc
def __call__(self, text):
    """Convert a Stanza Doc to a spaCy Doc.

    Runs `self.snlp` on the text, derives each token's trailing-space flag by
    walking through the text, and copies POS, tag, dependency, head, lemma
    and, when the offsets allow, entity annotations.

    text (unicode): The text to process.
    RETURNS (spacy.tokens.Doc): The spaCy Doc object.
    """
    snlp_doc = self.snlp(text) if text else Document("")
    text = snlp_doc.text
    tokens, heads = self.get_tokens_with_heads(snlp_doc)
    if not len(tokens):
        return Doc(self.vocab)
    words = []
    spaces = []
    pos = []
    tags = []
    deps = []
    lemmas = []
    # `offset` is the current position in `text`. The text is inspected by
    # index instead of re-slicing the remainder for every step, which copied
    # the rest of the text over and over.
    offset = 0
    text_length = len(text)
    is_aligned = self.check_aligned(text, tokens)
    for i, token in enumerate(tokens):
        if offset >= text_length:
            # text exhausted before all tokens were consumed
            break
        while offset < text_length and text[offset].isspace():
            # If we encounter leading whitespace, skip one character ahead
            offset += 1
        words.append(token.text)
        # Make sure all strings are in the vocabulary
        pos.append(self.vocab.strings.add(token.upos or ""))
        tags.append(self.vocab.strings.add(token.xpos or ""))
        deps.append(self.vocab.strings.add(token.deprel or ""))
        lemmas.append(self.vocab.strings.add(token.lemma or ""))
        offset += len(token.text)
        if i == len(tokens) - 1:
            spaces.append(False)
        elif not is_aligned:
            spaces.append(True)
        else:
            # The token has trailing whitespace unless the next token starts
            # right where this one ends.
            next_token = tokens[i + 1]
            spaces.append(not text.startswith(next_token.text, offset))
    attrs = [POS, TAG, DEP, HEAD]
    array = numpy.array(list(zip(pos, tags, deps, heads)), dtype="uint64")
    doc = Doc(self.vocab, words=words, spaces=spaces).from_array(attrs, array)
    ents = []
    for ent in snlp_doc.entities:
        ent_span = doc.char_span(ent.start_char, ent.end_char, ent.type)
        ents.append(ent_span)
    # Entities are set only if every one of them produced a usable span.
    if not all(ents):
        warnings.warn(
            f"Can't set named entities because the character offsets don't "
            f"map to valid tokens produced by the Stanza tokenizer:\n"
            f"Words: {words}\n"
            f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}",
            stacklevel=4,
        )
    else:
        doc.ents = ents
    # Overwrite lemmas separately to prevent them from being overwritten by spaCy
    lemma_array = numpy.array([[lemma] for lemma in lemmas], dtype="uint64")
    doc.from_array([LEMMA], lemma_array)
    if any(pos) and any(tags):
        doc.is_tagged = True
    if any(deps):
        doc.is_parsed = True
    return doc