def get_spans(word_seqs):
    """Return one full-length span (``doc[:]``) per word sequence.

    All docs are created against a single ``Vocab`` instance that is
    constructed inside this function and shared by every sequence.
    """
    shared_vocab = Vocab()
    spans = []
    for words in word_seqs:
        doc = Doc(shared_vocab, words=words)
        spans.append(doc[:])
    return spans
print('time_taken: {}'.format(time.time() - t_0))

# Load the word -> count mapping.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# read word_count.p files produced by a trusted run of this pipeline.
with open(data_dir + '/word_count.p', 'rb') as handle:
    d = pickle.load(handle)

# (word, count) pairs ordered by descending count.
sort_list = sorted(d.items(), key=lambda item: item[1], reverse=True)

nlp_lg = spacy.load('en_core_web_md')
print(nlp_lg("i'm").vector)

# Run only the tokenizer of the loaded pipeline over every word and keep the
# ``.vector`` of each result, in the same order as ``sort_list``.
emb_list = nlp_lg.tokenizer.pipe([s[0] for s in sort_list])
vec_list = [a.vector for a in emb_list]

# Build a blank English pipeline whose vocab holds exactly the counted words,
# then attach the collected vector to each word.
nlp = spacy.blank('en')
nlp.vocab = Vocab(strings=[s[0] for s in sort_list])
for word, vec in zip(sort_list, vec_list):
    nlp.vocab.set_vector(word[0], vec)

nlp.to_disk(data_dir + '/spacy_tok_combined_30080')
def vocab():
    """Return a newly constructed ``Vocab`` built with default arguments."""
    fresh_vocab = Vocab()
    return fresh_vocab
def wp(name):
    """Return ``PyTT_WordPiecer.from_pretrained`` for the model called ``name``.

    The word piecer is created against a new ``Vocab`` instance.
    """
    fresh_vocab = Vocab()
    return PyTT_WordPiecer.from_pretrained(fresh_vocab, pytt_name=name)
path_to_db = "/media/norpheo/mySQL/db/ssorc" # nlp_model = "en_core_web_sm_nertrained_v3" nlp_model = "en_wa_v2" path_to_mlgenome = os.path.join(path_to_db, "mlgenome", nlp_model) if not os.path.isdir(path_to_mlgenome): print(f"Create Directory {path_to_mlgenome}") os.mkdir(path_to_mlgenome) with open(os.path.join(path_to_mlgenome, "ml_acronyms.pickle"), "rb") as handle: acronyms = pickle.load(handle) path_to_annotations = os.path.join(path_to_db, "annotations_version", nlp_model) vocab = Vocab().from_disk(os.path.join(path_to_annotations, "spacy.vocab")) infoDF = pd.read_pickle(os.path.join(path_to_annotations, 'info_db.pandas')) window_size = 3 mentions = list() unique_mentions = list() um_set = dict() ml_acronyms = dict() lt = LoopTimer(update_after=100, avg_length=1000, target=len(infoDF)) for abstract_id, row in infoDF.iterrows(): file_path = os.path.join(path_to_annotations, f"{abstract_id}.spacy") doc = Doc(vocab).from_disk(file_path) for sentence in doc.sents:
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    """Regression test for #2153.

    Loading a vocab from its own serialized bytes must leave the string store
    with exactly as many entries as the strings it was created from.
    """
    vocab = Vocab(strings=strings)
    serialized = vocab.to_bytes()
    vocab.from_bytes(serialized)
    expected_size = len(strings)
    assert len(vocab.strings) == expected_size
def test_oracle_dev_sentence(vocab, arc_eager):
    """Check the oracle transition sequence for one gold-annotated sentence.

    The gold parse is given as a table of ``word dep head_word`` rows. Every
    dependency label is registered with ``arc_eager`` as a left (2) and a
    right (3) action, and the oracle sequence for the resulting ``Example``
    must equal ``expected_transitions``.
    """
    # One token per row: word, dependency label, head word.
    words_deps_heads = """
        Rolls-Royce nn Inc.
        Motor nn Inc.
        Cars nn Inc.
        Inc. nsubj said
        said ROOT said
        it nsubj expects
        expects ccomp said
        its poss sales
        U.S. nn sales
        sales nsubj steady
        to aux steady
        remain cop steady
        steady xcomp expects
        at prep steady
        about quantmod 1,200
        1,200 num cars
        cars pobj at
        in prep steady
        1990 pobj in
        . punct said
        """
    expected_transitions = [
        "S",  # Shift "Rolls-Royce"
        "S",  # Shift 'Motor'
        "S",  # Shift 'Cars'
        "L-nn",  # Attach 'Cars' to 'Inc.'
        "L-nn",  # Attach 'Motor' to 'Inc.'
        "L-nn",  # Attach 'Rolls-Royce' to 'Inc.'
        "S",  # Shift "Inc."
        "L-nsubj",  # Attach 'Inc.' to 'said'
        "S",  # Shift 'said'
        "S",  # Shift 'it'
        "L-nsubj",  # Attach 'it' to 'expects'
        "R-ccomp",  # Attach 'expects' to 'said'
        "S",  # Shift 'its'
        "S",  # Shift 'U.S.'
        "L-nn",  # Attach 'U.S.' to 'sales'
        "L-poss",  # Attach 'its' to 'sales'
        "S",  # Shift 'sales'
        "S",  # Shift 'to'
        "S",  # Shift 'remain'
        "L-cop",  # Attach 'remain' to 'steady'
        "L-aux",  # Attach 'to' to 'steady'
        "L-nsubj",  # Attach 'sales' to 'steady'
        "R-xcomp",  # Attach 'steady' to 'expects'
        "R-prep",  # Attach 'at' to 'steady'
        "S",  # Shift 'about'
        "L-quantmod",  # Attach "about" to "1,200"
        "S",  # Shift "1,200"
        "L-num",  # Attach "1,200" to "cars"
        "R-pobj",  # Attach "cars" to "at"
        "D",  # Reduce "cars"
        "D",  # Reduce "at"
        "R-prep",  # Attach "in" to "steady"
        "R-pobj",  # Attach "1990" to "in"
        "D",  # Reduce "1990"
        "D",  # Reduce "in"
        "D",  # Reduce "steady"
        "D",  # Reduce "expects"
        "R-punct",  # Attach "." to "said"
        "D",  # Reduce "."
        "D",  # Reduce "said"
    ]

    # Split the table into rows of fields, ignoring blank lines.
    rows = []
    for raw_line in words_deps_heads.strip().split("\n"):
        stripped = raw_line.strip()
        if stripped:
            rows.append(stripped.split())
    gold_words = [word for word, _dep, _head in rows]
    gold_deps = [dep for _word, dep, _head in rows]
    head_words = [head for _word, _dep, head in rows]
    # Heads are written as words; convert each to the index of its first
    # occurrence in the sentence.
    gold_heads = [gold_words.index(head_word) for head_word in head_words]

    for dep in gold_deps:
        arc_eager.add_action(2, dep)  # Left
        arc_eager.add_action(3, dep)  # Right

    doc = Doc(Vocab(), words=gold_words)
    example = Example.from_dict(doc, {"heads": gold_heads, "deps": gold_deps})
    oracle_ids = arc_eager.get_oracle_sequence(example, _debug=False)
    oracle_names = [arc_eager.get_class_name(action_id) for action_id in oracle_ids]
    assert oracle_names == expected_transitions
# Entity labels accepted as the second member of a pair when VERBS_TO_OTHER is
# enabled (the first member must then be labelled "V").
_ARGUMENT_ENTITY_LABELS = ("Z", "TOOL", "ATTR", "TEMP", "DAUER", "ZEITP", "PRÄP")


def _is_candidate_pair(x1, x2, ents_dict):
    """Return True if the spans starting at tokens x1 and x2 form a candidate pair.

    ``ents_dict`` maps a span's start token to ``(label, start_token)``. The
    decision is driven by the module-level settings VERBS_TO_OTHER,
    DIFF_FRONT_BACK, FRONT, BACK and TOKEN_LENGTH.
    """
    if VERBS_TO_OTHER and (
        ents_dict[x1][0] != "V"
        or ents_dict[x2][0] not in _ARGUMENT_ENTITY_LABELS
    ):
        return False
    if DIFF_FRONT_BACK:
        # Asymmetric window: up to BACK tokens when x2 is not after x1,
        # up to FRONT tokens when x2 is after x1.
        offset = x1 - x2
        return 0 <= offset <= BACK or -FRONT <= offset < 0
    # Symmetric window on the distance between the two start tokens.
    return abs(ents_dict[x1][1] - ents_dict[x2][1]) <= TOKEN_LENGTH


def _detailed_label(label, ent_type):
    """Refine an annotated ARG0/ARG1/ARG label using the child's entity type.

    Returns ``(label, recognised)``. ``recognised`` is False when ``label`` is
    none of ARG0/ARG1/ARG, in which case the label is returned unchanged.
    """
    if label == "ARG0":
        mapping = MAP_LABELS_ARG0 if ent_type in ("Z", "TOOL") else MAP_LABELS_ARG
        return mapping[ent_type], True
    if label == "ARG1":
        mapping = MAP_LABELS_ARG1 if ent_type in ("Z", "TOOL") else MAP_LABELS_ARG
        return mapping[ent_type], True
    if label == "ARG":
        if ent_type == "Z":
            return "Arg0Z", True
        if ent_type == "TOOL":
            return "Arg1Tool", True
        return MAP_LABELS_ARG[ent_type], True
    return label, False


def main(json_loc: Path, train_file: Path, dev_file: Path, test_file: Path, test_split=0.189, train_split=0.709):
    """Creating the corpus from the Prodigy annotations.

    Reads the JSONL file at ``json_loc`` twice: once to count the accepted
    recipes, once to convert each accepted recipe into a ``Doc`` with
    entities and a ``doc._.rel`` dict of the form
    ``{(start1, start2): {label: 1.0 | 0.0, ...}, ...}``. Docs with at least
    one positive relation are assigned to train, then test, then dev
    (``train_split`` / ``test_split`` are fractions of the accepted recipes)
    and each split is written to disk as a ``DocBin``.

    Recipes that raise a ``KeyError`` during conversion are reported via
    ``msg.fail`` and skipped.
    """
    Doc.set_extension("rel", default={})
    vocab = Vocab()

    docs = {"train": [], "dev": [], "test": []}
    ids = {"train": set(), "dev": set(), "test": set()}
    count_all = {"train": 0, "dev": 0, "test": 0}
    count_pos = {"train": 0, "dev": 0, "test": 0}

    long_rel_count = 0  # annotated relations whose token pair is not a candidate pair
    # Relations labelled with something other than ARG0, ARG1 or ARG.
    # NOTE(review): this counter is maintained but never reported.
    error_count_rel = 0

    # Labels that receive an explicit 0.0 when not annotated for a pair.
    if DETAILED_ARGS:
        all_labels = (
            list(MAP_LABELS_ARG0.values())
            + list(MAP_LABELS_ARG1.values())
            + list(MAP_LABELS_ARG.values())
        )
    else:
        all_labels = list(MAP_LABELS_STANDARD.values())

    with json_loc.open("r", encoding="utf8") as jsonfile:
        length_training_data = sum(
            1 for line in jsonfile if json.loads(line)["answer"] == "accept"
        )
        msg.info(f"Number of accepted recipes: {length_training_data}")

    train_limit = round(train_split * length_training_data)
    test_limit = round(test_split * length_training_data)

    with json_loc.open("r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            example = json.loads(line)  # one recipe
            if example["answer"] != "accept":
                continue
            span_starts = set()
            neg = 0
            pos = 0
            try:
                # Parse the tokens: example["tokens"] is a list of dicts.
                words = [t["text"] for t in example["tokens"]]
                # Whether each token is followed by whitespace.
                spaces = [t["ws"] for t in example["tokens"]]
                doc = Doc(vocab, words=words, spaces=spaces)

                # Parse the entities: example["spans"] is a list of dicts.
                entities = []
                span_end_to_start = {}
                ents_dict = {}
                for span in example["spans"]:
                    # "start"/"end" are character offsets into the doc.
                    entity = doc.char_span(
                        span["start"], span["end"], label=span["label"]
                    )
                    # "token_start"/"token_end" are token indices.
                    span_end_to_start[span["token_end"]] = span["token_start"]
                    entities.append(entity)
                    span_starts.add(span["token_start"])
                    ents_dict[span["token_start"]] = (
                        span["label"],
                        span["token_start"],
                    )
                doc.ents = entities

                # Every candidate pair of span starts gets its own label dict.
                rels = {
                    (x1, x2): {}
                    for x1 in span_starts
                    for x2 in span_starts
                    if _is_candidate_pair(x1, x2, ents_dict)
                }

                # Positive relations.
                for relation in example["relations"]:
                    # The 'head' and 'child' annotations refer to the end
                    # token in the span, but we want the first token.
                    start = span_end_to_start[relation["head"]]
                    end = span_end_to_start[relation["child"]]
                    label = relation["label"]
                    if DETAILED_ARGS:
                        label, recognised = _detailed_label(
                            label, ents_dict[end][0]
                        )
                        if not recognised:
                            error_count_rel += 1
                    else:
                        label = MAP_LABELS_STANDARD[label]

                    try:
                        pair_labels = rels[(start, end)]
                    except KeyError:
                        # The relation is annotated, but its tokens are not a
                        # candidate pair (too far apart / not starting from a
                        # verb), so it is dropped.
                        long_rel_count += 1
                        continue
                    if label not in pair_labels:
                        pair_labels[label] = 1.0
                        pos += 1

                # The annotation is complete, so fill in zeros where the data
                # is missing. The keys of ``rels`` are exactly the candidate
                # pairs.
                for pair_labels in rels.values():
                    for label in all_labels:
                        if label not in pair_labels:
                            neg += 1
                            pair_labels[label] = 0.0
                doc._.rel = rels

                # Only keep documents with at least one positive case.
                if pos > 0:
                    recipe_id = example["_input_hash"]
                    if len(docs["train"]) < train_limit:
                        split = "train"
                    elif len(docs["test"]) < test_limit:
                        split = "test"
                    else:
                        split = "dev"
                    ids[split].add(recipe_id)
                    docs[split].append(doc)
                    count_pos[split] += pos
                    count_all[split] += pos + neg
            except KeyError as e:
                msg.fail(
                    f"Skipping doc because of key error: {e} in {example['_input_hash']}"
                )

    msg.info(
        f"{long_rel_count} relations have been cut because tokens are too far apart."
    )

    outputs = (
        ("train", train_file, "training"),
        ("dev", dev_file, "dev"),
        ("test", test_file, "test"),
    )
    for split, out_file, noun in outputs:
        docbin = DocBin(docs=docs[split], store_user_data=True)
        docbin.to_disk(out_file)
        msg.info(
            f"{len(docs[split])} {noun} recipes from {len(ids[split])} unique recipes, "
            f"{count_pos[split]}/{count_all[split]} pos instances."
        )
def __init__(
    self,
    vocab_path="",
    model_name="bert-base-cased",
    max_edit_dist=10,
    debug=False,
    performance=False,
):
    """To create an object for this class. It does not require any special

    Args:
        vocab_path (str, optional): Vocabulary file path to be used by the
            model. Defaults to "". When given, the words in that file are
            combined with the special tokens, "[unused..." entries, "##..."
            word pieces and single-character entries of the bundled
            data/vocab.txt. If loading fails, the tokenizer's own vocabulary
            is used instead and a warning is issued.
        model_name (str, optional): Pretrained BERT model name. Defaults to
            "bert-base-cased".
        max_edit_dist (int, optional): Maximum edit distance between two
            words. Defaults to 10.
        debug (bool, optional): This help prints logs as the data flows
            through the class. Defaults to False.
        performance (bool, optional): This is used to print the time taken
            by individual steps in spell check. Defaults to False.

    Raises:
        TypeError: If vocab_path is not a str, or debug / performance is not
            a bool.
        ValueError: If max_edit_dist cannot be converted to an int.
    """
    if (
        not isinstance(vocab_path, str)
        or not isinstance(debug, bool)
        or not isinstance(performance, bool)
    ):
        raise TypeError(
            "Please check datatype provided. vocab_path should be str,"
            " debug and performance should be bool"
        )
    try:
        parsed_max_edit_dist = int(float(max_edit_dist))
    except ValueError as err:
        raise ValueError(
            f"cannot convert {max_edit_dist} to int. Please provide a "
            f"valid integer "
        ) from err

    if vocab_path != "":
        try:
            # First open() for user specified word addition to vocab
            with open(vocab_path, encoding="utf8") as f:
                words = [line.strip() for line in f]

            # The below code adds the necessary words like numbers
            # /punctuations/tokenizer specific words like [PAD]/[
            # unused0]/##M
            current_path = os.path.dirname(__file__)
            vocab_path = os.path.join(current_path, "data", "vocab.txt")
            extra_token = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
            words.extend(extra_token)

            with open(vocab_path, encoding="utf8") as f:
                for line in f:
                    extra_token = line.strip()
                    if (
                        extra_token.startswith(("[unused", "##"))
                        or len(extra_token) == 1
                    ):
                        words.append(extra_token)
            if debug:
                debug_file_path = os.path.join(
                    current_path, "tests", "debugFile.txt"
                )
                # Explicit encoding: with the platform default, a vocab
                # containing characters the locale cannot encode would raise
                # here and the handler below would discard the custom vocab.
                with open(debug_file_path, "w+", encoding="utf8") as new_file:
                    new_file.write("\n".join(words))
                print("Final vocab at " + debug_file_path)
        except Exception as e:
            # Deliberate best effort: any failure falls back to the default
            # vocab instead of aborting construction.
            print(e)
            warnings.warn("Using default vocab")
            vocab_path = ""
            words = []

    self.max_edit_dist = parsed_max_edit_dist
    self.model_name = model_name
    self.BertTokenizer = AutoTokenizer.from_pretrained(self.model_name)

    if vocab_path == "":
        words = list(self.BertTokenizer.get_vocab().keys())
    self.vocab = Vocab(strings=words)
    logging.getLogger("transformers").setLevel(logging.ERROR)
    self.BertModel = AutoModelForMaskedLM.from_pretrained(self.model_name)
    self.mask = self.BertTokenizer.mask_token
    self.debug = debug
    self.performance = performance

    # Register the custom extensions once; "contextual_spellCheck" is used
    # as the marker that registration already happened.
    if not Doc.has_extension("contextual_spellCheck"):
        Doc.set_extension("contextual_spellCheck", default=True)
        Doc.set_extension("performed_spellCheck", default=False)

        Doc.set_extension("suggestions_spellCheck", default={})
        Doc.set_extension("outcome_spellCheck", default="")
        Doc.set_extension("score_spellCheck", default=None)

        Span.set_extension(
            "get_has_spellCheck", getter=self.span_require_spell_check
        )
        Span.set_extension(
            "score_spellCheck", getter=self.span_score_spell_check
        )

        Token.set_extension(
            "get_require_spellCheck", getter=self.token_require_spell_check
        )
        Token.set_extension(
            "get_suggestion_spellCheck",
            getter=self.token_suggestion_spell_check,
        )
        Token.set_extension(
            "score_spellCheck", getter=self.token_score_spell_check
        )
def test_issue600():
    """Assign a tag from a custom tag map to the only token of a doc."""
    tag_map = {"NN": {"pos": "NOUN"}}
    doc = Doc(Vocab(tag_map=tag_map), words=["hello"])
    first_token = doc[0]
    first_token.tag_ = "NN"
def test_issue743():
    """A token stored in a set must come back as the very same object."""
    doc = Doc(Vocab(), ["hello", "world"])
    token = doc[0]
    members = list({token})
    assert members[0] is token
def test_issue589():
    """A doc can be created from a vocab whose string store was frozen."""
    frozen_vocab = Vocab()
    frozen_vocab.strings.set_frozen(True)
    assert Doc(frozen_vocab, words=["whata"])
def __init__(self, vocab_path="", debug=False, performance=False):
    """To create an object for this class. It does not require any special

    Args:
        vocab_path (str, optional): Vocabulary file path to be used by the
            model. Defaults to "". When given, the words in that file are
            combined with the special tokens, "[unused..." entries, "##..."
            word pieces and single-character entries of the bundled
            data/vocab.txt. If loading fails, the bundled data/vocab.txt is
            used on its own and a warning is issued.
        debug (bool, optional): This help prints logs as the data flows
            through the class. Defaults to False.
        performance (bool, optional): This is used to print the time taken
            by individual steps in spell check. Defaults to False.

    Raises:
        TypeError: If vocab_path is not a str, or debug / performance is not
            a bool.
    """
    if (
        not isinstance(vocab_path, str)
        or not isinstance(debug, bool)
        or not isinstance(performance, bool)
    ):
        raise TypeError(
            "Please check datatype provided. vocab_path should be str, debug and performance should be bool"
        )

    if vocab_path != "":
        try:
            # First open() for user specified word addition to vocab
            with open(vocab_path, encoding="utf8") as f:
                words = [line.strip() for line in f]

            # The below code adds the necessary words like
            # numbers/punctuations/tokenizer specific words like
            # [PAD]/[unused0]/##M
            currentPath = os.path.dirname(__file__)
            vocab_path = os.path.join(currentPath, "data", "vocab.txt")
            extraToken = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
            words.extend(extraToken)

            with open(vocab_path, encoding="utf8") as f:
                for line in f:
                    extraToken = line.strip()
                    if (
                        extraToken.startswith(("[unused", "##"))
                        or len(extraToken) == 1
                    ):
                        words.append(extraToken)
            if debug:
                debugFilePath = os.path.join(currentPath, "tests", "debugFile.txt")
                # Explicit encoding: with the platform default, a vocab
                # containing characters the locale cannot encode would raise
                # here and the handler below would discard the custom vocab.
                with open(debugFilePath, "w+", encoding="utf8") as newFile:
                    newFile.write("\n".join(words))
                print("Final vocab at " + debugFilePath)
        except Exception as e:
            # Deliberate best effort: any failure falls back to the default
            # vocab instead of aborting construction.
            print(e)
            warnings.warn("Using default vocab")
            vocab_path = ""
            words = []

    if vocab_path == "":
        currentPath = os.path.dirname(__file__)
        vocab_path = os.path.join(currentPath, "data/vocab.txt")
        with open(vocab_path, encoding="utf8") as f:
            words = [line.strip() for line in f]

    self.vocab = Vocab(strings=words)
    self.BertTokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    self.BertModel = AutoModelWithLMHead.from_pretrained("bert-base-cased")
    self.mask = self.BertTokenizer.mask_token
    self.debug = debug
    self.performance = performance

    # Register the custom extensions once; "contextual_spellCheck" is used
    # as the marker that registration already happened.
    if not Doc.has_extension("contextual_spellCheck"):
        Doc.set_extension("contextual_spellCheck", default=True)
        Doc.set_extension("performed_spellCheck", default=False)

        # {originalToken-1:[suggestedToken-1,suggestedToken-2,..],
        #  originalToken-2:[...]}
        Doc.set_extension("suggestions_spellCheck", default={})
        Doc.set_extension("outcome_spellCheck", default="")
        Doc.set_extension("score_spellCheck", default=None)

        Span.set_extension("get_has_spellCheck", getter=self.span_require_spellCheck)
        Span.set_extension("score_spellCheck", getter=self.span_score_spellCheck)

        Token.set_extension("get_require_spellCheck", getter=self.token_require_spellCheck)
        Token.set_extension("get_suggestion_spellCheck", getter=self.token_suggestion_spellCheck)
        Token.set_extension("score_spellCheck", getter=self.token_score_spellCheck)
def test_doc_retokenize_spans_entity_merge_iob(en_vocab):
    """Check that entity IOB tags and types stay consistent after merging spans.

    Five scenarios are covered: entities set via ``doc.ents``, IOB supplied
    through merge attrs, merging without a parse, merging with a parse, and
    preservation of "B" when the merged span starts with "B".
    """
    # Test entity IOB stays consistent after merging
    letters = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=letters)
    doc.ents = [
        (doc.vocab.strings.add("ent-abc"), 0, 3),
        (doc.vocab.strings.add("ent-d"), 3, 4),
    ]
    for index, iob in enumerate(["B", "I", "I", "B"]):
        assert doc[index].ent_iob_ == iob
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    assert len(doc) == len(letters) - 1
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    # Test that IOB stays consistent with provided IOB
    letters = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=letters)
    with doc.retokenize() as retokenizer:
        merge_attrs = {"ent_type": "ent-abc", "ent_iob": 1}
        retokenizer.merge(doc[0:3], attrs=merge_attrs)
        retokenizer.merge(doc[3:5], attrs=merge_attrs)
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    # if no parse/heads, the first word in the span is the root and provides
    # default values
    letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    doc = Doc(Vocab(), words=letters)
    doc.ents = [
        (doc.vocab.strings.add("ent-de"), 3, 5),
        (doc.vocab.strings.add("ent-fg"), 5, 7),
    ]
    for index, iob in zip((3, 4, 5, 6), ("B", "I", "B", "I")):
        assert doc[index].ent_iob_ == iob
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # if there is a parse, span.root provides default values
    letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 0, 0, 0, 5, 0, 0]
    ents = ["O", "O", "O", "B-ent-de", "I-ent-de", "B-ent-fg", "I-ent-fg", "O", "O"]
    deps = ["dep"] * len(letters)
    for string in ("ent-de", "ent-fg", "dep"):
        en_vocab.strings.add(string)
    doc = Doc(en_vocab, words=letters, heads=heads, deps=deps, ents=ents)
    assert doc[2:4].root == doc[3]  # root of 'c d' is d
    assert doc[4:6].root == doc[4]  # root of 'e f' is e
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[2].ent_iob_ == "B"
    assert doc[2].ent_type_ == "ent-de"
    assert doc[3].ent_iob_ == "I"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-fg"

    # check that B is preserved if span[start] is B
    letters = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    heads = [0, 0, 3, 4, 0, 0, 5, 0, 0]
    ents = ["O", "O", "O", "B-ent-de", "I-ent-de", "B-ent-de", "I-ent-de", "O", "O"]
    deps = ["dep"] * len(letters)
    doc = Doc(en_vocab, words=letters, heads=heads, deps=deps, ents=ents)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[3:5])
        retokenizer.merge(doc[5:7])
    assert len(doc) == 7
    assert doc[3].ent_iob_ == "B"
    assert doc[3].ent_type_ == "ent-de"
    assert doc[4].ent_iob_ == "B"
    assert doc[4].ent_type_ == "ent-de"
def test_issue1807():
    """Test vocab.set_vector also adds the word to the vocab."""
    word = "hello"
    vocab = Vocab(vectors_name="test_issue1807")
    assert word not in vocab
    ones = numpy.ones((50,), dtype="f")
    vocab.set_vector(word, ones)
    assert word in vocab
from spacy.language import Language from whatlies.language import SpacyLanguage from whatlies.transformers import ( Transformer, Umap, Pca, Noise, AddRandom, Tsne, OpenTsne, Ivis, Normalizer, ) vocab = Vocab().from_disk("tests/custom_test_vocab/") words = list(vocab.strings) lang = SpacyLanguage(nlp=Language(vocab=vocab, meta={"lang": "en"})) emb = lang[words] transformers = [ Umap(2), Umap(3), Pca(2), Pca(3), Noise(0.1), Noise(0.01), AddRandom(n=4), AddRandom(n=1), lambda d: d | (d["man"] - d["woman"]), Tsne(2, n_iter=250),
def test_issue1967(label):
    """Call ``ner.moves.get_actions`` on a single gold entry carrying ``label``.

    The test passes as long as the call does not raise.
    """
    recognizer = EntityRecognizer(Vocab())
    gold_entry = ([0], ["word"], ["tag"], [0], ["dep"], [label])
    recognizer.moves.get_actions(gold_parses=[(None, [(gold_entry, None)])])
# Similarity of two documents
doc_similarity = doc1.similarity(doc2)
print(doc1, '<->', doc2, doc_similarity)

# Similarity of tokens and spans
french_fries = doc1[2:4]
burgers = doc1[5]
print(french_fries, '<->', burgers, french_fries.similarity(burgers))
print()

# Strings and their hashes can be looked up in both directions.
doc = nlp('I love coffee')
print(doc.vocab.strings['coffee'])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee'
print()

# Lexeme attributes of every word in the doc.
doc = nlp('I love coffee')
for word in doc:
    lexeme = doc.vocab[word.text]
    print(
        lexeme.text,
        lexeme.orth,
        lexeme.shape_,
        lexeme.prefix_,
        lexeme.suffix_,
        lexeme.is_alpha,
        lexeme.is_digit,
        lexeme.is_title,
        lexeme.lang_,
    )
print()

doc = nlp("I love coffee")  # Original Doc
print(doc.vocab.strings["coffee"])  # 3197928453018144401
print(doc.vocab.strings[3197928453018144401])  # 'coffee'

empty_doc = Doc(Vocab())  # New Doc with empty Vocab
# empty_doc.vocab.strings[3197928453018144401] will raise an error at this
# point, because the string has not been added to this vocab yet.

empty_doc.vocab.strings.add("coffee")  # Add "coffee" and generate hash
print(empty_doc.vocab.strings[3197928453018144401])  # 'coffee'

new_doc = Doc(doc.vocab)  # Create new doc with first doc's vocab
print(new_doc.vocab.strings[3197928453018144401])  # 'coffee'

# displacy.serve(doc, style='ent')
def test_serialize_vocab(en_vocab, text):
    """Round-trip a vocab through bytes (lookups excluded).

    The restored vocab must resolve the hash of ``text`` back to ``text`` and
    must serialize to the same bytes again.
    """
    text_hash = en_vocab.strings.add(text)
    original_bytes = en_vocab.to_bytes(exclude=["lookups"])
    restored = Vocab().from_bytes(original_bytes)
    assert restored.strings[text_hash] == text
    restored_bytes = restored.to_bytes(exclude=["lookups"])
    assert restored_bytes == original_bytes
def test_Example_from_dict_with_morphology(annots):
    """Each reference token's morph must equal the matching ``annots["morphs"]`` entry."""
    predicted = Doc(Vocab(), words=annots["words"])
    example = Example.from_dict(predicted, annots)
    for index, token in enumerate(example.reference):
        actual_morph = str(token.morph)
        assert actual_morph == annots["morphs"][index]
def test_doc_api_has_vector():
    """A doc whose only word has a vector in the vocab reports ``has_vector``."""
    vocab = Vocab()
    vocab.reset_vectors(width=2)
    kitten_vector = numpy.asarray([0.0, 2.0], dtype="f")
    vocab.set_vector("kitten", vector=kitten_vector)
    assert Doc(vocab, words=["kitten"]).has_vector
def test_Example_from_dict_basic():
    """``Example.from_dict`` exposes a ``Doc`` as both ``x`` and ``y``."""
    predicted = Doc(Vocab(), words=["hello", "world"])
    annotations = {"words": ["hello", "world"]}
    example = Example.from_dict(predicted, annotations)
    assert isinstance(example.x, Doc)
    assert isinstance(example.y, Doc)
def main(model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir.

    Args:
        model: Name or path passed to ``spacy.load``; the loaded pipeline
            must provide pretrained word vectors.
        output_dir: Optional directory for the KB and vocab. It is created,
            including missing parent directories, if it does not exist.
        n_iter: Number of epochs used to train the entity encoder.

    Raises:
        ValueError: If the loaded pipeline has no pretrained word vectors.
    """
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages."
        )

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data: ENTITIES maps an entity id to (description, frequency)
    entity_ids = []
    descriptions = []
    freqs = []
    for key, (desc, freq) in ENTITIES.items():
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create missing parents too and tolerate a directory that already
        # exists, instead of a separate exists() check followed by mkdir().
        output_dir.mkdir(parents=True, exist_ok=True)
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
def test_Example_from_dict_with_entities_invalid(annots):
    """``Example.from_dict`` must emit a ``UserWarning`` and leave the reference without entities."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.warns(UserWarning):
        example = Example.from_dict(predicted, annots)
    reference_ents = list(example.reference.ents)
    assert len(reference_ents) == 0
def component(vocab):
    """Return a ``Transformer`` built from a new ``Vocab`` and a ``DummyTransformer``."""
    # NOTE(review): the ``vocab`` argument is not used; a new Vocab is
    # created instead. Confirm this is intended.
    fresh_vocab = Vocab()
    dummy_model = DummyTransformer()
    return Transformer(fresh_vocab, dummy_model)
def test_Example_from_dict_with_entities_overlapping(annots):
    """``Example.from_dict`` must raise ``ValueError`` for these annotations."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
def vocab():
    """Return a ``Vocab`` whose NORM lexical attribute getter returns the string unchanged."""

    def identity_norm(string):
        return string

    return Vocab(lex_attr_getters={NORM: identity_norm})
def test_Example_from_dict_with_links_invalid(annots):
    """``Example.from_dict`` must raise ``ValueError`` for these annotations."""
    predicted = Doc(Vocab(), words=annots["words"])
    with pytest.raises(ValueError):
        Example.from_dict(predicted, annots)
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    for matcher_cls in (Matcher, PhraseMatcher):
        matcher = matcher_cls(Vocab())
        assert isinstance(matcher.vocab, Vocab)
def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and train the entity linker.
    The `vocab` should be the one used during creation of the KB.

    Args:
        kb_path: Location of the knowledge base, passed to ``kb.load_bulk``.
        vocab_path: Location of the vocab, passed to ``Vocab.from_disk``.
            Required even though it defaults to None.
        output_dir: Optional directory the trained pipeline is saved to. It
            is created, including missing parent directories, if needed.
        n_iter: Number of training iterations.

    Raises:
        ValueError: If ``vocab_path`` is None.
    """
    if vocab_path is None:
        # Fail with a clear message instead of an obscure error from
        # Vocab.from_disk(None).
        raise ValueError("vocab_path is required to load the vocab used during creation of the KB.")
    vocab = Vocab().from_disk(vocab_path)
    # create blank Language class with correct vocab
    nlp = spacy.blank("en", vocab=vocab)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "entity_linker" not in nlp.pipe_names:
        entity_linker = nlp.create_pipe("entity_linker")
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)
    else:
        entity_linker = nlp.get_pipe("entity_linker")
        kb = entity_linker.kb

    # make sure the annotated examples correspond to known identifiers in the knowledge base
    kb_ids = set(kb.get_entity_strings())  # set: one membership test per candidate id
    for text, annotation in TRAIN_DATA:
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print("Removed", kb_id, "from training because it is not in the KB.")
            # Replacing the value of an existing key keeps the dict size
            # unchanged, so this is safe while iterating over .items().
            annotation["links"][offset] = new_dict

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)  # shuffles the module-level list in place
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create missing parents too and tolerate a directory that already
        # exists, instead of a separate exists() check followed by mkdir().
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)