def test_issue1242():
    nlp = English()
    doc = nlp("")
    assert len(doc) == 0
    docs = list(nlp.pipe(["", "hello"]))
    assert len(docs[0]) == 0
    assert len(docs[1]) == 1
def test_issue3449():
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
    text2 = "He gave the ball to I. Do you want to go to the movies with I?"
    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
    t1 = nlp(text1)
    t2 = nlp(text2)
    t3 = nlp(text3)
    assert t1[5].text == "I"
    assert t2[5].text == "I"
    assert t3[5].text == "I"
def test_issue3410():
    texts = ["Hello world", "This is a test"]
    nlp = English()
    matcher = Matcher(nlp.vocab)
    phrasematcher = PhraseMatcher(nlp.vocab)
    with pytest.deprecated_call():
        docs = list(nlp.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        list(matcher.pipe(docs, n_threads=4))
    with pytest.deprecated_call():
        list(phrasematcher.pipe(docs, n_threads=4))
def test_issue3468():
    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.is_sentenced
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
    assert new_doc.is_sentenced
    assert len(list(new_doc.sents)) == 1
def main():
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    rest_countries = RESTCountriesComponent(nlp)  # initialise component
    nlp.add_pipe(rest_countries)  # add it to the pipeline
    doc = nlp(u"Some text about Colombia and the Czech Republic")
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Doc has countries', doc._.has_country)  # Doc contains countries
    for token in doc:
        if token._.is_country:
            print(token.text, token._.country_capital, token._.country_latlng,
                  token._.country_flag)  # country data
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # entities
def test_issue1494():
    infix_re = re.compile(r"""[^a-z]""")
    test_cases = [
        ("token 123test", ["token", "1", "2", "3", "test"]),
        ("token 1test", ["token", "1test"]),
        ("hello...test", ["hello", ".", ".", ".", "test"]),
    ]

    def new_tokenizer(nlp):
        return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)

    nlp = English()
    nlp.tokenizer = new_tokenizer(nlp)
    for text, expected in test_cases:
        assert [token.text for token in nlp(text)] == expected
def main(text="Alphabet Inc. is the company behind Google.", *companies):
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    if not companies:  # set default companies if none are set via args
        companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
    component = TechCompanyRecognizer(nlp, companies)  # initialise component
    nlp.add_pipe(component, last=True)  # add last to the pipeline
    doc = nlp(text)
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Tokens', [t.text for t in doc])  # company names from the list are merged
    print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
    print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
    print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
def test_issue1506():
    def string_generator():
        for _ in range(10001):
            yield "It's sentence produced by that bug."
        for _ in range(10001):
            yield "I erase some hbdsaj lemmas."
        for _ in range(10001):
            yield "I erase lemmas."
        for _ in range(10001):
            yield "It's sentence produced by that bug."
        for _ in range(10001):
            yield "It's sentence produced by that bug."

    nlp = English()
    for i, d in enumerate(nlp.pipe(string_generator())):
        # We should run cleanup more than once to actually clean up the data.
        # The first run only marks strings as not hit.
        if i == 10000 or i == 20000 or i == 30000:
            gc.collect()
        for t in d:
            str(t.lemma_)
def test_issue1488():
    prefix_re = re.compile(r"""[\[\("']""")
    suffix_re = re.compile(r"""[\]\)"']""")
    infix_re = re.compile(r"""[-~\.]""")
    simple_url_re = re.compile(r"""^https?://""")

    def my_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=simple_url_re.match,
        )

    nlp = English()
    nlp.tokenizer = my_tokenizer(nlp)
    doc = nlp("This is a test.")
    for token in doc:
        assert token.text
def normalize_batch(p_iter, p_batch_size=1000, p_thread_count=5):
    """Normalize and tokenize strings.

    Args:
        p_iter (iter): iterator over strings to normalize and tokenize.
        p_batch_size (int): number of strings per batch.
        p_thread_count (int): number of threads to run.

    Returns:
        iter: iterator over the normalized and tokenized strings.
    """
    global NLP
    if not NLP:
        NLP = NlpEnglish(parser=False)
    output_iter = NLP.pipe(p_iter,
                           batch_size=p_batch_size,
                           n_threads=p_thread_count)
    for doc in output_iter:
        tokens = [str(w).strip().lower() for w in doc]
        yield ' '.join(tokens)
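# Hedged usage sketch for normalize_batch() above (not from the original source):
# it relies on the module-level NLP cache and NlpEnglish alias assumed by that
# function, and the sample strings below are made up for illustration.
if __name__ == "__main__":
    NLP = None  # reset the cache so normalize_batch() builds its own tokenizer
    for line in normalize_batch(["Hello, World!", "spaCy is great."], p_batch_size=2):
        print(line)  # e.g. "hello , world !"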
def identify_KEEL_LAID_in_text(text): nlp = English() doc = nlp(text) matcher = Matcher(nlp.vocab) tokens_in_doc_count = len(doc) # # START - spaCy patterns # matcher.add("KEEL_LAID", [[{ "LOWER": { "IN": ["kjølstrukk", "kjølstrukket"] } }]]) matcher.add("DATE", None, [{'IS_DIGIT': True, 'LENGTH': 4}]) # # END - spaCy patterns # result = [] for match_id, token_start, token_end in matcher(doc): match_id_as_string = nlp.vocab.strings[match_id] final_token_start = token_start final_token_end = token_end spacy_pattern_detection = doc[token_start:token_end] spacy_pattern_detection_as_lower_text = spacy_pattern_detection.text.lower( ) # # Expand? # if match_id_as_string == "DATE" and token_start > 0: # At this point, DATE is just a year string. Example: 2021 prev_word_1_token_number = token_start - 1 prev_word_1_token = doc[prev_word_1_token_number] if prev_word_1_token.text in ("januar", "februar", "mars", "april", "mai", "juni", "juli", "august", "september", "oktober", "november", "desember"): final_token_start = prev_word_1_token_number # expanding # Expand more? prev_word_2_token_number = token_start - 2 prev_word_2_token = doc[prev_word_2_token_number] prev_word_3_token_number = token_start - 3 prev_word_3_token = doc[prev_word_3_token_number] if prev_word_2_token.text == "." and is_int( prev_word_3_token.text): final_token_start = prev_word_3_token_number # expanding # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). # span = doc[final_token_start:final_token_end] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': match_id_as_string } result.append(identified_entity) # # Identify prefix or suffix # if final_token_start > 0: prev_word_1_token_number = final_token_start - 1 prev_word_1_token = doc[prev_word_1_token_number] if prev_word_1_token.text.lower() == "før": # Prefix detected. # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). # span = doc[ prev_word_1_token_number:final_token_start] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': "DATE_PREFIX" } result.append(identified_entity) if ((final_token_end + 1) < tokens_in_doc_count): next_word_1_token_number = final_token_end next_word_1_token = doc[next_word_1_token_number] next_word_2_token_number = final_token_end + 1 next_word_2_token = doc[next_word_2_token_number] if (next_word_1_token.text.lower() == "eller" and next_word_2_token.text.lower() == "senere"): # Suffix detected. # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). # span = doc[next_word_1_token_number:( next_word_1_token_number + 2)] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': "DATE_SUFFIX" } result.append(identified_entity) elif match_id_as_string == "KEEL_LAID": # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). 
# span = doc[final_token_start:final_token_end] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': match_id_as_string } result.append(identified_entity) return result
def test_overfitting_IO():
    # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
    nlp = English()
    tagger = nlp.add_pipe("tagger")
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert tagger.model.get_dim("nO") == len(TAGS)
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["tagger"] < 0.00001
    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    assert doc[0].tag_ == "N"
    assert doc[1].tag_ == "V"
    assert doc[2].tag_ == "J"
    assert doc[3].tag_ == "N"
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        assert doc2[0].tag_ == "N"
        assert doc2[1].tag_ == "V"
        assert doc2[2].tag_ == "J"
        assert doc2[3].tag_ == "N"
    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "I like green eggs.",
        "Here is another one.",
        "I eat ham.",
    ]
    batch_deps_1 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([TAG]) for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.to_array([TAG]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
    # Try to unlearn the first 'N' tag with negative annotation
    neg_ex = Example.from_dict(nlp.make_doc(test_text), {"tags": ["!N", "V", "J", "N"]})
    for i in range(20):
        losses = {}
        nlp.update([neg_ex], sgd=optimizer, losses=losses)
    # test the "untrained" tag
    doc3 = nlp(test_text)
    assert doc3[0].tag_ != "N"
def test_overfitting_IO(use_upper):
    # Simple test to try and quickly overfit the NER component
    nlp = English()
    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["ner"] < 0.00001
    # test the trained model
    test_text = "I like London."
    doc = nlp(test_text)
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
        ents2 = doc2.ents
        assert len(ents2) == 1
        assert ents2[0].text == "London"
        assert ents2[0].label_ == "LOC"
    # Ensure that the predictions are still the same, even after adding a new label
    ner2 = nlp2.get_pipe("ner")
    assert ner2.model.attrs["has_upper"] == use_upper
    ner2.add_label("RANDOM_NEW_LABEL")
    doc3 = nlp2(test_text)
    ents3 = doc3.ents
    assert len(ents3) == 1
    assert ents3[0].text == "London"
    assert ents3[0].label_ == "LOC"
    # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions
    texts = [
        "Just a sentence.",
        "Then one more sentence about London.",
        "Here is another one.",
        "I like London.",
    ]
    batch_deps_1 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([ENT_IOB]) for doc in nlp.pipe(texts)]
    no_batch_deps = [doc.to_array([ENT_IOB]) for doc in [nlp(text) for text in texts]]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
    # test that kb_id is preserved
    test_text = "I like London and London."
    doc = nlp.make_doc(test_text)
    doc.ents = [Span(doc, 2, 3, label="LOC", kb_id=1234)]
    ents = doc.ents
    assert len(ents) == 1
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"
    assert ents[0].kb_id == 1234
    doc = nlp.get_pipe("ner")(doc)
    ents = doc.ents
    assert len(ents) == 2
    assert ents[0].text == "London"
    assert ents[0].label_ == "LOC"
    assert ents[0].kb_id == 1234
    # ent added by ner has kb_id == 0
    assert ents[1].text == "London"
    assert ents[1].label_ == "LOC"
    assert ents[1].kb_id == 0
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn.functional import softmax

sys.path.append("../")
from model.modeling_classification import BertForSequenceClassification, BertForTokenClassification
from model.tokenization import BertTokenizer

logger = logging.getLogger(__name__)

MaskedTokenInstance = collections.namedtuple("MaskedTokenInstance", ["tokens", "info"])
MaskedItemInfo = collections.namedtuple(
    "MaskedItemInfo",
    ["current_pos", "sen_doc_pos", "sen_right_id", "doc_ground_truth"])

nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)


class InputFeatures(object):
    def __init__(self, input_ids, input_mask, segment_ids=None):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids


class SC(nn.Module):
    def __init__(self, mask_rate, top_sen_rate,
def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
    were added using ner.add_label().
    """
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    ner.add_label("ANIMAL")
    nlp.begin_training()
    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
    assert ner.move_names == move_names
    nlp2 = English()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    nlp2.from_bytes(nlp.to_bytes())
    assert nlp2.get_pipe("ner").move_names == move_names
def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str): nlp = English() nlp_plain = English() # load both vec and hashvec tables with make_tempdir() as tmpdir: p = tmpdir / "test.hashvec" with open(p, "w") as fileh: fileh.write(floret_vectors_hashvec_str) convert_vectors(nlp, p, truncate=0, prune=-1, mode="floret") p = tmpdir / "test.vec" with open(p, "w") as fileh: fileh.write(floret_vectors_vec_str) convert_vectors(nlp_plain, p, truncate=0, prune=-1) word = "der" # ngrams: full padded word + padded 2-grams + padded 3-grams ngrams = nlp.vocab.vectors._get_ngrams(word) assert ngrams == ["<der>", "<d", "de", "er", "r>", "<de", "der", "er>"] # rows: 2 rows per ngram rows = OPS.xp.asarray( [ h % nlp.vocab.vectors.shape[0] for ngram in ngrams for h in nlp.vocab.vectors._get_ngram_hashes(ngram) ], dtype="uint32", ) assert_equal( OPS.to_numpy(rows), numpy.asarray([5, 6, 7, 5, 8, 2, 8, 9, 3, 3, 4, 6, 7, 3, 0, 2]), ) assert len(rows) == len(ngrams) * nlp.vocab.vectors.hash_count # all vectors are equivalent for plain static table vs. hash ngrams for word in nlp_plain.vocab.vectors: word = nlp_plain.vocab.strings.as_string(word) assert_almost_equal(nlp.vocab[word].vector, nlp_plain.vocab[word].vector, decimal=3) # every word has a vector assert nlp.vocab[word * 5].has_vector # n_keys is -1 for floret assert nlp_plain.vocab.vectors.n_keys > 0 assert nlp.vocab.vectors.n_keys == -1 # check that single and batched vector lookups are identical words = [s for s in nlp_plain.vocab.vectors] single_vecs = OPS.to_numpy( OPS.asarray([nlp.vocab[word].vector for word in words])) batch_vecs = OPS.to_numpy(nlp.vocab.vectors.get_batch(words)) assert_equal(single_vecs, batch_vecs) # an empty key returns 0s assert_equal( OPS.to_numpy(nlp.vocab[""].vector), numpy.zeros((nlp.vocab.vectors.shape[0], )), ) # an empty batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch([""])), numpy.zeros((1, nlp.vocab.vectors.shape[0])), ) # an empty key within a batch returns 0s assert_equal( OPS.to_numpy(nlp.vocab.vectors.get_batch(["a", "", "b"])[1]), numpy.zeros((nlp.vocab.vectors.shape[0], )), ) # the loaded ngram vector table cannot be modified # except for clear: warning, then return without modifications vector = list(range(nlp.vocab.vectors.shape[1])) orig_bytes = nlp.vocab.vectors.to_bytes(exclude=["strings"]) with pytest.warns(UserWarning): nlp.vocab.set_vector("the", vector) assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) with pytest.warns(UserWarning): nlp.vocab[word].vector = vector assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) with pytest.warns(UserWarning): nlp.vocab.vectors.add("the", row=6) assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) with pytest.warns(UserWarning): nlp.vocab.vectors.resize(shape=(100, 10)) assert orig_bytes == nlp.vocab.vectors.to_bytes(exclude=["strings"]) with pytest.raises(ValueError): nlp.vocab.vectors.clear() # data and settings are serialized correctly with make_tempdir() as d: nlp.vocab.to_disk(d) vocab_r = Vocab() vocab_r.from_disk(d) assert nlp.vocab.vectors.to_bytes() == vocab_r.vectors.to_bytes() assert_equal(OPS.to_numpy(nlp.vocab.vectors.data), OPS.to_numpy(vocab_r.vectors.data)) assert_equal(nlp.vocab.vectors._get_cfg(), vocab_r.vectors._get_cfg()) assert_almost_equal( OPS.to_numpy(nlp.vocab[word].vector), OPS.to_numpy(vocab_r[word].vector), decimal=6, )
#import packages
from gensim.summarization import keywords
from matplotlib import pyplot
import spacy
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))
from spacy.lang.en import English
nlp = English()
nlp.max_length = 10000000
import lyricsgenius
import pandas as pd
from textblob import TextBlob
import requests, json
import numpy as np
import matplotlib.ticker as plticker
import seaborn as sns
import time
from datetime import date
import matplotlib.dates as mdates

#define Genius API authentication
api_key = 'SfbYPF1AJ0-lnm6Km8_sIJoebvrIFfRyAGoZqxnRfkZIvP5ceGwBNZa4g0DHayP-'
genius = lyricsgenius.Genius(api_key)
BASE_URL = "https://api.genius.com"


def main():
    artist_name = ""
    while True:
def test_partial_links():
    # Test that having some entities on the doc without gold links doesn't crash
    TRAIN_DATA = [(
        "Russ Cochran his reprints include EC Comics.",
        {
            "links": {(0, 12): {"Q2146908": 1.0}},
            "entities": [(0, 12, "PERSON")],
            "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0],
        },
    )]
    nlp = English()
    vector_length = 3
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
        return mykb

    # Create and train the Entity Linker
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
        {"label": "ORG", "pattern": [{"LOWER": "ec"}, {"LOWER": "comics"}]},
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)
    # this will run the pipeline on the examples and shouldn't crash
    results = nlp.evaluate(train_examples)
    assert "PERSON" in results["ents_per_type"]
    assert "PERSON" in results["nel_f_per_type"]
    assert "ORG" in results["ents_per_type"]
    assert "ORG" not in results["nel_f_per_type"]
def test_overfitting_IO(): # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly nlp = English() vector_length = 3 assert "Q2146908" not in nlp.vocab.strings # Convert the texts to docs to make sure we have doc.ents set for the training examples train_examples = [] for text, annotation in TRAIN_DATA: doc = nlp(text) train_examples.append(Example.from_dict(doc, annotation)) def create_kb(vocab): # create artificial KB - assign same prior weight to the two russ cochran's # Q2146908 (Russ Cochran): American golfer # Q7381115 (Russ Cochran): publisher mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_alias( alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5], ) return mykb # Create the Entity Linker component and add it to the pipeline entity_linker = nlp.add_pipe("entity_linker", last=True) assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings assert "Q2146908" in entity_linker.kb.vocab.strings # train the NEL pipe optimizer = nlp.initialize(get_examples=lambda: train_examples) assert entity_linker.model.get_dim("nO") == vector_length assert entity_linker.model.get_dim( "nO") == entity_linker.kb.entity_vector_length for i in range(50): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) assert losses["entity_linker"] < 0.001 # adding additional components that are required for the entity_linker nlp.add_pipe("sentencizer", first=True) # Add a custom component to recognize "Russ Cochran" as an entity for the example training data patterns = [{ "label": "PERSON", "pattern": [{ "LOWER": "russ" }, { "LOWER": "cochran" }] }] ruler = nlp.add_pipe("entity_ruler", before="entity_linker") ruler.add_patterns(patterns) # test the trained model predictions = [] for text, annotation in TRAIN_DATA: doc = nlp(text) for ent in doc.ents: predictions.append(ent.kb_id_) assert predictions == GOLD_entities # Also test the results are still the same after IO with make_tempdir() as tmp_dir: nlp.to_disk(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir) assert nlp2.pipe_names == nlp.pipe_names assert "Q2146908" in nlp2.vocab.strings entity_linker2 = nlp2.get_pipe("entity_linker") assert "Q2146908" in entity_linker2.vocab.strings assert "Q2146908" in entity_linker2.kb.vocab.strings predictions = [] for text, annotation in TRAIN_DATA: doc2 = nlp2(text) for ent in doc2.ents: predictions.append(ent.kb_id_) assert predictions == GOLD_entities # Make sure that running pipe twice, or comparing to call, always amounts to the same predictions texts = [ "Russ Cochran captured his first major title with his son as caddie.", "Russ Cochran his reprints include EC Comics.", "Russ Cochran has been publishing comic art.", "Russ Cochran was a member of University of Kentucky's golf team.", ] batch_deps_1 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)] batch_deps_2 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)] no_batch_deps = [ doc.to_array([ENT_KB_ID]) for doc in [nlp(text) for text in texts] ] assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps)
def test_issue7065_b():
    # Test that the NEL doesn't crash when an entity crosses a sentence boundary
    nlp = English()
    vector_length = 3
    nlp.add_pipe("sentencizer")
    text = "Mahler 's Symphony No. 8 was beautiful."
    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
    links = {
        (0, 6): {"Q7304": 1.0, "Q270853": 0.0},
        (10, 24): {"Q7304": 0.0, "Q270853": 1.0},
    }
    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
    doc = nlp(text)
    example = Example.from_dict(doc, {
        "entities": entities,
        "links": links,
        "sent_starts": sent_starts,
    })
    train_examples = [example]

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="No. 8",
            entities=["Q270853"],
            probabilities=[1.0],
        )
        mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias(
            alias="Mahler",
            entities=["Q7304"],
            probabilities=[1.0],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    # Add a custom rule-based component to mimic NER
    patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "mahler"}]},
        {
            "label": "WORK",
            "pattern": [
                {"LOWER": "symphony"},
                {"LOWER": "no"},
                {"LOWER": "."},
                {"LOWER": "8"},
            ],
        },
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)
    # test the trained model - this should not throw E148
    doc = nlp(text)
    assert doc
def nlp():
    return English()
def test_no_gold_ents(patterns):
    # test that annotating components work
    TRAIN_DATA = [(
        "Kirby is pink",
        {
            "links": {(0, 5): {"Q613241": 1.0}},
            "entities": [(0, 5, "CHARACTER")],
            "sent_starts": [1, 0, 0],
        },
    )]
    nlp = English()
    vector_length = 3
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    # Create a ruler to mark entities
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)

    # Apply ruler to examples. In a real pipeline this would be an annotating component.
    for eg in train_examples:
        eg.predicted = ruler(eg.predicted)

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Kirby", ["Q613241"], [0.9])
        # Placeholder
        mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5])
        mykb.add_alias("pink", ["pink"], [0.9])
        return mykb

    # Create and train the Entity Linker
    entity_linker = nlp.add_pipe("entity_linker",
                                 config={"use_gold_ents": False},
                                 last=True)
    entity_linker.set_kb(create_kb)
    assert entity_linker.use_gold_ents is False

    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)

    # this will run the pipeline on the examples and shouldn't crash
    nlp.evaluate(train_examples)
def test_nel_to_bytes():
    # Test that a pipeline with an EL component can be converted to bytes
    def create_kb(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=3)
        kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
        return kb

    nlp_1 = English()
    nlp_1.add_pipe("ner")
    entity_linker_1 = nlp_1.add_pipe("entity_linker", last=True)
    entity_linker_1.set_kb(create_kb)
    assert entity_linker_1.kb.contains_alias("Russ Cochran")
    assert nlp_1.pipe_names == ["ner", "entity_linker"]
    nlp_bytes = nlp_1.to_bytes()

    nlp_2 = English()
    nlp_2.add_pipe("ner")
    nlp_2.add_pipe("entity_linker", last=True)
    assert nlp_2.pipe_names == ["ner", "entity_linker"]
    assert not nlp_2.get_pipe("entity_linker").kb.contains_alias("Russ Cochran")
    nlp_2 = nlp_2.from_bytes(nlp_bytes)
    kb_2 = nlp_2.get_pipe("entity_linker").kb
    assert kb_2.contains_alias("Russ Cochran")
    assert kb_2.get_vector("Q2146908") == [6, -4, 3]
    assert_almost_equal(kb_2.get_prior_prob(entity="Q2146908", alias="Russ Cochran"), 0.8)
def __init__(self):
    self.nlp = English()
def create_spacy_tokenizer():
    nlp = English()
    sentencizer = nlp.create_pipe('sentencizer')
    nlp.add_pipe(sentencizer)
response = sound_stuff.start_transcription_job(job_name, object_url, 'mp3')
print(response)
print('...done')

print('Waiting on transcription task...')
sound_stuff.wait_for_transaction_job(job_name)
print('...done')

print("Loading Text File...")
text = sound_stuff.load_transcript_from_job(job_name)
print("...done")

print("Extracting sentences...")
from spacy.lang.en import English
nlp = English()
sbd = nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)
doc = nlp(text)
sentences = [sentence.text for sentence in doc.sents]
print("...done")
file_utils.write_json('sentences.json', sentences, 3)

print("Loading summarizer...")
summarizer = models.get_summarizer_model()
print("...done")

print("Summarizing...")
summary_indices = {}
from spacy.lang.en import English

nlp = English()

# Import the Doc class
from ____ import ____

# Desired text: "spaCy is cool!"
words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]

# Create a Doc from the words and spaces
doc = ____(____, words=words, spaces=spaces)
print(doc.text)
def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    new_nlp.add_pipe(nlp.create_pipe("textcat"))
    new_nlp.from_bytes(bytes_data)
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import json

with open("exercises/countries.json") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/country_text.txt") as f:
    TEXT = f.read()

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", None, *patterns)

# Create a doc and find matches in it
doc = nlp(TEXT)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")

    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + [span]

    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    print(span_root_head.text, "-->", span.text)
def getSentences(text):
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return [sent.string.strip() for sent in document.sents]
def read_corpus(path,
                path_to_translations=None,
                path_to_translatable_ids=None,
                path_to_generated_questions=None):
    translations = {}
    translate_count = 0
    if path_to_translations is not None:
        # INITIALIZE TOKENIZER FOR TRANSLATIONS
        # import sys
        # sys.setdefaultencoding("utf-8")
        import spacy
        nlp = spacy.load('en')
        from spacy.lang.en import English
        tokenizer = English().Defaults.create_tokenizer(nlp)

        def fun_proc(t):
            t = tokenizer(u'{}'.format(t.decode("utf-8")))
            t = ' '.join([str(i) for i in t]).lower().strip()
            return t

        # STORE TRANSLATIONS IN A DICT
        fopen = gzip.open if path_to_translations.endswith(".gz") else open
        with fopen(path_to_translations) as f:
            for l in f:
                if len(l.strip()) > 0:
                    i, t, b = l.strip().split('\t')
                    i, t = i.strip(), t.strip()
                    translations[i] = fun_proc(t)

    if path_to_translatable_ids is not None:
        # READ TRANSLATABLE IDS AND POP ALL THE OTHER IDS FROM TRANSLATIONS
        translatables = []
        with open(path_to_translatable_ids, 'r') as fid:
            for l in fid:
                if len(l.strip()) > 0:
                    i = l.strip().split()[0]
                    translatables.append(i)
        translations_keys = translations.keys()
        for i in translations_keys:
            if i not in translatables:
                translations.pop(i)

    raw_corpus = {}
    # AR edit.
    # We add all the generated titles as additional items with id <orig-id>_qgen
    # Later we imply a truth label for the pairs (<orig-id>, <orig-id>_qgen)
    if path_to_generated_questions is not None:
        with open(path_to_generated_questions) as f:
            for l in f:
                qid, dist, q = l.strip().split('\t')
                key = '{}_qgen'.format(qid)
                assert (key not in raw_corpus)
                raw_corpus[key] = q.strip().split(), []
        print('Read {} generated questions'.format(len(raw_corpus)))

    empty_cnt = 0
    fopen = gzip.open if path.endswith(".gz") else open
    with fopen(path) as fin:
        for line in fin:
            id, title, body = line.split("\t")
            if len(title) == 0:
                print(id)
                empty_cnt += 1
                continue
            if id in translations:
                translate_count += 1
                title = translations[id]
            title = title.strip().split()
            body = body.strip().split()
            raw_corpus[id] = (title, body)
    say("{} empty titles ignored.\n".format(empty_cnt))
    say("{} titles translated.\n".format(translate_count))
    return raw_corpus
def load_sentencizer_and_tokenizer():
    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    tokenizer = nlp.Defaults.create_tokenizer(nlp)
    return nlp, tokenizer
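# Hedged usage sketch for the helper above (not from the original source); it uses
# the spaCy 2.x API seen throughout this file, and the sample text is made up.
nlp, tokenizer = load_sentencizer_and_tokenizer()
doc = nlp("This is one sentence. This is another.")
sentences = [sent.text for sent in doc.sents]  # sentence boundaries from the sentencizer
tokens = [t.text for t in tokenizer("This is one sentence.")]  # tokenization only, no pipeline
print(sentences, tokens)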
def create_api_response_for_post_identify_build_date_in_text_service_english_chapter_input( title_dictionary, forward_filtered_result_with_only_the_things_we_are_looking_for): nlp = English() forward_result = [] ### TEMP AREA temp_detection_dictionary = {} temp_check_before_reset = {} ### for line in forward_filtered_result_with_only_the_things_we_are_looking_for: # new line and temp reset temp_detection_dictionary.clear() temp_check_before_reset.clear() # Get NLP data from line text_service_url = line['title'] text = line['text'] ents = line['ents'] # discovered enteties in the line last_index_number_of_ents = len(ents) - 1 doc = nlp(text) # Get metadata from URL result_text_service_url = get_data_from_text_service_item_url( text_service_url) metadata_from_url = {} if "regulation_year" in result_text_service_url: metadata_from_url['regulation_year'] = result_text_service_url[ 'regulation_year'] if "regulation_month" in result_text_service_url: metadata_from_url['regulation_month'] = result_text_service_url[ 'regulation_month'] if "regulation_day" in result_text_service_url: metadata_from_url['regulation_day'] = result_text_service_url[ 'regulation_day'] if "regulation_id" in result_text_service_url: metadata_from_url['regulation_id'] = result_text_service_url[ 'regulation_id'] if "chapter_number" in result_text_service_url: metadata_from_url['chapter_number'] = result_text_service_url[ 'chapter_number'] if "section_number" in result_text_service_url: metadata_from_url['section_number'] = result_text_service_url[ 'section_number'] if "part_number" in result_text_service_url: metadata_from_url['part_number'] = result_text_service_url[ 'part_number'] if "sub_part_number" in result_text_service_url: metadata_from_url['sub_part_number'] = result_text_service_url[ 'sub_part_number'] # add chapter_title and section_title if "chapter_title" in title_dictionary: metadata_from_url['chapter_title'] = title_dictionary[ 'chapter_title'] if "section_title_in_dictionary" in title_dictionary: section_title_dictionary = title_dictionary[ 'section_title_in_dictionary'] if text_service_url in section_title_dictionary: metadata_from_url['section_title'] = section_title_dictionary[ text_service_url] # For each ent in line for ent_index_number, ent in enumerate(ents): ent_label = ent['label'] ent_start = ent['start'] ent_end = ent['end'] ent_text = text[ ent_start: ent_end] # same as: doc[ent_token_span.start:ent_token_span.end] ent_doc = nlp(ent_text) words_in_doc_count = len(ent_doc) ent_token_span = doc.char_span(ent_start, ent_end) ent_token_span_start = ent_token_span.start ent_token_span_end = ent_token_span.end #print(ent_text + " - " + ent_label + " (" + str(ent_token_span_start) + ":" + str(ent_token_span_end) + ")") # # Statment builder # if ent_label == "WATER_VESSEL": if "START_detected" not in temp_detection_dictionary: temp_detection_dictionary["START_detected"] = True else: # restart with new term temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() temp_detection_dictionary["START_detected"] = True elif ent_label == "CONSTRUCT": if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" not in temp_detection_dictionary): temp_detection_dictionary["CONSTRUCT_detected"] = True else: # reset temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() elif ent_label == "DATE_PREFIX": if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_PREFIX_value" not 
in temp_detection_dictionary and "DATE_value_1" not in temp_detection_dictionary and "DATE_SEPARATOR_value" not in temp_detection_dictionary and "DATE_value_2" not in temp_detection_dictionary): temp_detection_dictionary["DATE_PREFIX_value"] = ent_text else: # reset temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() elif ent_label == "DATE": if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_value_1" not in temp_detection_dictionary and "DATE_value_1_token_end" not in temp_detection_dictionary): temp_detection_dictionary["DATE_value_1"] = ent_text temp_detection_dictionary[ "DATE_value_1_token_end"] = ent_token_span_end elif ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_value_1" in temp_detection_dictionary and "DATE_SEPARATOR_value" in temp_detection_dictionary and "DATE_value_2" not in temp_detection_dictionary): temp_detection_dictionary['DATE_value_2'] = ent_text # because this is the last value in a statment: temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() else: # reset temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() elif ent_label == "DATE_SEPARATOR": if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_value_1" in temp_detection_dictionary and "DATE_value_1_token_end" in temp_detection_dictionary and "DATE_SEPARATOR_value" not in temp_detection_dictionary): # Q: Is the separator the next term after value 1? if temp_detection_dictionary[ "DATE_value_1_token_end"] == ent_token_span_start: # A: Yes, this separator is the first word after value 1 temp_detection_dictionary[ "DATE_SEPARATOR_value"] = ent_text else: # reset # A: No. Reject value and reset. temp_check_before_reset = dict( temp_detection_dictionary) temp_detection_dictionary.clear() else: # reset temp_check_before_reset = dict(temp_detection_dictionary) temp_detection_dictionary.clear() # # Statment concluder # Q: Do we have what we need to build a statment? # # The statment builder have restarted. # Check what we have for a statment before continuing. 
if len(temp_check_before_reset) > 0: # If we have a double value statement if ("START_detected" in temp_check_before_reset and "CONSTRUCT_detected" in temp_check_before_reset and "DATE_value_1" in temp_check_before_reset and "DATE_SEPARATOR_value" in temp_check_before_reset and "DATE_value_2" in temp_check_before_reset): detection_with_url_metadata = dict(metadata_from_url) if "DATE_PREFIX_value" in temp_check_before_reset: detection_with_url_metadata[ "date_context"] = temp_check_before_reset[ "DATE_PREFIX_value"] detection_with_url_metadata[ "date_value_1"] = temp_check_before_reset[ "DATE_value_1"] detection_with_url_metadata[ "date_separator"] = temp_check_before_reset[ "DATE_SEPARATOR_value"] detection_with_url_metadata[ "date_value_2"] = temp_check_before_reset[ "DATE_value_2"] forward_result.append(detection_with_url_metadata) # If we have a single value statment elif ("START_detected" in temp_check_before_reset and "CONSTRUCT_detected" in temp_check_before_reset and "DATE_value_1" in temp_check_before_reset): detection_with_url_metadata = dict(metadata_from_url) if "DATE_PREFIX_value" in temp_check_before_reset: detection_with_url_metadata[ "date_context"] = temp_check_before_reset[ "DATE_PREFIX_value"] detection_with_url_metadata[ "date_value_1"] = temp_check_before_reset[ "DATE_value_1"] forward_result.append(detection_with_url_metadata) temp_check_before_reset.clear() # Conclude on current detections if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_value_1" in temp_detection_dictionary and "DATE_SEPARATOR_value" in temp_detection_dictionary and "DATE_value_2" in temp_detection_dictionary): # we have a full statment. # add and reset. detection_with_url_metadata = dict(metadata_from_url) if "DATE_PREFIX_value" in temp_detection_dictionary: detection_with_url_metadata[ "date_context"] = temp_detection_dictionary[ "DATE_PREFIX_value"] detection_with_url_metadata[ "date_value_1"] = temp_detection_dictionary["DATE_value_1"] detection_with_url_metadata[ "date_separator"] = temp_detection_dictionary[ "DATE_SEPARATOR_value"] detection_with_url_metadata[ "date_value_2"] = temp_detection_dictionary["DATE_value_2"] forward_result.append(detection_with_url_metadata) temp_detection_dictionary.clear() else: # get next ent next_ent_index_number = ent_index_number + 1 next_ent_label = "" if next_ent_index_number <= last_index_number_of_ents: next_ent = ents[next_ent_index_number] next_ent_label = next_ent["label"] # Q: Do we have enough for a new statment? if ("START_detected" in temp_detection_dictionary and "CONSTRUCT_detected" in temp_detection_dictionary and "DATE_value_1" in temp_detection_dictionary): # A: Yes, we have enough for a new statment. # Is the next ent relevant? if ("DATE_SEPARATOR_value" not in temp_detection_dictionary and next_ent_label == "DATE_SEPARATOR"): continue # we want the next ent elif ("DATE_SEPARATOR_value" in temp_detection_dictionary and "DATE_value_2" not in temp_detection_dictionary): continue # we know that the next value is a date else: # add the statment and move on detection_with_url_metadata = dict(metadata_from_url) if "DATE_PREFIX_value" in temp_detection_dictionary: detection_with_url_metadata[ "date_context"] = temp_detection_dictionary[ "DATE_PREFIX_value"] detection_with_url_metadata[ "date_value_1"] = temp_detection_dictionary[ "DATE_value_1"] forward_result.append(detection_with_url_metadata) temp_detection_dictionary.clear() return forward_result
import json
from spacy.matcher import Matcher
from spacy.lang.en import English

with open("exercises/iphone.json") as f:
    TEXTS = json.loads(f.read())

nlp = English()
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
matcher.add("GADGET", None, pattern1, pattern2)

TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)

print(*TRAINING_DATA, sep="\n")
def test_tagger_requires_labels():
    nlp = English()
    nlp.add_pipe("tagger")
    with pytest.raises(ValueError):
        nlp.initialize()
from gensim import corpora
import gensim
import nltk
import string
import spacy
import en_core_web_sm
import pandas as pd  # assumed import: pd.read_csv is used below but was missing here

filepath = 'Enter filepath'
filename = 'Enter filename'
dataframe = pd.read_csv(filepath + filename)
print(len(dataframe))

import re
from spacy.lang.en import English

parser = English()


#Creating tokens by removing stopwords, punctuation using SpaCy
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    stop_list = [
        '<', '>', '</i>', '<i>', '<b>', '</b>', '=', '<i', '<b', '</i',
        '</b', '<sub>', '</sub>', '<sub'
    ]
    parser.Defaults.stop_words.update(stop_list)
    #print(tokens)
    for token in tokens:
        if token.orth_.isspace():
            continue
def test_beam_overfitting_IO(neg_key):
    # Simple test to try and quickly overfit the Beam NER component
    nlp = English()
    beam_width = 16
    beam_density = 0.0001
    config = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "incorrect_spans_key": neg_key,
    }
    ner = nlp.add_pipe("beam_ner", config=config)
    train_examples = []
    for text, annotations in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    optimizer = nlp.initialize()
    # run overfitting
    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["beam_ner"] < 0.0001
    # test the scores from the beam
    test_text = "I like London"
    docs = [nlp.make_doc(test_text)]
    beams = ner.predict(docs)
    entity_scores = ner.scored_ents(beams)[0]
    assert entity_scores[(2, 3, "LOC")] == 1.0
    assert entity_scores[(2, 3, "PERSON")] == 0.0
    assert len(nlp(test_text).ents) == 1
    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        docs2 = [nlp2.make_doc(test_text)]
        ner2 = nlp2.get_pipe("beam_ner")
        beams2 = ner2.predict(docs2)
        entity_scores2 = ner2.scored_ents(beams2)[0]
        assert entity_scores2[(2, 3, "LOC")] == 1.0
        assert entity_scores2[(2, 3, "PERSON")] == 0.0
    # Try to unlearn the entity by using negative annotations
    neg_doc = nlp.make_doc(test_text)
    neg_ex = Example(neg_doc, neg_doc)
    neg_ex.reference.spans[neg_key] = [Span(neg_doc, 2, 3, "LOC")]
    neg_train_examples = [neg_ex]
    for i in range(20):
        losses = {}
        nlp.update(neg_train_examples, sgd=optimizer, losses=losses)
    # test the "untrained" model
    assert len(nlp(test_text).ents) == 0
    with open(args.jsonlines_path, 'r') as f:
        lines = f.readlines()
    docs = [json.loads(line) for line in lines]
    tensor_examples, stored_info = data_processor.get_tensor_examples_from_custom_input(docs)
    predicted_clusters, _, _ = runner.predict(model, tensor_examples)
    if args.output_path:
        with open(args.output_path, 'w') as f:
            for i, doc in enumerate(docs):
                doc['predicted_clusters'] = predicted_clusters[i]
                f.write(json.dumps(doc) + "\n")
        # print(f'Saved prediction in {args.output_path}')
else:
    # Interactive input
    model.to(model.device)
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    while True:
        input_str = str(input('Input document:'))
        bert_tokenizer, spacy_tokenizer = data_processor.tokenizer, nlp
        doc = get_document_from_string(input_str, args.seg_len, bert_tokenizer, nlp)
        tensor_examples, stored_info = data_processor.get_tensor_examples_from_custom_input([doc])
        predicted_clusters, _, _ = runner.predict(model, tensor_examples)
        subtokens = util.flatten(doc['sentences'])
        # print('---Predicted clusters:')
        for cluster in predicted_clusters[0]:
            mentions_str = [' '.join(subtokens[m[0]:m[1]+1]) for m in cluster]
            mentions_str = [m.replace(' ##', '') for m in mentions_str]
            mentions_str = [m.replace('##', '') for m in mentions_str]
            # print(mentions_str)  # Print out strings
import sys
import datetime
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import English
import pandas as pd
import numpy as np
import re
from stop_words import STOP_WORDS

nlp = spacy.load('en')
tokenizer = English().Defaults.create_tokenizer(nlp)
data = pd.read_csv('Data.csv', encoding='latin1')
data = data['Text'][0:10]
s = datetime.datetime.now()

# remove stop words USING REGEX
# remove_list = re.compile('[^a-zA-z0-9@ :\.\/]')
# remove_list2 = re.compile('(\n|\.$|(\.?=\s+)|(:(?!\/\/)))')
# remove_list3 = re.compile('(\s)(the|this|that|there|to|is|are|am|on|in|out|do|a|an|be|just|from|with|so|as|just|for|by|â€Â|)(?!\w)' )
# #txt = ' '.join(re.sub("[0-9]+","NUM",txt).split()
# def remove_stop_word(data):
#     l = len(data)
#     for index in range(0,l):
#         data.loc[index] = re.sub(remove_list, '', (data.loc[index]).lower())
#         data.loc[index] = re.sub(remove_list2, '', (data.loc[index]))
from spacy.lang.en import English

nlp = English()
people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

# Create a list of patterns for the PhraseMatcher
patterns = list(nlp.pipe(people))
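# A hedged sketch (not from the original source) of how these patterns would
# typically be fed to a PhraseMatcher, using the same spaCy 2.x matcher.add()
# signature as the COUNTRY example above; the sample sentence is made up.
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
matcher.add("PERSON", None, *patterns)
doc = nlp("Angela Merkel met David Bowie backstage.")
print([doc[start:end].text for match_id, start, end in matcher(doc)])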
def identify_build_date_in_text(text): nlp = English() doc = nlp(text) matcher = Matcher(nlp.vocab) # # START - spaCy patterns # # WATER_VESSEL water_vessel_pattern = [{"LOWER": {"IN": ["vessels"]}}] matcher.add("WATER_VESSEL", None, water_vessel_pattern) # DATE matcher.add("DATE", None, [{'IS_DIGIT': True, 'LENGTH': 4}]) # CONSTRUCT matcher.add("CONSTRUCT", None, [{"LOWER": {"IN": ["constructed"]}}]) # # END - spaCy patterns # result = [] for match_id, token_start, token_end in matcher(doc): match_id_as_string = nlp.vocab.strings[match_id] final_token_start = token_start final_token_end = token_end if match_id_as_string == "DATE" and token_start > 0: # At this point, DATE is just a year string. Example: 2021 # Expand DATE? prev_word_1_token_number = token_start - 1 prev_word_1_token = doc[prev_word_1_token_number] if prev_word_1_token.text.lower() in ("january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"): final_token_start = prev_word_1_token_number # expanding # Expand more? prev_word_2_token_number = token_start - 2 prev_word_2_token = doc[prev_word_2_token_number] if is_int(prev_word_2_token.text): final_token_start = prev_word_2_token_number # expanding prev_word_on_date_token_number = final_token_start - 1 prev_word_on_date_token = doc[prev_word_on_date_token_number] # Does the DATE have a DATE_SEPARATOR? if prev_word_on_date_token.text in ("and", "to"): prev_word_on_date_char_span_start_number = prev_word_on_date_token.idx prev_word_on_date_char_span_end_number = prev_word_on_date_char_span_start_number + len( prev_word_on_date_token.text) identified_entity = { 'start': prev_word_on_date_char_span_start_number, 'end': prev_word_on_date_char_span_end_number, 'label': "DATE_SEPARATOR" } result.append(identified_entity) # Does the DATE have a DATE_SEPARATOR? elif prev_word_on_date_token.text in ("between", "before", "after"): # DATE_PREFIX detected prev_word_on_date_char_span_start_number = prev_word_on_date_token.idx prev_word_on_date_char_span_end_number = prev_word_on_date_char_span_start_number + len( prev_word_on_date_token.text) identified_entity = { 'start': prev_word_on_date_char_span_start_number, 'end': prev_word_on_date_char_span_end_number, 'label': "DATE_PREFIX" } result.append(identified_entity) # # convert token_span to char_span. # char_span is needed to display correctly withdisplacy.render(). # span = doc[final_token_start:final_token_end] span_char_start = span[0].idx span_char_end = span[-1].idx + len(span[-1].text) # return result identified_entity = { 'start': span_char_start, 'end': span_char_end, 'label': match_id_as_string } result.append(identified_entity) return result
#!/usr/bin/env python
# coding: utf-8

# In[101]:

import spacy

# In[123]:

nlp = spacy.load('en_core_web_sm')
from spacy.lang.en import English
nlp2 = English()

# In[124]:

def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print('No entities found')

# In[125]:

doc = nlp(u'Hi how are you?')

# In[126]: