def test_pipe_factories_language_specific():
    """Test that language sub-classes can have their own factories, with
    fallbacks to the base factories."""
    name1 = "specific_component1"
    name2 = "specific_component2"
    Language.component(name1, func=lambda: "base")
    English.component(name1, func=lambda: "en")
    German.component(name2, func=lambda: "de")

    assert Language.has_factory(name1)
    assert not Language.has_factory(name2)
    assert English.has_factory(name1)
    assert not English.has_factory(name2)
    assert German.has_factory(name1)
    assert German.has_factory(name2)

    nlp = Language()
    assert nlp.create_pipe(name1)() == "base"
    with pytest.raises(ValueError):
        nlp.create_pipe(name2)
    nlp_en = English()
    assert nlp_en.create_pipe(name1)() == "en"
    with pytest.raises(ValueError):
        nlp_en.create_pipe(name2)
    nlp_de = German()
    assert nlp_de.create_pipe(name1)() == "base"
    assert nlp_de.create_pipe(name2)() == "de"
def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    new_nlp.add_pipe(nlp.create_pipe("textcat"))
    new_nlp.from_bytes(bytes_data)
class SpacyTokenizer:
    def __init__(self, lang='en'):
        """Construct a spaCy-based tokenizer by loading the spaCy pipeline."""
        if lang != 'en' and lang != "ru2":
            raise Exception(
                "spaCy tokenizer currently only supports the 'en' and 'ru2' pipelines."
            )
        try:
            import spacy
            from spacy.lang.en import English
        except ImportError:
            raise ImportError(
                "spaCy 2.0+ is used but not installed on your machine. "
                "Go to https://spacy.io/usage for installation instructions."
            )
        # Create a Tokenizer with the default settings for English
        # including punctuation rules and exceptions
        if lang == "ru2":
            self.nlp = spacy.load('ru2')
        else:
            self.nlp = English()
        # by default spaCy uses the dependency parser for sentence splitting;
        # add a sentencizer so sentence splitting is fast and rule-based
        sentencizer = self.nlp.create_pipe('sentencizer')
        if lang == "ru2":
            # the loaded ru2 pipeline may already contain a parser,
            # so run the sentencizer first
            self.nlp.add_pipe(sentencizer, first=True)
        else:
            self.nlp.add_pipe(sentencizer)

    def tokenize(self, text):
        """Tokenize a document with the spaCy tokenizer and wrap the results
        into a Doc object.

        Note: `doc` refers to the package's document module (providing
        Document, TEXT, MISC, START_CHAR, END_CHAR), imported at module level.
        """
        if not isinstance(text, str):
            raise Exception("Must supply a string to the spaCy tokenizer.")
        spacy_doc = self.nlp(text)
        sentences = []
        for sent in spacy_doc.sents:
            tokens = []
            for tok in sent:
                token_entry = {
                    doc.TEXT: tok.text,
                    doc.MISC: f"{doc.START_CHAR}={tok.idx}|{doc.END_CHAR}={tok.idx + len(tok.text)}"
                }
                tokens.append(token_entry)
            sentences.append(tokens)
        return doc.Document(sentences, text)
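# Minimal usage sketch for SpacyTokenizer above (not part of the original code):
# it assumes spaCy 2.x is installed and that the surrounding package's `doc`
# module (Document, TEXT, MISC, ...) is importable, so it is illustrative only.
if __name__ == "__main__":
    tokenizer = SpacyTokenizer(lang='en')
    document = tokenizer.tokenize("Hello world. This is a second sentence.")
    print(document)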
def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
    assert "sentencizer" not in new_nlp.pipe_names
    assert "entity_ruler" in new_nlp.pipe_names
def test_issue4267():
    """Test that running an entity_ruler after ner gives consistent results."""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()
    assert "ner" in nlp.pipe_names

    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2

    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names

    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2
def get_sentences(self):
    # Returns sentences from text
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))  # updated
    doc = nlp(self.text)
    sentences = [sent.string.strip() for sent in doc.sents]
    return sentences
def sentencize(self, input_string):
    """Produces a list of sentences"""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(input_string)
    sentences = [s.text.strip() for s in doc.sents if s.text.strip() != '']
    return sentences
def createTextChunks(self, longString):
    import spacy
    from spacy.lang.en import English

    # Break into sentences after coref
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    # Chunk into sentences
    doc = nlp(longString, disable=['ner'])

    # ************* COREF BASE ********************
    # Load your usual SpaCy model (one of SpaCy English models)
    # nlp = spacy.load('en_core_web_sm')
    # Add COREF
    # neuralcoref.add_to_pipe(nlp)
    # Perform parallel COREF for each sentence from above
    # coref_sentences = nlp.pipe([s.text[:4999] for s in doc.sents], disable=['ner'])

    # limiting the chunk length - after testing I find that rows and rows of
    # table data that are not sentences are what doesn't get chunked.
    # Hence forcing a manual chunk - there will be loss of information (TODO)
    # return [s._.coref_resolved[:4999] for s in coref_sentences]
    return [s.text[:4999] for s in doc.sents]
def test_issue5137():
    class MyComponent(object):
        name = "my_component"

        def __init__(self, nlp, **cfg):
            self.nlp = nlp
            self.categories = cfg.get("categories", "all_categories")

        def __call__(self, doc):
            pass

        def to_disk(self, path, **kwargs):
            pass

        def from_disk(self, path, **cfg):
            pass

    Language.factories["my_component"] = lambda nlp, **cfg: MyComponent(nlp, **cfg)
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("my_component"))
    assert nlp.get_pipe("my_component").categories == "all_categories"

    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        nlp2 = spacy.load(tmpdir, categories="my_categories")
        assert nlp2.get_pipe("my_component").categories == "my_categories"
def split_into_sentences(text: str) -> List[Span]:
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return list(
        filter(lambda s: not TextProcessingUtil.is_empty(s.text.strip()),
               document.sents))
def train(data_dir, save_dir):
    # load the Huggingface config, tokenizer, and model
    model_name = "clulab/roberta-timex-semeval"
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              config=config,
                                              use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                            config=config)

    # load the spacy sentence segmenter
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    # create a torch dataset from a directory of Anafora XML annotations and text files
    dataset = TimexDataset.from_texts(data_dir, nlp, tokenizer, config)

    # train and save the torch model
    trainer = Trainer(
        model=model,
        args=TrainingArguments(save_dir),
        train_dataset=dataset,
        data_collator=lambda features: dict(
            input_ids=torch.stack([f.input_ids for f in features]),
            attention_mask=torch.stack([f.attention_mask for f in features]),
            labels=torch.stack([f.label for f in features])))
    trainer.train()
    trainer.save_model()
    tokenizer.save_pretrained(save_dir)
def split_text_to_sentences(raw_text):
    nlp = English()
    nlp.max_length = 12306482
    nlp.add_pipe(nlp.create_pipe('sentencizer'))  # updated
    doc = nlp(raw_text)
    sentences = [sent.string.strip() for sent in doc.sents]
    return sentences
def predict(predict_dir, output_dir):
    # load the Huggingface config, tokenizer, and model
    model_name = "clulab/roberta-timex-semeval"
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              config=config,
                                              use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                            config=config)

    # load the spacy sentence segmenter
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    # create a torch dataset from a directory of text files
    dataset = TimexDataset.from_texts(predict_dir, nlp, tokenizer)

    # get predictions from the torch model
    trainer = Trainer(
        model=model,
        args=TrainingArguments("save_run/"),
        data_collator=lambda features: dict(
            input_ids=torch.stack([f.input_ids for f in features]),
            attention_mask=torch.stack([f.attention_mask for f in features])))
    predictions, _, _ = trainer.predict(dataset)

    # write the predictions in Anafora XML format
    write_anafora(output_dir, dataset, predictions, tokenizer, config)
def autocorrect_line(self, line):
    '''
    Takes in a string as input, tokenizes and sentence segments it with spacy,
    then returns the concatenated result of calling autocorrect_sentence on
    all of the resulting sentence objects
    '''
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(line)
    sents = list(doc.sents)
    punc = [s[-1] for s in sents]  # save end of sentence punctuation
    sents = [s[:-1] for s in sents]  # get rid of end of sentence punctuation
    text = []
    for i in range(len(sents)):
        if len(sents[i]) > 0:
            wordList = [t.text for t in sents[i]]
            wordList = [w.lower() for w in wordList]  # get rid of capitalization
            wordList = [
                ''.join(ch for ch in word if ch not in set(string.punctuation))
                for word in wordList
            ]
            wordList = list(filter(
                lambda x: x != "",
                wordList))  # get rid of things that only consisted of punc
            checked = self.autocorrect_sentence(wordList)
            checked[-1] += str(punc[i])  # replace punctuation at end
            checked[0] = checked[0][0].upper() + checked[0][1:]  # capitalize first character
            text.extend(checked)
    return text
def test_issue5458():
    # Test that the noun chunker does not generate overlapping spans
    # fmt: off
    words = ["In", "an", "era", "where", "markets", "have", "brought", "prosperity", "and", "empowerment", "."]
    vocab = Vocab(strings=words)
    deps = ["ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc", "conj", "punct"]
    pos = ["ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ", "NOUN", "PUNCT"]
    heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
    # fmt: on
    en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps)
    en_doc.noun_chunks_iterator = noun_chunks

    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
    nlp = English()
    merge_nps = nlp.create_pipe("merge_noun_chunks")
    merge_nps(en_doc)
def test_issue6177():
    """Test that after fixing the random seed, the results of the pipeline are truly identical"""
    # NOTE: no need to transform this code to v3 when 'master' is merged into 'develop'.
    # A similar test exists already for v3: test_issue5551
    # This is just a backport
    results = []
    for i in range(3):
        fix_random_seed(0)
        nlp = English()
        example = (
            "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
            {"cats": {"Labe1": 1.0, "Label2": 0.0, "Label3": 0.0}},
        )
        textcat = nlp.create_pipe("textcat")
        nlp.add_pipe(textcat)
        for label in set(example[1]["cats"]):
            textcat.add_label(label)
        nlp.begin_training()

        # Store the result of each iteration
        result = textcat.model.predict([nlp.make_doc(example[0])])
        results.append(list(result[0]))

    # All results should be the same because of the fixed seed
    assert len(results) == 3
    assert results[0] == results[1]
    assert results[0] == results[2]
def get_sentences(self):
    original_review = nltk.tokenize.treebank.TreebankWordDetokenizer().detokenize(self.X)
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp(original_review)
    sentences = [sent.string.strip() for sent in doc.sents]
    return sentences
def bulk_make_annotation_json(fns=[]):
    archive = load_archive(get_srl_model())
    predictor = SemanticRoleLabelerPredictor.\
        from_archive(archive, "semantic-role-labeling")
    jsons = []
    for fn in fns:
        lines = [l.strip() for l in open(fn)]
        # Look for a year in the first line
        date = get_date_from_string(lines[0])
        if date:
            # If the first line is a year, then ignore that line
            # and get the rest of the doc as a string
            text = ' '.join(lines[1:])
        else:
            text = ' '.join(lines)
        # Get spacy doc
        nlp = English()
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        doc = nlp(text)
        # Get sentencized text, and json format for AllenNLP
        sentences, json_sentences = doc2json(doc)
        srl_sents = predictor.predict_batch_json(json_sentences)
        # print(annotations2json(fn, sentences, srl_sents))
        jsons.append(annotations2json(fn, date, sentences, srl_sents))
    return jsons
def sent_segment(self, txt):
    """
    sentence tokenization

    Parameters:
        txt : text to tokenize into sentences

    Returns:
        list of sentences
    """
    # Load English tokenizer, tagger, parser, NER and word vectors
    nlp = English()

    # A simple pipeline component to allow custom sentence boundary detection logic
    # that doesn't require the dependency parse. It splits on punctuation by default
    sbd = nlp.create_pipe('sentencizer')

    # Add the component to the pipeline
    nlp.add_pipe(sbd)

    # nlp is used to create documents with linguistic annotations.
    doc = nlp(txt)

    # create list of sentence tokens
    sents_list = []
    for sent in doc.sents:
        sents_list.append(sent.text)
    return sents_list
class SpacyService(object):
    def __init__(self):
        spacy_model = Env.get_value(Env.SPACY_MODEL)
        if spacy_model == 'english':
            self.nlp = English()
        else:
            self.nlp = spacy.load(spacy_model)
        self.nlp.add_pipe(self.hashtag_pipe)
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))

    def hashtag_pipe(self, doc):
        '''Inspired by https://github.com/explosion/spaCy/issues/503'''
        i = 0
        while i < len(doc) - 1:
            token = doc[i]
            if token.text == '#':
                if re.match(r'^\w+$', str(doc[i + 1])):
                    with doc.retokenize() as retokenizer:
                        retokenizer.merge(doc[i:i + 2])
            i += 1
        return doc

    def tokenizer(self, text: str):
        return self.nlp(text)

    def sentencizer(self, text: str):
        return self.nlp(text).sents
class SentenceHandler(object):
    def __init__(self, lang='en'):
        if lang == "fr":
            self.nlp = French()
        else:
            self.nlp = English()
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))

    def process(self, body: str, min_length: int = 40, max_length: int = 600) -> List[str]:
        """
        Processes the content sentences.

        :param body: The raw string body to process
        :param min_length: Minimum length that the sentences must be
        :param max_length: Max length that the sentences must fall under
        :return: Returns a list of sentences.
        """
        doc = self.nlp(body)
        return [
            c.string.strip() for c in doc.sents
            if max_length > len(c.string.strip()) > min_length
        ]

    def __call__(self, body: str, min_length: int = 40, max_length: int = 600) -> List[str]:
        return self.process(body, min_length, max_length)
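# Minimal usage sketch for SentenceHandler above (not part of the original code):
# it assumes spaCy 2.x, where Span.string and create_pipe()/add_pipe(component)
# are available; the sample text is illustrative only.
handler = SentenceHandler(lang='en')
summary_sentences = handler(
    "This first sentence is long enough to clear the forty character minimum. "
    "The second sentence is also comfortably above that minimum length."
)
print(summary_sentences)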
def test_issue3880():
    """Test that `nlp.pipe()` works when an empty string ends the batch.

    Fixed in v7.0.5 of Thinc.
    """
    texts = ["hello", "world", "", ""]
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("parser"))
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.get_pipe("parser").add_label("dep")
    nlp.get_pipe("ner").add_label("PERSON")
    nlp.get_pipe("tagger").add_label("NN")
    nlp.begin_training()
    for doc in nlp.pipe(texts):
        pass
class Sentencizer:
    def __init__(self):
        self.nlp = English()
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))

    def split(self, text: str) -> List[str]:
        return [str(sent) for sent in self.nlp(text).sents]
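# Minimal usage sketch for the Sentencizer wrapper above (illustrative, not part
# of the original code; assumes spaCy 2.x — in spaCy 3.x the equivalent setup
# would be nlp.add_pipe("sentencizer")).
splitter = Sentencizer()
print(splitter.split("This is a sentence. This is another one."))
# roughly: ['This is a sentence.', 'This is another one.']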
def test_train_empty():
    """Test that training an empty text does not throw errors."""
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
        ("", {"entities": []}),
    ]

    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PERSON")
    nlp.add_pipe(ner, last=True)
    nlp.begin_training()
    for itn in range(2):
        losses = {}
        batches = minibatch(train_data)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                losses=losses,
            )
def predict(predict_dir, output_dir):
    model_name = "clulab/roberta-timex-semeval"
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              config=config,
                                              use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                            config=config)
    tokenizer = AutoTokenizer.from_pretrained(
        '/content/drive/My Drive/SFDA/Time/model/task2_model_APM/1/',
        use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(
        '/content/drive/My Drive/SFDA/Time/model/task2_model_APM/1/')
    print(3)
    # model.load_state_dict(torch.load('/content/drive/My Drive/SFDA/Time/model/model_wl_r0_e2.pt'))

    # load the spacy sentence segmenter
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    # create a torch dataset from a directory of text files
    dataset = TimexDataset.from_texts(predict_dir, nlp, tokenizer)

    # get predictions from the torch model
    trainer = Trainer(
        model=model,
        args=TrainingArguments("save_run/"),
        data_collator=lambda features: dict(
            input_ids=torch.stack([f.input_ids for f in features]),
            attention_mask=torch.stack([f.attention_mask for f in features])))
    predictions, _, _ = trainer.predict(dataset)

    # write the predictions in Anafora XML format
    write_anafora(output_dir, dataset, predictions, tokenizer, config)
def test_sentence_tokenization(self):
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    doc = nlp("This is a sentence. This is another one.")
    sentences = [sent.text for sent in doc.sents]
    self.assertEqual(len(sentences), 2)
    self.assertEqual(sentences[0], "This is a sentence.")
def preprocessing(raw_clinical_note_file,
                  sent_parsing=True,
                  num_of_sen=100,
                  num_of_sen_len=25):
    with open(raw_clinical_note_file, 'r') as file:
        raw_clinical_note = file.read()

    # set the tokenizer: retain only alphanumeric
    tokenizer = RegexpTokenizer(r'\w+')  # original

    if sent_parsing:
        ## First: sentence tokenisation
        nlp = English()  # just the language with no model
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)  # rule-based sentencizer: .?!
        nlp.add_pipe(set_custom_boundaries)  # add custom rules: \n\n
        # see https://spacy.io/usage/linguistic-features#sbd
        doc = nlp(raw_clinical_note)

        tokens = []
        for i, sent_tokens in enumerate(doc.sents):
            ## Second: tokenisation same as in the original CAML-MIMIC step,
            ## applied to the tokens in each sentence
            list_token_str = [
                t.lower() for t in tokenizer.tokenize(sent_tokens.text)
                if not t.isnumeric()
            ]
            ## Third: add all the tokens in all sentences together, with a dot
            ## as the sentence split sign
            if len(list_token_str) != 0:
                tokens = tokens + list_token_str + ['.']
        clinical_note_tokenised = ' '.join(tokens)

        ## Fourth: combine short sentences (length below the threshold)
        clinical_note_tokenised_combined = short_sentence_combined_with_previous_one(
            clinical_note_tokenised, length_threshold=10)

        ## Fifth: padding to num_of_sen (default 100) sentences and
        ## num_of_sen_len (default 25) tokens per sentence
        sentences = clinical_note_tokenised_combined.split(".")
        sen_n = len(sentences)
        padded_clinical_note = ""
        for i in range(num_of_sen):
            if i + 1 <= sen_n:  # i starts from 0
                padded_clinical_note = padded_clinical_note.strip() + " " + pad(sentences[i], num_of_sen_len)
            else:
                padded_clinical_note = padded_clinical_note.strip() + " " + pad("", num_of_sen_len)
        return padded_clinical_note
    else:
        # directly tokenise each word in the document:
        # tokenize, lowercase and remove numerics
        tokens = [
            t.lower() for t in tokenizer.tokenize(raw_clinical_note)
            if not t.isnumeric()
        ]
        preprocessed_clinical_note = '"' + ' '.join(tokens) + '"'
        return preprocessed_clinical_note
class ModelProcessor(object):
    def __init__(self,
                 model='bert-large-uncased',
                 hidden: int = -2,
                 reduce_option: str = 'mean',
                 greedyness: float = 0.45):
        self.model = BertParent(model)
        self.hidden = hidden
        self.reduce_option = reduce_option
        self.nlp = English()
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
        neuralcoref.add_to_pipe(self.nlp, greedyness=greedyness)

    def process_content_sentences(self, body: str, min_length=40, max_length=600) -> List[str]:
        doc = self.nlp(body)._.coref_resolved
        doc = self.nlp(doc)
        return [c.string.strip() for c in doc.sents
                if len(c.string.strip()) > min_length and len(c.string.strip()) < max_length]

    @abstractmethod
    def run_clusters(self, content: List[str], ratio=0.2, algorithm='kmeans',
                     use_first: bool = True) -> List[str]:
        raise NotImplementedError("Must Implement run_clusters")

    def run(self, body: str, ratio: float = 0.2, min_length: int = 40, max_length: int = 600,
            use_first: bool = True, algorithm='kmeans') -> str:
        sentences = self.process_content_sentences(body, min_length, max_length)
        if sentences:
            sentences = self.run_clusters(sentences, ratio, algorithm, use_first)
        return ' '.join(sentences)

    def __call__(self, body: str, ratio: float = 0.2, min_length: int = 40, max_length: int = 600,
                 use_first: bool = True, algorithm='kmeans') -> str:
        return self.run(body, ratio, min_length, max_length)
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    # add ner pipe
    ner1 = nlp1.add_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.initialize()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # Add the label explicitly. Previously we didn't require this.
    ner1.add_label("MY_ORG")
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        config = {}
        ner2 = nlp1.create_pipe("ner", config=config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
def get_sentences(text):
    nlp = English()
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    doc = nlp(text)
    return list(doc.sents)
def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
    were added using ner.add_label().
    """
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    ner.add_label("ANIMAL")
    nlp.begin_training()
    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
    assert ner.move_names == move_names
    nlp2 = English()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    nlp2.from_bytes(nlp.to_bytes())
    assert nlp2.get_pipe("ner").move_names == move_names
def test_issue3449():
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
    t1 = nlp(text1)
    t2 = nlp(text2)
    t3 = nlp(text3)
    assert t1[5].text == "I"
    assert t2[5].text == "I"
    assert t3[5].text == "I"
def test_issue3468():
    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.is_sentenced
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
    assert new_doc.is_sentenced
    assert len(list(new_doc.sents)) == 1