def test_dynamic_store_size():
    """Tests size of StringStore in vocab after creating doc"""

    me = hook.local_worker
    nlp = syfertext.load("en_core_web_lg", owner=me)

    # Check that no string is present in the store yet
    assert len(nlp.vocab.store) == 0

    doc = nlp("quick brown fox jumps")

    # Check that 4 strings have been added to the store
    assert len(nlp.vocab.store) == 4
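# --- Assumed module-level setup ---------------------------------------
# The pipeline tests in this module use `hook`, `me`, and `syfertext`
# without defining them here. A minimal sketch of the assumed preamble,
# mirroring the one that appears verbatim in the span and token test
# modules below:
import syft as sy
import torch
import syfertext

hook = sy.TorchHook(torch)
me = hook.local_worker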
def test_number_of_subpipelines_created_with_pipes_of_different_remote_values():
    """Test the number of subpipelines created when we use pipe
    components with different remote values.
    """

    nlp = syfertext.load("en_core_web_lg", owner=me)

    # Add the pipeline components to the SyferText pipeline
    nlp.add_pipe(noun_tagger, name="noun tagger", remote=True)
    nlp.add_pipe(verb_tagger, name="verb tagger", remote=False)
    nlp.add_pipe(pronoun_tagger, name="pronoun tagger", remote=False)
    nlp.add_pipe(adjective_tagger, name="adjective tagger", remote=True)

    # Note: The tokenizer is added by default with remote = True,
    # and adjacent pipes with the same remote value are grouped together
    # in a single dictionary. So nlp.subpipeline_templates should be
    #
    # nlp.subpipeline_templates = [
    #     {'remote': True, 'names': ['tokenizer', 'noun tagger']},
    #     {'remote': False, 'names': ['verb tagger', 'pronoun tagger']},
    #     {'remote': True, 'names': ['adjective tagger']},
    # ]

    remote_subpipelines = [s for s in nlp.subpipeline_templates if s["remote"]]
    local_subpipelines = [s for s in nlp.subpipeline_templates if not s["remote"]]

    # Assert subpipeline_templates contains 3 subpipelines,
    # two with remote = True and one with remote = False
    assert len(nlp.subpipeline_templates) == 3
    assert len(remote_subpipelines) == 2
    assert len(local_subpipelines) == 1

    # Assert the relative order of subpipelines in subpipeline_templates
    for subpipeline, remote_value in zip(nlp.subpipeline_templates, [True, False, True]):
        assert subpipeline["remote"] == remote_value

    # Make sure the subpipelines contain the correct number of pipes
    for subpipeline, num_pipes in zip(nlp.subpipeline_templates, [2, 2, 1]):
        assert len(subpipeline["names"]) == num_pipes

    # Make sure the subpipelines contain pipes in the correct order
    pipes = [
        ["tokenizer", "noun tagger"],
        ["verb tagger", "pronoun tagger"],
        ["adjective tagger"],
    ]
    for subpipeline, pipe_names in zip(nlp.subpipeline_templates, pipes):
        assert subpipeline["names"] == pipe_names
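# The `noun_tagger`, `verb_tagger`, `pronoun_tagger`, and `adjective_tagger`
# fixtures are not defined in this snippet. A plausible construction sketch,
# assuming they are SimpleTagger instances like the one in
# test_pipeline_output below, and that SimpleTagger is exported from
# syfertext.pipeline (the attribute names and lookup lists here are
# illustrative, not taken from the original test file):
from syfertext.pipeline import SimpleTagger

noun_tagger = SimpleTagger(attribute="noun", lookups=["fox", "dog"], tag=True)
verb_tagger = SimpleTagger(attribute="verb", lookups=["jumps"], tag=True)
pronoun_tagger = SimpleTagger(attribute="pronoun", lookups=["it"], tag=True)
adjective_tagger = SimpleTagger(attribute="adjective", lookups=["quick"], tag=True)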
def test_pipeline_output():
    nlp = syfertext.load("en_core_web_lg", owner=me)

    james = sy.VirtualWorker(hook, id="james")

    # Create a PySyft String and send it to the remote worker james
    text_ptr = String("building SyferText").send(james)

    # Add a tagger with remote = True
    tagger = SimpleTagger(attribute="noun", lookups=["SyferText"], tag=True)
    nlp.add_pipe(tagger, name="noun_tagger", remote=True)

    # Upon processing the text present on james's machine, the pipeline
    # should return a DocPointer to the doc on james's machine
    doc = nlp(text_ptr)
    assert isinstance(doc, DocPointer)

    # Assert there is only one Doc object on james's machine
    documents = [v for v in james._objects.values() if isinstance(v, Doc)]
    assert len(documents) == 1

    # Assert the returned DocPointer points to the Doc object on james's machine
    assert doc.id_at_location == documents[0].id

    # Assert there is only one SubPipeline object on james's machine
    subpipelines = [v for v in james._objects.values() if isinstance(v, SubPipeline)]
    assert len(subpipelines) == 1

    # Make sure the subpipeline object contains the tokenizer and the tagger
    pipes = subpipelines[0].pipe_names
    assert len(pipes) == 2
    assert pipes[0] == "tokenizer"
    assert pipes[1] == "noun_tagger"

    # nlp.pipeline stores pointers to subpipeline objects on remote machines.
    # Assert the subpipeline pointer stored in nlp.pipeline points to the
    # subpipeline on james's machine.
    assert nlp.pipeline[0]["james"].id_at_location == subpipelines[0].id
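# For contrast: processing a local string (rather than a pointer to a
# remote String) returns a Doc directly, as test_dynamic_store_size above
# relies on. A small sketch (not in the original suite):
def test_pipeline_output_local():
    nlp = syfertext.load("en_core_web_lg", owner=me)

    # A plain local string goes through the local subpipeline and
    # yields a local Doc, not a DocPointer.
    doc = nlp("building SyferText")
    assert isinstance(doc, Doc)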
def test_addition_and_removal_of_pipeline_components():
    """Test the add_pipe and remove_pipe methods."""

    nlp = syfertext.load("en_core_web_lg", owner=me)

    # Add the pipeline components to the SyferText pipeline
    nlp.add_pipe(noun_tagger, name="noun tagger")
    nlp.add_pipe(verb_tagger, name="verb tagger")

    # Note: The tokenizer is always the first component in any pipeline and
    # is added by default to nlp.pipeline_template.
    # So the current state of nlp.pipeline_template should look like this:
    #
    # nlp.pipeline_template = [{'remote': True, 'name': 'tokenizer'},
    #                          {'remote': False, 'name': 'noun tagger'},
    #                          {'remote': False, 'name': 'verb tagger'}]
    assert len(nlp.pipeline_template) == 3

    # Remove the noun tagger from the pipeline
    nlp.remove_pipe(name="noun tagger")

    # Assert the pipeline has two components left
    assert len(nlp.pipeline_template) == 2
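    # A sketch of one extra check (not in the original test), assuming each
    # nlp.pipeline_template entry keeps the {'remote': ..., 'name': ...}
    # dict shape shown in the comment above: the surviving component
    # names, in order.
    names = [pipe["name"] for pipe in nlp.pipeline_template]
    assert names == ["tokenizer", "verb tagger"]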
import syft as sy
import torch
import syfertext
from syft.generic.string import String
from syfertext.doc import Doc
from syfertext.span import Span
from syfertext.pointers.doc_pointer import DocPointer
from syfertext.pointers.span_pointer import SpanPointer

hook = sy.TorchHook(torch)
me = hook.local_worker
nlp = syfertext.load("en_core_web_lg", owner=me)


def test_creation_of_basic_span():
    """Test the __getitem__() method of doc returns a Span when passed in a slice."""

    doc = nlp("the quick brown fox jumps over lazy dog")
    span = doc[1:5]

    actual_tokens = ["quick", "brown", "fox", "jumps"]

    assert len(span) == len(actual_tokens)

    for token, actual_token in zip(span, actual_tokens):
        assert token.text == actual_token
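def test_integer_indexing_returns_token():
    """A sketch (not in the original suite): doc[i] with an integer index
    yields a single Token rather than a Span, as the token tests below
    rely on via nlp(text)[0]."""

    doc = nlp("the quick brown fox")

    # Integer indexing returns the token at that position.
    assert doc[1].text == "quick"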
import syft as sy
import torch
import syfertext

hook = sy.TorchHook(torch)
me = hook.local_worker

lang = "en_core_web_lg"
nlp = syfertext.load(lang, owner=me)
vocab = nlp.vocab


def test_token_text_with_ws():
    text = "Green Apple "
    doc = nlp(text)

    tok1 = doc[0]
    tok2 = doc[1]

    assert tok1.text_with_ws + tok2.text_with_ws == text


def test_token_lex_id():
    text = "apple"

    # Create a token object
    token = nlp(text)[0]

    # Get the Lexeme object from the vocab
    lexeme = vocab[text]

    # Test that the lexeme's rank and the token's lex_id are equal
    assert token.lex_id == lexeme.rank
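def test_text_with_ws_roundtrip():
    """A sketch (not in the original suite): since text_with_ws keeps each
    token's trailing whitespace, concatenating it over the whole Doc should
    reconstruct the original text, as test_token_text_with_ws checks for
    two tokens."""

    text = "Green Apple "
    doc = nlp(text)

    # Concatenating text_with_ws over all tokens rebuilds the input.
    assert "".join(token.text_with_ws for token in doc) == text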
def test_subpipeline_is_not_recreated_in_remote_workers():
    """Test that a subpipeline at a given index is not recreated in remote
    workers after it has been initialized once. Each worker contains a
    single subpipeline, with multiple components.
    """

    nlp = syfertext.load("en_core_web_lg", owner=me)

    alice = sy.VirtualWorker(hook, id="alice")
    bob = sy.VirtualWorker(hook, id="bob")

    # Create 4 PySyft Strings and send them to remote workers
    # (3 to Bob, 1 to Alice)
    texts = [String(text) for text in ["hello", "syfertext", "private", "nlp"]]
    texts_ptr = [
        texts[0].send(bob),
        texts[1].send(bob),
        texts[2].send(alice),
        texts[3].send(bob),
    ]

    # The first time a text owned by `bob` is tokenized, a `SubPipeline` object is
    # created by the `nlp` object and sent to `bob`. The `nlp` object keeps a
    # pointer to the tokenizer object on `bob`'s machine.
    doc1 = nlp(texts_ptr[0])

    subpipelines = [v for v in bob._objects.values() if isinstance(v, SubPipeline)]
    documents = [v for v in bob._objects.values() if isinstance(v, Doc)]

    assert len(subpipelines) == 1
    assert len(documents) == 1
    assert len(nlp.pipeline[0].keys()) == 1
    assert "bob" in nlp.pipeline[0]

    # The second time a text owned by `bob` is tokenized, no new `SubPipeline`
    # objects are created; only a new document appears on `bob`'s machine.
    doc2 = nlp(texts_ptr[1])

    subpipelines = [v for v in bob._objects.values() if isinstance(v, SubPipeline)]
    documents = [v for v in bob._objects.values() if isinstance(v, Doc)]

    assert len(subpipelines) == 1
    assert len(documents) == 2
    assert len(nlp.pipeline[0].keys()) == 1

    # The first time a text owned by `alice` is tokenized, a new `SubPipeline` object
    # is created by the `nlp` object and sent to `alice`. Now the `nlp` object has
    # a second pointer to a `SubPipeline`.
    doc3 = nlp(texts_ptr[2])

    subpipelines = [v for v in alice._objects.values() if isinstance(v, SubPipeline)]
    documents = [v for v in alice._objects.values() if isinstance(v, Doc)]

    assert len(subpipelines) == 1
    assert len(documents) == 1
    assert len(nlp.pipeline[0].keys()) == 2
    assert "alice" in nlp.pipeline[0]

    # The third time a text owned by `bob` is tokenized, no new `SubPipeline`
    # objects are created. The `nlp` object still has the same pointer to the
    # `SubPipeline` on `bob`'s machine, and `bob` now has a third document.
    doc4 = nlp(texts_ptr[3])

    subpipelines = [v for v in bob._objects.values() if isinstance(v, SubPipeline)]
    documents = [v for v in bob._objects.values() if isinstance(v, Doc)]

    assert len(subpipelines) == 1
    assert len(documents) == 3
    assert len(nlp.pipeline[0].keys()) == 2
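    # Closing sanity check (a sketch, grounded in the assertions above):
    # after all four texts are processed, `nlp.pipeline[0]` holds exactly
    # one subpipeline pointer per remote worker.
    assert set(nlp.pipeline[0].keys()) == {"alice", "bob"}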
        # Print for debugging purposes
        # print("mapping", example['label'], " to ", one_hot_label)

        # Send the transcription label
        example['label'] = one_hot_label.send(worker)


# Bob's remote dataset
make_remote_dataset(train_bob, bob)
make_remote_dataset(val_bob, bob)

# Alice's remote dataset
make_remote_dataset(train_alice, alice)
make_remote_dataset(val_alice, alice)

# Create a Language object with SyferText
nlp = syfertext.load('en_core_web_lg', owner=me)

use_stop_tagger = True
use_vocab_tagger = True

# Tokens with these custom tags will be
# excluded when creating the Doc vector
excluded_tokens = {}

## Load the list of stop words
with open('./data/clinical-stopwords.txt', 'r') as f:
    stop_words = set(f.read().splitlines())

# Create a simple tagger object to tag stop words
stop_tagger = SimpleTagger(attribute='is_stop',
                           lookups=stop_words,
                           tag=True)
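# A plausible continuation (not in the original snippet), assuming the
# tagger is registered with add_pipe as in the test suite above, and that
# `excluded_tokens` maps a tag attribute to the set of values to exclude
# (format inferred from the comment above; the pipe name and remote flag
# are illustrative):
if use_stop_tagger:
    nlp.add_pipe(stop_tagger, name='stop tagger', remote=True)

    # Exclude tokens tagged as stop words from the Doc vector.
    excluded_tokens['is_stop'] = {True}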