def test_hyphen_in_config():
    """Test that a hyphen in a factory setting value survives config parsing."""
    config_str = """
    [nlp]
    lang = "en"
    pipeline = ["my_punctual_component"]

    [components]

    [components.my_punctual_component]
    factory = "my_punctual_component"
    punctuation = ["?","-"]
    """

    @spacy.Language.factory("my_punctual_component")
    class MyPunctualComponent(object):
        name = "my_punctual_component"

        def __init__(self, nlp, name, punctuation):
            self.punctuation = punctuation

    # Building the pipeline from the config must preserve the hyphen value.
    nlp = English.from_config(load_config_from_str(config_str))
    assert nlp.get_pipe("my_punctual_component").punctuation == ["?", "-"]
def test_issue6950():
    """Test that the nlp object with initialized tok2vec with listeners pickles
    correctly (and doesn't have lambdas).
    """
    config = load_config_from_str(CONFIG_ISSUE_6950)
    nlp = English.from_config(config)

    def get_examples():
        return [Example.from_dict(nlp.make_doc("hello"), {"tags": ["V"]})]

    nlp.initialize(get_examples)
    # Must be picklable both before and after processing a document.
    pickle.dumps(nlp)
    nlp("hello")
    pickle.dumps(nlp)
def test_issue7029():
    """Test that an empty document doesn't mess up an entire batch."""
    nlp = English.from_config(load_config_from_str(CONFIG))
    train_examples = [
        Example.from_dict(nlp.make_doc(t[0]), t[1]) for t in TRAIN_DATA
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for _ in range(50):
        nlp.update(train_examples, sgd=optimizer, losses={})
    # The last text is empty; predictions on the rest must not depend on
    # how the texts are batched.
    texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]
    docs_small_batches = list(nlp.pipe(texts, batch_size=1))
    docs_large_batches = list(nlp.pipe(texts, batch_size=4))
    tags_small = [doc[0].tag_ for doc in docs_small_batches[:-1]]
    tags_large = [doc[0].tag_ for doc in docs_large_batches[:-1]]
    assert tags_small == tags_large
def test_issue6908(component_name):
    """Test initializing textcat with labels in a list."""

    def create_data(out_file):
        # Write a one-doc DocBin with category labels to the given path.
        nlp = spacy.blank("en")
        doc = nlp.make_doc("Some text")
        doc.cats = {"label1": 0, "label2": 1}
        with out_file.open("wb") as file_:
            file_.write(DocBin(docs=[doc]).to_bytes())

    with make_tempdir() as tmp_path:
        train_path = tmp_path / "train.spacy"
        create_data(train_path)
        config_str = CONFIG_ISSUE_6908.replace(
            "TEXTCAT_PLACEHOLDER", component_name
        ).replace("TRAIN_PLACEHOLDER", train_path.as_posix())
        init_nlp(load_config_from_str(config_str))
def test_empty_doc():
    """Test that an empty document gets processed correctly """
    nlp = English.from_config(load_config_from_str(CONFIG))
    train_examples = [
        Example.from_dict(nlp.make_doc(t[0]), t[1]) for t in TRAIN_DATA
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for _ in range(2):
        nlp.update(train_examples, sgd=optimizer, losses={})
    texts = ["first", "second", "third", "fourth", "and", "then", "some", ""]

    def first_token_tags(docs):
        # The final text is empty, so its doc is skipped.
        return [doc[0].tag_ for doc in docs[:-1]]

    # run as normal
    nlp.select_pipes(enable=["transformer", "tagger"])
    docs_one_by_one = list(nlp.pipe(texts, batch_size=1))
    docs_batched = list(nlp.pipe(texts, batch_size=4))
    assert first_token_tags(docs_one_by_one) == first_token_tags(docs_batched)
    # disable the transformer (the listener will produce random output)
    nlp.select_pipes(enable=["tagger"])
    docs_one_by_one = list(nlp.pipe(texts, batch_size=1))
    docs_batched = list(nlp.pipe(texts, batch_size=4))
    assert first_token_tags(docs_one_by_one) == first_token_tags(docs_batched)