def parser(vocab):
    vocab.strings.add("ROOT")
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_PARSER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    parser = DependencyParser(vocab, model, **config)
    parser.cfg["token_vector_width"] = 4
    parser.cfg["hidden_width"] = 32
    # parser.add_label('right')
    parser.add_label("left")
    parser.initialize(lambda: [_parser_example(parser)])
    sgd = Adam(0.001)
    for i in range(10):
        losses = {}
        doc = Doc(vocab, words=["a", "b", "c", "d"])
        example = Example.from_dict(
            doc, {"heads": [1, 1, 3, 3], "deps": ["left", "ROOT", "left", "ROOT"]}
        )
        parser.update([example], sgd=sgd, losses=losses)
    return parser

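# Usage sketch (hypothetical, not part of the fixture above): a spaCy pipe is
# callable, so the trained parser can annotate a fresh Doc directly. Assumes
# the same spaCy v3 imports as the fixture.
def _example_parser_usage(vocab):
    trained = parser(vocab)
    doc = trained(Doc(vocab, words=["a", "b", "c", "d"]))
    for token in doc:
        print(token.text, token.dep_, token.head.i)
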
def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
    if tag_map_loc:
        with open(tag_map_loc) as file_:
            tag_map = json.loads(file_.read())
    else:
        tag_map = DEFAULT_TAG_MAP
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True,
                 'labels': actions,
                 'features': features}).encode('utf8'))
    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters,
                  tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if tag_map:
                for tag in tags:
                    assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(15):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

def train_parser(nlp, train_data, left_labels, right_labels):
    parser = DependencyParser(nlp.vocab,
                              left_labels=left_labels,
                              right_labels=right_labels)
    for itn in range(1000):
        random.shuffle(train_data)
        loss = 0
        for words, heads, deps in train_data:
            doc = Doc(nlp.vocab, words=words)
            gold = GoldParse(doc, heads=heads, deps=deps)
            loss += parser.update(doc, gold)
    parser.model.end_training()
    return parser

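# Hypothetical call sketch for train_parser above (spaCy v1-era API); assumes
# an `nlp` pipeline object is in scope. train_data entries are
# (words, heads, deps) triples; the left/right label sets are derived from
# whether a head lies after or before its dependent.
train_data = [
    (["They", "trade", "mortgage", "-", "backed", "securities"],
     [1, 1, 4, 4, 5, 1],
     ["nsubj", "ROOT", "compound", "punct", "amod", "dobj"]),
]
left_labels, right_labels = set(), set()
for _, heads, deps in train_data:
    for i, (head, dep) in enumerate(zip(heads, deps)):
        if head > i:
            left_labels.add(dep)
        elif head < i:
            right_labels.add(dep)
trained_parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels))
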
def parser(en_vocab):
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
        "beam_width": 1,
        "beam_update_prob": 1.0,
        "beam_density": 0.0,
    }
    cfg = {"model": DEFAULT_PARSER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    parser = DependencyParser(en_vocab, model, **config)
    parser.add_label("nsubj")
    return parser

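# Hypothetical pytest-style consumer of the fixture above, assuming it is
# registered as a pytest fixture in the same module:
def test_parser_has_nsubj(parser):
    assert "nsubj" in parser.labels
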
def parser(vocab):
    parser = DependencyParser(vocab)
    parser.cfg["token_vector_width"] = 4
    parser.cfg["hidden_width"] = 32
    # parser.add_label('right')
    parser.add_label("left")
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)
    for i in range(10):
        losses = {}
        doc = Doc(vocab, words=["a", "b", "c", "d"])
        gold = GoldParse(doc, heads=[1, 1, 3, 3],
                         deps=["left", "ROOT", "left", "ROOT"])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser

def test_beam_parse():
    nlp = Language()
    nlp.add_pipe(DependencyParser(nlp.vocab), name="parser")
    nlp.parser.add_label("nsubj")
    nlp.parser.begin_training([], token_vector_width=8, hidden_width=8)
    doc = nlp.make_doc("Australia is a country")
    nlp.parser(doc, beam_width=2)

def test_get_oracle_actions():
    ids, words, tags, heads, deps, ents = [], [], [], [], [], []
    for id_, word, tag, head, dep, ent in annot_tuples:
        ids.append(id_)
        words.append(word)
        tags.append(tag)
        heads.append(head)
        deps.append(dep)
        ents.append(ent)
    doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
    cfg = {"model": DEFAULT_PARSER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    parser = DependencyParser(doc.vocab, model)
    parser.moves.add_action(0, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(4, "ROOT")
    heads, deps = projectivize(heads, deps)
    for i, (head, dep) in enumerate(zip(heads, deps)):
        if head > i:
            parser.moves.add_action(2, dep)
        elif head < i:
            parser.moves.add_action(3, dep)
    example = Example.from_dict(
        doc, {"words": words, "tags": tags, "heads": heads, "deps": deps}
    )
    parser.moves.get_oracle_sequence(example)

def dependency_parse(sents):
    # NB: `nlp` is a free variable here and the DependencyParser instance is
    # never applied; `sents` are assumed to be already-parsed spaCy spans.
    parser = DependencyParser(nlp.vocab)
    parsed_sents = []
    for sent in sents:
        parsed_sents.append(spacy_to_tree(sent.root).pretty_print())
    return parsed_sents

def main(train_loc, dev_loc, model_dir, tag_map_loc):
    with open(tag_map_loc) as file_:
        tag_map = json.loads(file_.read())
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    with (model_dir / 'deps' / 'config.json').open('w') as file_:
        json.dump({'pseudoprojective': True,
                   'labels': actions,
                   'features': features}, file_)
    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters,
                  tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            for tag in tags:
                assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features)

    for itn in range(15):
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                parser.update(doc, gold)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

def parser(vocab):
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_PARSER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    parser = DependencyParser(vocab, model, **config)
    return parser

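# Sketch (same assumed imports as above): a parser built this way has no
# labels yet, so it needs add_label and initialize before it can train or
# parse. `_parser_example` is the helper the other fixtures here rely on.
def _initialized_parser(vocab):
    p = parser(vocab)
    p.add_label("left")
    p.initialize(lambda: [_parser_example(p)])
    return p
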
def dependency_parse(self):
    """
    Dependency parser: analyzes the grammatical structure of a sentence,
    establishing relationships between "head" words and the words that
    modify those heads.
    :return: the parsed Doc
    """
    # self.nlp is assumed to already hold a loaded pipeline
    # (e.g. en_core_web_sm), so no extra spacy.load() call is needed.
    parser = DependencyParser(self.nlp.vocab)
    doc = self.nlp(self.text)
    processed = parser(doc)
    log.info("Dependency Parsing : %s", processed)  # a Doc is not a str; don't concatenate
    return processed

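# Hedged alternative sketch: a freshly constructed DependencyParser has no
# trained weights, and a loaded pipeline already parses dependencies, so in
# practice the extra parser instance above is unnecessary. Names (nlp, text)
# mirror the method above.
def dependency_parse_via_pipeline(nlp, text):
    doc = nlp(text)  # the pipeline's own parser assigns heads and dep labels
    return [(t.text, t.dep_, t.head.text) for t in doc]
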
def train_spacy(data, iterations, model=None):
    TRAIN_DATA = pickle.load(open(data, 'rb'))
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")
    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # NB: `move_names` is not defined in this function; it must be
        # collected from the existing model elsewhere in the module.
        print("Existing entities in the model are:", move_names)
        optimizer = nlp.entity.create_optimizer()
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,          # batch of texts
                    annotations,    # batch of annotations
                    drop=0.2,       # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    # custom_ner_model = spacy.load(nlp)
    nlp_core_model = spacy.load("en_core_web_lg")
    tagger = Tagger(nlp_core_model.vocab)
    nlp.add_pipe(tagger, before="ner")
    parser = DependencyParser(nlp_core_model.vocab)
    nlp.add_pipe(parser, before="ner")
    # NB: begin_training re-initializes pipe weights; it is needed for the
    # newly added tagger/parser but will also reset the NER just trained.
    nlp.begin_training()
    return TRAIN_DATA, nlp

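# Hypothetical invocation of train_spacy; "train.pickle" stands in for a
# pickled list of (text, {"entities": [(start, end, label), ...]}) pairs.
TRAIN_DATA, nlp = train_spacy("train.pickle", iterations=20)
nlp.to_disk("custom_model")
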
def test_issue3830_with_subtok():
    """Test that the parser does have subtok label if learn_tokens=True."""
    config = {
        "learn_tokens": True,
    }
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    parser = DependencyParser(Vocab(), model, **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.initialize(lambda: [_parser_example(parser)])
    assert "subtok" in parser.labels

def test_get_oracle_actions():
    doc = Doc(Vocab(), words=[t[1] for t in annot_tuples])
    parser = DependencyParser(doc.vocab)
    parser.moves.add_action(0, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(1, "")
    parser.moves.add_action(4, "ROOT")
    for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples):
        if head > i:
            parser.moves.add_action(2, dep)
        elif head < i:
            parser.moves.add_action(3, dep)
    ids, words, tags, heads, deps, ents = zip(*annot_tuples)
    heads, deps = projectivize(heads, deps)
    gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps)
    parser.moves.preprocess_gold(gold)
    parser.moves.get_oracle_sequence(doc, gold)

def test_issue3830_no_subtok():
    """Test that the parser doesn't have subtok label if not learn_tokens."""
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    model = registry.resolve({"model": DEFAULT_PARSER_MODEL}, validate=True)["model"]
    parser = DependencyParser(Vocab(), model, **config)
    parser.add_label("nsubj")
    assert "subtok" not in parser.labels
    parser.initialize(lambda: [_parser_example(parser)])
    assert "subtok" not in parser.labels

def parser(en_vocab):
    parser = DependencyParser(en_vocab)
    parser.add_label("nsubj")
    parser.model, cfg = parser.Model(parser.moves.n_moves)
    parser.cfg.update(cfg)
    return parser

def parser(vocab):
    parser = DependencyParser(vocab)
    return parser

def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True,
                 'labels': actions,
                 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)
    tagger = Tagger(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(30):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

def parser(vocab):
    cfg = {"model": DEFAULT_PARSER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    parser = DependencyParser(vocab, model)
    return parser

def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True,
                 'labels': actions,
                 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)
    tagger = Tagger(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(30):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))

def blank_parser(en_vocab):
    parser = DependencyParser(en_vocab)
    return parser

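# Sketch of a typical use for a blank-parser fixture: a serialization
# round-trip (hypothetical test; to_bytes/from_bytes are the standard spaCy
# pipe serialization methods).
def test_blank_parser_roundtrip(en_vocab):
    p = blank_parser(en_vocab)
    data = p.to_bytes(exclude=["vocab"])
    p2 = DependencyParser(en_vocab)
    p2.from_bytes(data)
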