def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
    """Train a tagger and a dependency parser, then save them to ``model_dir``.

    Runs 15 passes over the training data, scoring on the development data
    after each pass and once more after ``end_training``.

    Args:
        train_loc: Training data location, read with ``read_conllx``.
        dev_loc: Development data location, re-read with ``read_conllx``
            every time the model is scored.
        model_dir: Output directory; it and its ``deps`` sub-directory are
            created when missing.
        tag_map_loc: Optional path to a JSON tag map. ``DEFAULT_TAG_MAP`` is
            used when this is falsy.

    Raises:
        AssertionError: If the tag map is non-empty and a training tag is
            missing from it.
    """
    if tag_map_loc:
        with open(tag_map_loc) as file_:
            tag_map = json.load(file_)
    else:
        tag_map = DEFAULT_TAG_MAP
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    deps_dir = model_dir / 'deps'
    if not deps_dir.exists():
        # parents=True also creates model_dir itself, so pointing the script
        # at a fresh output location does not abort before training starts.
        deps_dir.mkdir(parents=True)
    config = {'pseudoprojective': True, 'labels': actions, 'features': features}
    # The file is opened in binary mode, so the JSON text is encoded first.
    with (deps_dir / 'config.json').open('wb') as file_:
        file_.write(json.dumps(config).encode('utf8'))

    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab: the lookups are done for their side effect only, the
    # returned values are discarded.
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if tag_map:
                for tag in tags:
                    assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(15):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                # The parser is updated on a doc the tagger has already
                # processed; the tagger itself is updated on a fresh doc.
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    # Score once more after end_training; itn still holds the last pass index.
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
def main(train_loc, dev_loc, model_dir, tag_map_loc):
    """Train a tagger and a dependency parser, then save them to ``model_dir``.

    Runs 15 passes over the training data, scoring on the development data
    after each pass and once more after ``end_training``.

    Args:
        train_loc: Training data location, read with ``read_conllx``.
        dev_loc: Development data location, re-read with ``read_conllx``
            every time the model is scored.
        model_dir: Output directory; it and its ``deps`` sub-directory are
            created when missing.
        tag_map_loc: Path to a JSON file holding the tag map.

    Raises:
        AssertionError: If a training tag is missing from the tag map.
    """
    with open(tag_map_loc) as file_:
        tag_map = json.load(file_)
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    deps_dir = model_dir / 'deps'
    if not deps_dir.exists():
        # Opening config.json below fails when its directory is absent, so
        # create it (and model_dir, via parents=True) up front.
        deps_dir.mkdir(parents=True)
    with (deps_dir / 'config.json').open('w') as file_:
        json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)

    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab: the lookups are done for their side effect only, the
    # returned values are discarded.
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            for tag in tags:
                assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features)

    for itn in range(15):
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                # The parser is updated on a doc the tagger has already
                # processed; the tagger itself is updated on a fresh doc.
                tagger(doc)
                parser.update(doc, gold)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    # Score once more after end_training; itn still holds the last pass index.
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    """Train a tagger and a dependency parser for one language and save them.

    Runs 30 passes over the training data, scoring on the development data
    after each pass and once more after ``end_training``.

    Args:
        lang_name: Name handed to ``spacy.util.get_lang_class``.
        train_loc: Training data location, read with ``read_conllx``.
        dev_loc: Development data location, re-read with ``read_conllx``
            every time the model is scored.
        model_dir: Output directory. It and its ``deps``, ``pos`` and
            ``vocab`` sub-directories are created when missing.
        clusters_loc: Optional path to a text file whose lines hold three
            whitespace-separated fields (cluster, word, freq).

    Raises:
        AssertionError: If the vocab's tag map is non-empty and a training
            tag is missing from it.
    """
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = PseudoProjectivity.preprocess_training_data(
        list(read_conllx(train_loc)))

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    for directory in (model_dir, model_dir / 'deps', model_dir / 'pos'):
        if not directory.exists():
            directory.mkdir()
    config = {'pseudoprojective': True, 'labels': actions, 'features': features}
    # The file is opened in binary mode, so the JSON text is encoded first.
    with (model_dir / 'deps' / 'config.json').open('wb') as config_file:
        config_file.write(json.dumps(config).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    vocab_dir = model_dir / 'vocab'
    if not vocab_dir.exists():
        vocab_dir.mkdir()
    elif (vocab_dir / 'strings.json').exists():
        # An existing vocab directory is reused: strings first, then the
        # lexemes if they are present as well.
        with (vocab_dir / 'strings.json').open() as strings_file:
            vocab.strings.load(strings_file)
        if (vocab_dir / 'lexemes.bin').exists():
            vocab.load_lexemes(vocab_dir / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as clusters_file:
            for line in clusters_file:
                fields = line.split()
                if len(fields) != 3:
                    # Lines without exactly three fields are skipped.
                    continue
                cluster, word, _freq = fields
                # The cluster string is reversed and then read as base 2.
                vocab[word].cluster = int(cluster[::-1], 2)

    # Populate vocab: the lookups are done for their side effect only.
    for _, doc_sents in train_sents:
        for (_ids, words, tags, _heads, deps, _ner), _ in doc_sents:
            for symbols in (words, deps, tags):
                for symbol in symbols:
                    vocab[symbol]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)

    tagger = Tagger(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(30):
        loss = 0.
        for _, doc_sents in train_sents:
            for (_ids, words, tags, heads, deps, _ner), _ in doc_sents:
                tagged_doc = Doc(vocab, words=words)
                gold = GoldParse(tagged_doc, tags=tags, heads=heads, deps=deps)
                # The parser is updated on a doc the tagger has already
                # processed; the tagger itself is updated on a fresh doc.
                tagger(tagged_doc)
                loss += parser.update(tagged_doc, gold, itn=itn)
                tagger.update(Doc(vocab, words=words), gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))

    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    # Score once more after end_training; itn still holds the last pass index.
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
def arc_eager(vocab):
    """Build an ``ArcEager`` instance and register two extra actions on it.

    The instance is created from ``vocab.strings`` and the result of
    ``ArcEager.get_actions()`` called without arguments. ``add_action`` is
    then called with ``(2, "left")`` and ``(3, "right")``, in that order.

    Args:
        vocab: Object exposing a ``strings`` attribute.

    Returns:
        The configured ``ArcEager`` instance.
    """
    strings = vocab.strings
    transition_system = ArcEager(strings, ArcEager.get_actions())
    for action_id, label in ((2, "left"), (3, "right")):
        transition_system.add_action(action_id, label)
    return transition_system
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    """Train a tagger and a dependency parser for one language and save them.

    Runs 30 passes over the training data, scoring on the development data
    after each pass and once more after ``end_training``.

    Args:
        lang_name: Name handed to ``spacy.util.get_lang_class``.
        train_loc: Training data location, read with ``read_conllx``.
        dev_loc: Development data location, re-read with ``read_conllx``
            every time the model is scored.
        model_dir: Output directory. It and its ``deps``, ``pos`` and
            ``vocab`` sub-directories are created when missing.
        clusters_loc: Optional path to a text file whose lines hold three
            whitespace-separated fields (cluster, word, freq).

    Raises:
        AssertionError: If the vocab's tag map is non-empty and a training
            tag is missing from it.
    """
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    # The file is opened in binary mode, so the JSON text is encoded first.
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        # An existing vocab directory is reused: strings first, then the
        # lexemes if they are present as well.
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    # Lines without exactly three fields are skipped.
                    continue
                lex = vocab[word]
                # The cluster string is reversed and then read as base 2.
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab: the lookups are done for their side effect only, the
    # returned values are discarded.
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)
    tagger = Tagger(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(30):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                # The parser is updated on a doc the tagger has already
                # processed; the tagger itself is updated on a fresh doc.
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    # Build the pipeline from the class resolved for lang_name, which is the
    # same class that created the vocab, rather than from the generic base
    # Language class.
    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    # Score once more after end_training; itn still holds the last pass index.
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
def arc_eager(vocab):
    """Build an ``ArcEager`` instance with one left and one right label.

    The actions come from ``ArcEager.get_actions`` called with
    ``left_labels=["L"]`` and ``right_labels=["R"]``.

    Args:
        vocab: Object exposing a ``strings`` attribute.

    Returns:
        A new ``ArcEager`` built from ``vocab.strings`` and those actions.
    """
    labelled_actions = ArcEager.get_actions(
        left_labels=["L"],
        right_labels=["R"],
    )
    strings = vocab.strings
    return ArcEager(strings, labelled_actions)