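# Imports assumed by the workflow classes below (a sketch; the exact module
# paths depend on the SLING repo layout and are an assumption):
#
#   import os
#   import sling.flags as flags
#   import sling.task.corpora as corpora
#   from sling.task.workflow import Workflow, length_of
#   from sling.task.wiki import WikiWorkflow
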
class EmbeddingWorkflow:
  def __init__(self, name=None, wf=None):
    if wf is None: wf = Workflow(name)
    self.wf = wf
    self.wiki = WikiWorkflow(wf=wf)

  #---------------------------------------------------------------------------
  # Word embeddings
  #---------------------------------------------------------------------------

  def vocabulary(self, language=None):
    """Resource for word embedding vocabulary. This is a text map with
    (normalized) words and counts.
    """
    if language is None: language = flags.arg.language
    return self.wf.resource("word-vocabulary.map",
                            dir=corpora.wikidir(language),
                            format="textmap/word")

  def word_embeddings(self, language=None):
    """Resource for word embeddings in word2vec embedding format."""
    if language is None: language = flags.arg.language
    return self.wf.resource("word-embeddings.vec",
                            dir=corpora.wikidir(language),
                            format="embeddings")

  def extract_vocabulary(self, documents=None, output=None, language=None):
    """Extract word vocabulary from Wikipedia documents."""
    if language is None: language = flags.arg.language
    if documents is None: documents = self.wiki.wikipedia_documents(language)
    if output is None: output = self.vocabulary(language)
    with self.wf.namespace(language + "-vocabulary"):
      return self.wf.mapreduce(documents, output,
                               format="message/word:count",
                               mapper="word-embeddings-vocabulary-mapper",
                               reducer="word-embeddings-vocabulary-reducer",
                               params={"normalization": "dlw"})

  def train_word_embeddings(self, documents=None, vocabulary=None,
                            output=None, language=None):
    """Train word embeddings."""
    if language is None: language = flags.arg.language
    if documents is None: documents = self.wiki.wikipedia_documents(language)
    if vocabulary is None: vocabulary = self.vocabulary(language)
    if output is None: output = self.word_embeddings(language)
    with self.wf.namespace(language + "-word-embeddings"):
      trainer = self.wf.task("word-embeddings-trainer")
      trainer.add_params({
        "iterations": 5,
        "negative": 5,
        "window": 5,
        "learning_rate": 0.025,
        "min_learning_rate": 0.0001,
        "embedding_dims": 32,
        "subsampling": 1e-3,
        "normalization": "dlw",
      })
      trainer.attach_input("documents", documents)
      trainer.attach_input("vocabulary", vocabulary)
      trainer.attach_output("output", output)
      return output

  #---------------------------------------------------------------------------
  # Fact and category embeddings
  #---------------------------------------------------------------------------

  def fact_dir(self):
    return flags.arg.workdir + "/fact"

  def fact_lexicon(self):
    """Resource for fact vocabulary (text map with fact paths and counts)."""
    return self.wf.resource("facts.map",
                            dir=self.fact_dir(),
                            format="textmap/fact")

  def category_lexicon(self):
    """Resource for category vocabulary (text map with categories and
    counts)."""
    return self.wf.resource("categories.map",
                            dir=self.fact_dir(),
                            format="textmap/category")

  def facts(self):
    """Resource for resolved facts."""
    return self.wf.resource("facts.rec",
                            dir=self.fact_dir(),
                            format="records/fact")

  def fact_embeddings(self):
    """Resource for fact embeddings in word2vec embedding format."""
    return self.wf.resource("fact-embeddings.vec",
                            dir=self.fact_dir(),
                            format="embeddings")

  def category_embeddings(self):
    """Resource for category embeddings in word2vec embedding format."""
    return self.wf.resource("category-embeddings.vec",
                            dir=self.fact_dir(),
                            format="embeddings")

  def extract_fact_lexicon(self):
    """Build fact and category lexicons."""
    kb = self.wiki.knowledge_base()
    factmap = self.fact_lexicon()
    catmap = self.category_lexicon()
    with self.wf.namespace("fact-embeddings"):
      trainer = self.wf.task("fact-lexicon-extractor")
trainer.attach_input("kb", kb) trainer.attach_output("factmap", factmap) trainer.attach_output("catmap", catmap) return factmap, catmap def extract_facts(self): """Extract facts for items in the knowledge base.""" kb = self.wiki.knowledge_base() factmap = self.fact_lexicon() catmap = self.category_lexicon() output = self.facts() with self.wf.namespace("fact-embeddings"): extractor = self.wf.task("fact-extractor") extractor.attach_input("kb", kb) extractor.attach_input("factmap", factmap) extractor.attach_input("catmap", catmap) facts = self.wf.channel(extractor, format="message/frame") return self.wf.write(facts, output, name="fact-writer") def train_fact_embeddings(self): """Train fact and category embeddings.""" facts = self.facts() factmap = self.fact_lexicon() catmap = self.category_lexicon() fact_embeddings = self.fact_embeddings() category_embeddings = self.category_embeddings() with self.wf.namespace("fact-embeddings"): trainer = self.wf.task("fact-embeddings-trainer") trainer.add_params({ "batch_size": 256, "batches_per_update": 32, "embedding_dims": 256, "normalize": False, "epochs" : 100000, "report_interval": 250, "learning_rate": 1.0, "learning_rate_decay": 0.95, "rampup": 120, "clipping": 1, "optimizer": "sgd", }) self.wf.connect(self.wf.read(facts, name="fact-reader"), trainer) trainer.attach_input("factmap", factmap) trainer.attach_input("catmap", catmap) trainer.attach_output("factvecs", fact_embeddings) trainer.attach_output("catvecs", category_embeddings) return fact_embeddings, category_embeddings def fact_plausibility_model(self): """Resource for fact plausibility model.""" return self.wf.resource("plausibility.flow", dir=self.fact_dir(), format="flow") def train_fact_plausibility(self): """Train fact plausibility model.""" facts = self.facts() factmap = self.fact_lexicon() model = self.fact_plausibility_model(); with self.wf.namespace("fact-plausibility"): trainer = self.wf.task("fact-plausibility-trainer") trainer.add_params({ "batch_size": 4, "batches_per_update": 256, "min_facts": 4, "embedding_dims": 128, "epochs" : 250000, "report_interval": 1000, "checkpoint_interval": 50000, "learning_rate": 1.0, "min_learning_rate": 0.001, "learning_rate_decay": 0.95, "clipping": 1, "optimizer": "sgd", "rampup": 5 * 60, }) self.wf.connect(self.wf.read(facts, name="fact-reader"), trainer) trainer.attach_input("factmap", factmap) trainer.attach_output("model", model) return model
class EntityWorkflow:
  def __init__(self, name=None, wf=None):
    if wf is None: wf = Workflow(name)
    self.wf = wf
    self.wiki = WikiWorkflow(wf=wf)

  def workdir(self, language=None):
    if language is None:
      return flags.arg.workdir + "/ner"
    else:
      return flags.arg.workdir + "/ner/" + language

  #---------------------------------------------------------------------------
  # Wikipedia link graph
  #---------------------------------------------------------------------------

  def wikilinks(self):
    """Resource for wikipedia link graph."""
    return self.wf.resource("*****@*****.**",
                            dir=self.workdir(),
                            format="records/frame")

  def fanin(self):
    """Resource for wikipedia link fan-in."""
    return self.wf.resource("fanin.rec",
                            dir=self.workdir(),
                            format="records/frame")

  def extract_wikilinks(self):
    # Build link graph over all Wikipedias.
    documents = []
    for l in flags.arg.languages:
      documents.extend(self.wiki.wikipedia_documents(l))

    # Extract links from documents.
    mapper = self.wf.task("wikipedia-link-extractor")
    self.wf.connect(self.wf.read(documents), mapper)
    links = self.wf.channel(mapper, format="message/frame", name="output")
    counts = self.wf.channel(mapper, format="message/int", name="fanin")

    # Reduce output links.
    wikilinks = self.wikilinks()
    self.wf.reduce(self.wf.shuffle(links, shards=length_of(wikilinks)),
                   wikilinks, "wikipedia-link-merger")

    # Reduce fan-in.
    fanin = self.fanin()
    self.wf.reduce(self.wf.shuffle(counts, shards=length_of(fanin)),
                   fanin, "item-popularity-reducer")

    return wikilinks, fanin

  #---------------------------------------------------------------------------
  # IDF table
  #---------------------------------------------------------------------------

  def idftable(self, language=None):
    """Resource for IDF table."""
    if language is None: language = flags.arg.language
    return self.wf.resource("idf.repo",
                            dir=self.workdir(language),
                            format="repository")

  def build_idf(self, language=None):
    # Build IDF table from Wikipedia.
    if language is None: language = flags.arg.language
    documents = self.wiki.wikipedia_documents(language)
    with self.wf.namespace(language + "-idf"):
      # Collect words.
      wordcounts = self.wf.shuffle(
        self.wf.map(documents, "vocabulary-mapper",
                    format="message/count",
                    params={
                      "min_document_length": 200,
                      "only_lowercase": True
                    }))

      # Build IDF table.
      builder = self.wf.task("idf-table-builder", params={"threshold": 30})
      self.wf.connect(wordcounts, builder)
      builder.attach_output("repository", self.idftable(language))

  #---------------------------------------------------------------------------
  # Fused items
  #---------------------------------------------------------------------------

  def fused_items(self):
    """Resource for merged items for NER."""
    return self.wf.resource("*****@*****.**",
                            dir=self.workdir(),
                            format="records/frame")

  def fuse_items(self):
    """Fuse items including the link graph."""
    return self.wiki.fuse_items(extras=self.wikilinks() + [self.fanin()],
                                output=self.fused_items())

  #---------------------------------------------------------------------------
  # Knowledge base
  #---------------------------------------------------------------------------

  def knowledge_base(self):
    """Resource for NER knowledge base."""
    return self.wf.resource("kb.sling",
                            dir=self.workdir(),
                            format="store/frame")

  def build_knowledge_base(self):
    """Build knowledge base for NER."""
    items = self.fused_items()
    properties = self.wiki.wikidata_properties()
    schemas = self.wiki.schema_defs()
    with self.wf.namespace("ner-kb"):
      # Prune information from Wikidata items.
      pruned_items = self.wf.map(items, "wikidata-pruner",
                                 params={
                                   "prune_aliases": True,
                                   "prune_wiki_links": False,
                                   "prune_category_members": True
                                 })

      # Collect property catalog.
      property_catalog = self.wf.map(properties, "wikidata-property-collector")

      # Collect frames into knowledge base store.
      parts = self.wf.collect(pruned_items, property_catalog, schemas)
      return self.wf.write(parts, self.knowledge_base(),
                           params={"snapshot": True})

  #---------------------------------------------------------------------------
  # Labeled documents
  #---------------------------------------------------------------------------

  def labeled_documents(self, language=None):
    """Resource for labeled documents."""
    if language is None: language = flags.arg.language
    return self.wf.resource("*****@*****.**",
                            dir=self.workdir(language),
                            format="records/document")

  def label_documents(self, indocs=None, outdocs=None, language=None):
    """Label documents with entity annotations."""
    if language is None: language = flags.arg.language
    if indocs is None: indocs = self.wiki.wikipedia_documents(language)
    if outdocs is None: outdocs = self.labeled_documents(language)
    with self.wf.namespace(language + "-ner"):
      mapper = self.wf.task("document-ner-labeler", "labeler")
      mapper.add_param("resolve", True)
      mapper.attach_input("commons", self.knowledge_base())
      mapper.attach_input("aliases", self.wiki.phrase_table(language))
      mapper.attach_input("dictionary", self.idftable(language))
      self.wf.connect(self.wf.read(indocs), mapper)
      output = self.wf.channel(mapper, format="message/document")
      return self.wf.write(output, outdocs)
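
# Usage sketch (assumption, not from the original file): stage order for the
# NER pipeline above -- link graph, fused items, knowledge base, IDF table,
# then document labeling. As above, the sling.task.workflow runner module is
# an assumption; the task graph's resource dependencies order the stages.
def run_entity_pipeline(language=None):
  import sling.task.workflow as workflow
  workflow.startup()

  e = EntityWorkflow("ner")
  e.extract_wikilinks()                  # link graph and fan-in counts
  e.fuse_items()                         # merge items with the link graph
  e.build_knowledge_base()               # kb.sling with pruned items
  e.build_idf(language)                  # idf.repo for the labeler dictionary
  e.label_documents(language=language)   # labeled document records
  workflow.run(e.wf)

  workflow.shutdown()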
class SilverWorkflow:
  def __init__(self, name=None, wf=None):
    if wf is None: wf = Workflow(name)
    self.wf = wf
    self.wiki = WikiWorkflow(wf=wf)

  def workdir(self, language=None):
    if language is None:
      return flags.arg.workdir + "/silver"
    else:
      return flags.arg.workdir + "/silver/" + language

  #---------------------------------------------------------------------------
  # IDF table
  #---------------------------------------------------------------------------

  def idftable(self, language=None):
    """Resource for IDF table."""
    if language is None: language = flags.arg.language
    return self.wf.resource("idf.repo",
                            dir=self.workdir(language),
                            format="repository")

  def build_idf(self, language=None):
    # Build IDF table from Wikipedia.
    if language is None: language = flags.arg.language
    documents = self.wiki.wikipedia_documents(language)
    with self.wf.namespace(language + "-idf"):
      # Collect words.
      wordcounts = self.wf.shuffle(
        self.wf.map(documents, "vocabulary-mapper",
                    format="message/count",
                    params={
                      "min_document_length": 200,
                      "only_lowercase": True
                    }))

      # Build IDF table.
      builder = self.wf.task("idf-table-builder", params={"threshold": 30})
      self.wf.connect(wordcounts, builder)
      builder.attach_output("repository", self.idftable(language))

  #---------------------------------------------------------------------------
  # Silver-labeled training and evaluation documents
  #---------------------------------------------------------------------------

  def training_documents(self, language=None):
    """Resource for silver-labeled training documents."""
    if language is None: language = flags.arg.language
    return self.wf.resource("*****@*****.**",
                            dir=self.workdir(language),
                            format="records/document")

  def evaluation_documents(self, language=None):
    """Resource for silver-labeled evaluation documents."""
    if language is None: language = flags.arg.language
    return self.wf.resource("eval.rec",
                            dir=self.workdir(language),
                            format="records/document")

  def silver_annotation(self, docs=None, language=None):
    """Run documents through the silver annotation pipeline and split the
    corpus into training and evaluation documents."""
    if language is None: language = flags.arg.language
    if docs is None: docs = self.wiki.wikipedia_documents(language)
    train_docs = self.training_documents(language)
    eval_docs = self.evaluation_documents(language)
    phrases = corpora.repository("data/wiki/" + language) + "/phrases.txt"

    # Corpus split ratio (presumably one in split_ratio documents goes to
    # evaluation, i.e. ~100 eval documents for a fixed silver_corpus_size).
    split_ratio = 5000
    if flags.arg.silver_corpus_size:
      split_ratio = int(flags.arg.silver_corpus_size / 100)

    with self.wf.namespace(language + "-silver"):
      # Map documents through the silver annotation pipeline and split corpus.
mapper = self.wf.task("corpus-split", "labeler") mapper.add_annotator("mentions") mapper.add_annotator("anaphora") #mapper.add_annotator("phrase-structure") mapper.add_annotator("relations") mapper.add_annotator("types") mapper.add_annotator("clear-references") mapper.add_param("resolve", True) mapper.add_param("language", language) mapper.add_param("initial_reference", False) mapper.add_param("definite_reference", False) mapper.add_param("split_ratio", split_ratio) mapper.attach_input("commons", self.wiki.knowledge_base()) mapper.attach_input("aliases", self.wiki.phrase_table(language)) mapper.attach_input("dictionary", self.idftable(language)) config = corpora.repository("data/wiki/" + language + "/silver.sling") if os.path.isfile(config): mapper.attach_input( "commons", self.wf.resource(config, format="store/frame")) reader_params = None if flags.arg.silver_corpus_size: reader_params = { "limit": int(flags.arg.silver_corpus_size / length_of(docs)) } self.wf.connect(self.wf.read(docs, params=reader_params), mapper, name="docs") train_channel = self.wf.channel(mapper, name="train", format="message/document") eval_channel = self.wf.channel(mapper, name="eval", format="message/document") # Write shuffled training documents. train_shards = length_of(train_docs) train_shuffled = self.wf.shuffle(train_channel, shards=train_shards, bufsize=256 * 1024 * 1024) self.wf.write(train_shuffled, train_docs, name="train") # Write evaluation documents. self.wf.write(eval_channel, eval_docs, name="eval") return train_docs, eval_docs #--------------------------------------------------------------------------- # Vocabulary #--------------------------------------------------------------------------- def vocabulary(self, language=None): """Resource for word vocabulary. This is a text map with (normalized) words and counts. """ if language == None: language = flags.arg.language return self.wf.resource("vocabulary.map", dir=self.workdir(language), format="textmap/word") def subwords(self, language=None): """Resource for subword vocabulary. This is a text map with (normalized) subwords and counts. """ if language == None: language = flags.arg.language return self.wf.resource("subwords.map", dir=self.workdir(language), format="textmap/subword") def extract_vocabulary(self, documents=None, output=None, language=None): if language == None: language = flags.arg.language if documents == None: documents = self.training_documents(language) if output == None: output = self.vocabulary(language) with self.wf.namespace(language + "-vocabulary"): # Extract words from documents. words = self.wf.shuffle( self.wf.map(documents, "word-vocabulary-mapper", format="message/word:count", params={ "normalization": "l", "skip_section_titles": True, })) # Build vocabulary from words in documents. 
      vocab = self.wf.reduce(words, output, "word-vocabulary-reducer")
      vocab.add_param("min_freq", 100)
      vocab.add_param("max_words", 100000)

      # Also produce subword vocabulary if requested.
      if flags.arg.subwords:
        vocab.add_param("max_subwords", 50000)
        subwords = self.wf.channel(vocab, name="subwords",
                                   format="message/word")
        self.wf.write(subwords, self.subwords(language))

      return output

  #---------------------------------------------------------------------------
  # Parser training
  #---------------------------------------------------------------------------

  def parser_model(self, arch, language=None):
    """Resource for parser model."""
    if language is None: language = flags.arg.language
    return self.wf.resource(arch + ".flow",
                            dir=self.workdir(language),
                            format="flow")

  def train_parser(self, language=None):
    """Train parser on silver-labeled documents."""
    if language is None: language = flags.arg.language
    with self.wf.namespace(language + "-parser"):
      # Parser trainer task.
      params = {
        "normalization": "l",

        "rnn_type": 1,
        "rnn_dim": 128,
        "rnn_highways": True,
        "rnn_layers": 1,
        "rnn_bidir": True,
        "dropout": 0.2,

        "skip_section_titles": True,
        "learning_rate": 0.5,
        "learning_rate_decay": 0.9,
        "clipping": 1,
        "local_clipping": True,
        "optimizer": "sgd",
        "batch_size": 16,
        "warmup": 20 * 60,
        "rampup": 5 * 60,
        "report_interval": 1000,
        "learning_rate_cliff": 90000,
        "epochs": 100000,
        "checkpoint_interval": 10000,
      }

      # Select word or subword encoder.
      if flags.arg.subwords:
        params["encoder"] = "subrnn"
        params["subword_dim"] = 64
      else:
        params["encoder"] = "lexrnn"
        params["word_dim"] = 64

      # Select decoder architecture.
      if flags.arg.decoder == "knolex":
        params["decoder"] = "knolex"
        params["link_dim_token"] = 64
        params["ff_l2reg"] = 0.0001
      elif flags.arg.decoder == "bio":
        params["decoder"] = "bio"
        params["ff_dims"] = [128]
      elif flags.arg.decoder == "crf":
        params["decoder"] = "bio"
        params["crf"] = True
        params["ff_dims"] = [128]
      elif flags.arg.decoder == "biaffine":
        params["decoder"] = "biaffine"
        params["ff_dims"] = [64]
        params["ff_dropout"] = 0.2
      else:
        params["decoder"] = flags.arg.decoder

      trainer = self.wf.task("parser-trainer", params=params)

      # Inputs.
      if flags.arg.simple_types:
        kb = self.wf.resource("data/dev/types.sling", format="store/frame")
      else:
        kb = self.wiki.knowledge_base()
      trainer.attach_input("commons", kb)
      trainer.attach_input("training_corpus",
                           self.training_documents(language))
      trainer.attach_input("evaluation_corpus",
                           self.evaluation_documents(language))
      trainer.attach_input("vocabulary", self.vocabulary(language))
      if flags.arg.subwords:
        trainer.attach_input("subwords", self.subwords(language))

      # Parser model.
      model = self.parser_model(flags.arg.decoder, language)
      trainer.attach_output("model", model)
      return model
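
# Usage sketch (assumption, not from the original file): end-to-end silver
# training for one language. The IDF table feeds the silver annotator, whose
# training split feeds vocabulary extraction and parser training. Running all
# stages in one task graph relies on the workflow's resource dependencies to
# order the tasks; the sling.task.workflow runner module is an assumption.
def run_silver_pipeline(language=None):
  import sling.task.workflow as workflow
  workflow.startup()

  s = SilverWorkflow("silver")
  s.build_idf(language)                      # idf.repo
  s.silver_annotation(language=language)     # train/eval document records
  s.extract_vocabulary(language=language)    # vocabulary.map (+ subwords.map)
  model = s.train_parser(language)           # <decoder>.flow parser model
  workflow.run(s.wf)

  workflow.shutdown()
  return model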