def silver_annotation(self, indocs=None, outdocs=None, language=None):
    """Build the silver annotation pipeline for Wikipedia documents.

    Documents are read from `indocs`, passed through a "labeler"
    document-processor task with the mentions, anaphora, phrase-structure
    and relations annotators, and written to `outdocs`.

    Args:
      indocs: input documents; defaults to
        self.wiki.wikipedia_documents(language).
      outdocs: output documents; defaults to
        self.silver_documents(language).
      language: language code; defaults to flags.arg.language.

    Returns:
      Whatever self.wf.write() returns for the annotated output.
    """
    # Resolve the language first so the default inputs and outputs below are
    # looked up for the effective language instead of for None.
    if language is None:
        language = flags.arg.language
    if indocs is None:
        indocs = self.wiki.wikipedia_documents(language)
    if outdocs is None:
        outdocs = self.silver_documents(language)

    phrases = corpora.repository("data/wiki/" + language) + "/phrases.txt"

    with self.wf.namespace(language + "-silver"):
        mapper = self.wf.task("document-processor", "labeler")
        for annotator in ("mentions", "anaphora", "phrase-structure",
                          "relations"):
            mapper.add_annotator(annotator)
        mapper.add_param("resolve", True)
        mapper.add_param("language", language)

        mapper.attach_input("commons", self.wiki.knowledge_base())
        mapper.attach_input("aliases", self.wiki.phrase_table(language))
        mapper.attach_input("dictionary", self.idftable(language))

        # The phrase list is optional; attach it only when the file exists.
        if os.path.isfile(phrases):
            phrase_resource = self.wf.resource(phrases, format="lex")
            mapper.attach_input("phrases", phrase_resource)

        self.wf.connect(self.wf.read(indocs), mapper)
        output = self.wf.channel(mapper, format="message/document")
        return self.wf.write(output, outdocs)
def template_defs(self, language=None):
    """Resource for template definitions.

    Args:
      language: language code selecting the data/wiki/<language> repository
        directory; defaults to flags.arg.language.

    Returns:
      The "templates.sling" resource in store/frame format.
    """
    if language is None:
        language = flags.arg.language
    return self.wf.resource(
        "templates.sling",
        dir=corpora.repository("data/wiki/" + language),
        format="store/frame",
    )
def alias_corrections(self):
    """Return the store/frame resource holding the alias corrections."""
    wiki_dir = corpora.repository("data/wiki")
    return self.wf.resource("aliases.sling", dir=wiki_dir, format="store/frame")
def wikipedia_defs(self):
    """Return the store/frame resource with the Wikipedia schema definitions."""
    wiki_dir = corpora.repository("data/wiki")
    return self.wf.resource(
        "wikipedia.sling",
        dir=wiki_dir,
        format="store/frame",
    )
def unit_defs(self):
    """Resource for unit definitions.

    Returns:
      The "units.sling" resource from the data/wiki repository directory in
      store/frame format.
    """
    return self.wf.resource(
        "units.sling",
        dir=corpora.repository("data/wiki"),
        format="store/frame",
    )
def country_defs(self):
    """Return the store/frame resource with the country definitions."""
    wiki_dir = corpora.repository("data/wiki")
    return self.wf.resource("countries.sling", dir=wiki_dir, format="store/frame")
def language_defs(self):
    """Return the store/frame resource with the language definitions.

    The resource defines the /lang/<lang> symbols and carries the meta
    information for each language.
    """
    wiki_dir = corpora.repository("data/wiki")
    return self.wf.resource("languages.sling", dir=wiki_dir, format="store/frame")
def silver_annotation(self, docs=None, language=None):
    """Build the silver annotation pipeline and split the corpus.

    Documents are run through a "labeler" corpus-split task that annotates
    them and emits them on a "train" and an "eval" channel. The training
    channel is shuffled before it is written; the evaluation channel is
    written as is.

    Args:
      docs: input documents; defaults to
        self.data.wikipedia_documents(language).
      language: language code; defaults to flags.arg.language.

    Returns:
      A (train_docs, eval_docs) tuple with the training and evaluation
      document resources the pipeline writes to.
    """
    if language is None:
        language = flags.arg.language
    if docs is None:
        docs = self.data.wikipedia_documents(language)
    train_docs = self.training_documents(language)
    eval_docs = self.evaluation_documents(language)

    # The split ratio is 5000 unless a silver corpus size was requested, in
    # which case it is a hundredth of that size.
    corpus_size = flags.arg.silver_corpus_size
    split_ratio = 5000
    if corpus_size:
        split_ratio = int(corpus_size / 100)

    with self.wf.namespace(language + "-silver"):
        # Map document through silver annotation pipeline and split corpus.
        mapper = self.wf.task("corpus-split", "labeler")
        # NOTE: the "phrase-structure" annotator is intentionally left out
        # here; it was disabled in the original pipeline.
        for annotator in ("mentions", "anaphora", "relations", "types",
                          "clear-references"):
            mapper.add_annotator(annotator)

        mapper.add_param("resolve", True)
        mapper.add_param("language", language)
        mapper.add_param("initial_reference", False)
        mapper.add_param("definite_reference", False)
        mapper.add_param("split_ratio", split_ratio)

        mapper.attach_input("commons", self.data.knowledge_base())
        mapper.attach_input("aliases", self.data.phrase_table(language))
        mapper.attach_input("dictionary", self.idftable(language))

        # An optional per-language silver configuration is attached as an
        # additional "commons" input when the file exists.
        config = corpora.repository("data/wiki/" + language + "/silver.sling")
        if os.path.isfile(config):
            config_resource = self.wf.resource(config, format="store/frame")
            mapper.attach_input("commons", config_resource)

        # When a corpus size is requested, cap the reader with a limit of
        # corpus size divided by length_of(docs).
        # NOTE(review): presumably a per-shard limit -- confirm.
        reader_params = None
        if corpus_size:
            reader_params = {"limit": int(corpus_size / length_of(docs))}
        reader = self.wf.read(docs, params=reader_params)
        self.wf.connect(reader, mapper, name="docs")

        train_channel = self.wf.channel(mapper, name="train",
                                        format="message/document")
        eval_channel = self.wf.channel(mapper, name="eval",
                                       format="message/document")

        # Write shuffled training documents.
        train_shards = length_of(train_docs)
        train_shuffled = self.wf.shuffle(train_channel,
                                         shards=train_shards,
                                         bufsize=256 * 1024 * 1024)
        self.wf.write(train_shuffled, train_docs, name="train")

        # Write evaluation documents.
        self.wf.write(eval_channel, eval_docs, name="eval")

    return train_docs, eval_docs
def custom_properties(self):
    """Return the store/frame resource with custom SLING knowledge base properties."""
    schema_dir = corpora.repository("data/nlp/schemas")
    return self.wf.resource(
        "custom-properties.sling",
        dir=schema_dir,
        format="store/frame",
    )
def dataset(self, path):
    """Return a file resource for a dataset path.

    Paths starting with "repo/" are passed to corpora.repository() with that
    prefix stripped; every other path is looked up with
    dir=flags.arg.workdir.
    """
    repo_prefix = "repo/"
    if not path.startswith(repo_prefix):
        return self.wf.resource(path, dir=flags.arg.workdir, format="file")
    repo_path = corpora.repository(path[len(repo_prefix):])
    return self.wf.resource(repo_path, format="file")
def recon_config(self):
    """Return the store/frame resource with the reconciler configuration."""
    wiki_dir = corpora.repository("data/wiki")
    return self.wf.resource("recon.sling", dir=wiki_dir, format="store/frame")
def xref_config(self):
    """Return the store/frame resource with the cross-references configuration."""
    wiki_dir = corpora.repository("data/wiki")
    return self.wf.resource("xrefs.sling", dir=wiki_dir, format="store/frame")
def document_schema_defs(self):
    """Return the store/frame resource with the document schema definitions."""
    schema_dir = corpora.repository("data/nlp/schemas")
    return self.wf.resource(
        "document-schema.sling",
        dir=schema_dir,
        format="store/frame",
    )
def catalog_defs(self):
    """Return the store/frame resource with the global catalog definitions."""
    schema_dir = corpora.repository("data/nlp/schemas")
    return self.wf.resource("catalog.sling", dir=schema_dir, format="store/frame")
def xref_properties(self):
    """Return the store/frame resource with properties tracked for cross-references."""
    wiki_dir = corpora.repository("data/wiki")
    return self.wf.resource(
        "xrefs.sling",
        dir=wiki_dir,
        format="store/frame",
    )