Example #1
class EmbeddingWorkflow:
  """Workflow tasks for training word, fact, and category embeddings."""

  def __init__(self, name=None, wf=None):
    if wf is None: wf = Workflow(name)
    self.wf = wf
    self.wiki = WikiWorkflow(wf=wf)

  #---------------------------------------------------------------------------
  # Word embeddings
  #---------------------------------------------------------------------------

  def vocabulary(self, language=None):
    """Resource for word embedding vocabulary. This is a text map with
    (normalized) words and counts.
    """
    if language is None: language = flags.arg.language
    return self.wf.resource("word-vocabulary.map",
                            dir=corpora.wikidir(language),
                            format="textmap/word")

  def word_embeddings(self, language=None):
    """Resource for word embeddings in word2vec embedding format."""
    if language is None: language = flags.arg.language
    return self.wf.resource("word-embeddings.vec",
                            dir=corpora.wikidir(language),
                            format="embeddings")

  def extract_vocabulary(self, documents=None, output=None, language=None):
    """Extract word embedding vocabulary from documents."""
    if language is None: language = flags.arg.language
    if documents is None: documents = self.wiki.wikipedia_documents(language)
    if output is None: output = self.vocabulary(language)

    with self.wf.namespace(language + "-vocabulary"):
      return self.wf.mapreduce(documents, output,
                               format="message/word:count",
                               mapper="word-embeddings-vocabulary-mapper",
                               reducer="word-embeddings-vocabulary-reducer",
                               params={"normalization": "dlw"})

  def train_word_embeddings(self, documents=None, vocabulary=None, output=None,
                            language=None):
    """Train word embeddings."""
    if language is None: language = flags.arg.language
    if documents is None: documents = self.wiki.wikipedia_documents(language)
    if vocabulary is None: vocabulary = self.vocabulary(language)
    if output is None: output = self.word_embeddings(language)

    with self.wf.namespace(language + "-word-embeddings"):
      trainer = self.wf.task("word-embeddings-trainer")
      trainer.add_params({
        "iterations" : 5,
        "negative": 5,
        "window": 5,
        "learning_rate": 0.025,
        "min_learning_rate": 0.0001,
        "embedding_dims": 32,
        "subsampling": 1e-3,
        "normalization": "dlw",
      })
      trainer.attach_input("documents", documents)
      trainer.attach_input("vocabulary", vocabulary)
      trainer.attach_output("output", output)
      return output

  #---------------------------------------------------------------------------
  # Fact and category embeddings
  #---------------------------------------------------------------------------

  def fact_dir(self):
    return flags.arg.workdir + "/fact"

  def fact_lexicon(self):
    """Resource for fact vocabulary (text map with fact paths and counts."""
    return self.wf.resource("facts.map",
                            dir=self.fact_dir(),
                            format="textmap/fact")
  def category_lexicon(self):
    """Resource for category vocabulary (text map with categories and counts."""
    return self.wf.resource("categories.map",
                            dir=self.fact_dir(),
                            format="textmap/category")

  def facts(self):
    """Resource for resolved facts."""
    return self.wf.resource("facts.rec",
                            dir=self.fact_dir(),
                            format="records/fact")

  def fact_embeddings(self):
    """Resource for fact embeddings in word2vec embedding format."""
    return self.wf.resource("fact-embeddings.vec",
                            dir=self.fact_dir(),
                            format="embeddings")

  def category_embeddings(self):
    """Resource for category embeddings in word2vec embedding format."""
    return self.wf.resource("category-embeddings.vec",
                            dir=self.fact_dir(),
                            format="embeddings")

  def extract_fact_lexicon(self):
    """Build fact and category lexicons."""
    kb = self.wiki.knowledge_base()
    factmap = self.fact_lexicon()
    catmap = self.category_lexicon()
    with self.wf.namespace("fact-embeddings"):
      trainer = self.wf.task("fact-lexicon-extractor")
      trainer.attach_input("kb", kb)
      trainer.attach_output("factmap", factmap)
      trainer.attach_output("catmap", catmap)
      return factmap, catmap

  def extract_facts(self):
    """Extract facts for items in the knowledge base."""
    kb = self.wiki.knowledge_base()
    factmap = self.fact_lexicon()
    catmap = self.category_lexicon()
    output = self.facts()
    with self.wf.namespace("fact-embeddings"):
      extractor = self.wf.task("fact-extractor")
      extractor.attach_input("kb", kb)
      extractor.attach_input("factmap", factmap)
      extractor.attach_input("catmap", catmap)
      facts = self.wf.channel(extractor, format="message/frame")
      return self.wf.write(facts, output, name="fact-writer")

  def train_fact_embeddings(self):
    """Train fact and category embeddings."""
    facts = self.facts()
    factmap = self.fact_lexicon()
    catmap = self.category_lexicon()
    fact_embeddings = self.fact_embeddings()
    category_embeddings = self.category_embeddings()
    with self.wf.namespace("fact-embeddings"):
      trainer = self.wf.task("fact-embeddings-trainer")
      trainer.add_params({
        "batch_size": 256,
        "batches_per_update": 32,
        "embedding_dims": 256,
        "normalize": False,
        "epochs" : 100000,
        "report_interval": 250,
        "learning_rate": 1.0,
        "learning_rate_decay": 0.95,
        "rampup": 120,
        "clipping": 1,
        "optimizer": "sgd",
      })
      self.wf.connect(self.wf.read(facts, name="fact-reader"), trainer)
      trainer.attach_input("factmap", factmap)
      trainer.attach_input("catmap", catmap)
      trainer.attach_output("factvecs", fact_embeddings)
      trainer.attach_output("catvecs", category_embeddings)
    return fact_embeddings, category_embeddings

  def fact_plausibility_model(self):
    """Resource for fact plausibility model."""
    return self.wf.resource("plausibility.flow",
                            dir=self.fact_dir(),
                            format="flow")

  def train_fact_plausibility(self):
    """Train fact plausibility model."""
    facts = self.facts()
    factmap = self.fact_lexicon()
    model = self.fact_plausibility_model()
    with self.wf.namespace("fact-plausibility"):
      trainer = self.wf.task("fact-plausibility-trainer")
      trainer.add_params({
        "batch_size": 4,
        "batches_per_update": 256,
        "min_facts": 4,
        "embedding_dims": 128,
        "epochs" : 250000,
        "report_interval": 1000,
        "checkpoint_interval": 50000,
        "learning_rate": 1.0,
        "min_learning_rate": 0.001,
        "learning_rate_decay": 0.95,
        "clipping": 1,
        "optimizer": "sgd",
        "rampup": 5 * 60,
      })
      self.wf.connect(self.wf.read(facts, name="fact-reader"), trainer)
      trainer.attach_input("factmap", factmap)
      trainer.attach_output("model", model)
    return model
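
The `textmap/word` vocabulary produced by `extract_vocabulary` above is conceptually a table of normalized words and their counts. For orientation, here is a minimal self-contained sketch of that idea in plain Python; lowercasing stands in for the `"dlw"` normalization used above, and the tab-separated output mimics a text map:

from collections import Counter

def build_word_vocabulary(documents, min_freq=1):
    """Count normalized words across documents, most frequent first."""
    counts = Counter()
    for text in documents:
        for word in text.split():
            counts[word.lower()] += 1  # lowercasing stands in for "dlw"
    return [(word, n) for word, n in counts.most_common() if n >= min_freq]

for word, n in build_word_vocabulary(["the cat sat", "The cat ran"]):
    print(word, n, sep="\t")  # one word<TAB>count line per vocabulary entry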
Example #2
class EntityWorkflow:
    """Workflow tasks for building the NER knowledge base and labeling documents."""

    def __init__(self, name=None, wf=None):
        if wf is None: wf = Workflow(name)
        self.wf = wf
        self.wiki = WikiWorkflow(wf=wf)

    def workdir(self, language=None):
        if language is None:
            return flags.arg.workdir + "/ner"
        else:
            return flags.arg.workdir + "/ner/" + language

    #---------------------------------------------------------------------------
    # Wikipedia link graph
    #---------------------------------------------------------------------------

    def wikilinks(self):
        """Resource for wikipedia link graph."""
        return self.wf.resource("*****@*****.**",
                                dir=self.workdir(),
                                format="records/frame")

    def fanin(self):
        """Resource for wikipedia link fan-in."""
        return self.wf.resource("fanin.rec",
                                dir=self.workdir(),
                                format="records/frame")

    def extract_wikilinks(self):
        """Extract the Wikipedia link graph and link fan-in."""
        # Build link graph over all Wikipedias.
        documents = []
        for l in flags.arg.languages:
            documents.extend(self.wiki.wikipedia_documents(l))

        # Extract links from documents.
        mapper = self.wf.task("wikipedia-link-extractor")
        self.wf.connect(self.wf.read(documents), mapper)
        links = self.wf.channel(mapper, format="message/frame", name="output")
        counts = self.wf.channel(mapper, format="message/int", name="fanin")

        # Reduce output links.
        wikilinks = self.wikilinks()
        self.wf.reduce(self.wf.shuffle(links, shards=length_of(wikilinks)),
                       wikilinks, "wikipedia-link-merger")

        # Reduce fan-in.
        fanin = self.fanin()
        self.wf.reduce(self.wf.shuffle(counts, shards=length_of(fanin)), fanin,
                       "item-popularity-reducer")

        return wikilinks, fanin

    #---------------------------------------------------------------------------
    # IDF table
    #---------------------------------------------------------------------------

    def idftable(self, language=None):
        """Resource for IDF table."""
        if language is None: language = flags.arg.language
        return self.wf.resource("idf.repo",
                                dir=self.workdir(language),
                                format="repository")

    def build_idf(self, language=None):
        """Build IDF table from Wikipedia documents."""
        if language is None: language = flags.arg.language
        documents = self.wiki.wikipedia_documents(language)

        with self.wf.namespace(language + "-idf"):
            # Collect words.
            wordcounts = self.wf.shuffle(
                self.wf.map(documents,
                            "vocabulary-mapper",
                            format="message/count",
                            params={
                                "min_document_length": 200,
                                "only_lowercase": True
                            }))

            # Build IDF table.
            builder = self.wf.task("idf-table-builder",
                                   params={"threshold": 30})
            self.wf.connect(wordcounts, builder)
            builder.attach_output("repository", self.idftable(language))

    #---------------------------------------------------------------------------
    # Fused items
    #---------------------------------------------------------------------------

    def fused_items(self):
        """Resource for merged items for NER."""
        return self.wf.resource("*****@*****.**",
                                dir=self.workdir(),
                                format="records/frame")

    def fuse_items(self):
        """Fuse items including the link graph."""
        return self.wiki.fuse_items(extras=self.wikilinks() + [self.fanin()],
                                    output=self.fused_items())

    #---------------------------------------------------------------------------
    # Knowledge base
    #---------------------------------------------------------------------------

    def knowledge_base(self):
        """Resource for NER knowledge base."""
        return self.wf.resource("kb.sling",
                                dir=self.workdir(),
                                format="store/frame")

    def build_knowledge_base(self):
        """Build knowledge base for NER."""
        items = self.fused_items()
        properties = self.wiki.wikidata_properties()
        schemas = self.wiki.schema_defs()

        with self.wf.namespace("ner-kb"):
            # Prune information from Wikidata items.
            pruned_items = self.wf.map(items,
                                       "wikidata-pruner",
                                       params={
                                           "prune_aliases": True,
                                           "prune_wiki_links": False,
                                           "prune_category_members": True
                                       })

            # Collect property catalog.
            property_catalog = self.wf.map(properties,
                                           "wikidata-property-collector")

            # Collect frames into knowledge base store.
            parts = self.wf.collect(pruned_items, property_catalog, schemas)
            return self.wf.write(parts,
                                 self.knowledge_base(),
                                 params={"snapshot": True})

    #---------------------------------------------------------------------------
    # Labeled documents
    #---------------------------------------------------------------------------

    def labeled_documents(self, language=None):
        """Resource for labeled documents."""
        if language is None: language = flags.arg.language
        return self.wf.resource("*****@*****.**",
                                dir=self.workdir(language),
                                format="records/document")

    def label_documents(self, indocs=None, outdocs=None, language=None):
        """Label documents using the NER knowledge base."""
        if language is None: language = flags.arg.language
        if indocs is None: indocs = self.wiki.wikipedia_documents(language)
        if outdocs is None: outdocs = self.labeled_documents(language)

        with self.wf.namespace(language + "-ner"):
            mapper = self.wf.task("document-ner-labeler", "labeler")
            mapper.add_param("resolve", True)
            mapper.attach_input("commons", self.knowledge_base())
            mapper.attach_input("aliases", self.wiki.phrase_table(language))
            mapper.attach_input("dictionary", self.idftable(language))

            self.wf.connect(self.wf.read(indocs), mapper)
            output = self.wf.channel(mapper, format="message/document")
            return self.wf.write(output, outdocs)
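
`build_idf` above delegates the actual computation to the `idf-table-builder` task. The quantity involved is standard inverse document frequency; here is a self-contained sketch of it (the real task's exact weighting, threshold semantics, and repository format are not visible in these examples):

import math
from collections import Counter

def idf_table(documents, threshold=2):
    """Map each word seen in at least `threshold` documents to its IDF."""
    df = Counter()
    for text in documents:
        for word in set(text.lower().split()):
            df[word] += 1  # document frequency: at most one count per doc
    return {word: math.log(len(documents) / n)
            for word, n in df.items() if n >= threshold}

docs = ["the cat sat", "the dog ran", "a cat ran"]
print(idf_table(docs))  # words appearing in more documents score lower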
Example #3
class SilverWorkflow:
    """Workflow tasks for producing silver-labeled documents."""

    def __init__(self, name=None, wf=None):
        if wf is None: wf = Workflow(name)
        self.wf = wf
        self.wiki = WikiWorkflow(wf=wf)

    def workdir(self, language=None):
        if language is None:
            return flags.arg.workdir + "/silver"
        else:
            return flags.arg.workdir + "/silver/" + language

    #---------------------------------------------------------------------------
    # IDF table
    #---------------------------------------------------------------------------

    def idftable(self, language=None):
        """Resource for IDF table."""
        if language is None: language = flags.arg.language
        return self.wf.resource("idf.repo",
                                dir=self.workdir(language),
                                format="repository")

    def build_idf(self, language=None):
        """Build IDF table from Wikipedia documents."""
        if language is None: language = flags.arg.language
        documents = self.wiki.wikipedia_documents(language)

        with self.wf.namespace(language + "-idf"):
            # Collect words.
            wordcounts = self.wf.shuffle(
                self.wf.map(documents,
                            "vocabulary-mapper",
                            format="message/count",
                            params={
                                "min_document_length": 200,
                                "only_lowercase": True
                            }))

            # Build IDF table.
            builder = self.wf.task("idf-table-builder",
                                   params={"threshold": 30})
            self.wf.connect(wordcounts, builder)
            builder.attach_output("repository", self.idftable(language))

    #---------------------------------------------------------------------------
    # Silver-labeled documents
    #---------------------------------------------------------------------------

    def silver_documents(self, language=None):
        """Resource for silver-labeled documents."""
        if language is None: language = flags.arg.language
        return self.wf.resource("*****@*****.**",
                                dir=self.workdir(language),
                                format="records/document")

    def silver_annotation(self, indocs=None, outdocs=None, language=None):
        """Run the silver annotation pipeline over documents."""
        if language is None: language = flags.arg.language
        if indocs is None: indocs = self.wiki.wikipedia_documents(language)
        if outdocs is None: outdocs = self.silver_documents(language)
        phrases = corpora.repository("data/wiki/" + language) + "/phrases.txt"

        with self.wf.namespace(language + "-silver"):
            mapper = self.wf.task("document-processor", "labeler")

            mapper.add_annotator("mentions")
            mapper.add_annotator("anaphora")
            mapper.add_annotator("phrase-structure")
            mapper.add_annotator("relations")

            mapper.add_param("resolve", True)
            mapper.add_param("language", language)
            mapper.attach_input("commons", self.wiki.knowledge_base())
            mapper.attach_input("aliases", self.wiki.phrase_table(language))
            mapper.attach_input("dictionary", self.idftable(language))
            if os.path.isfile(phrases):
                mapper.attach_input("phrases",
                                    self.wf.resource(phrases, format="lex"))

            self.wf.connect(self.wf.read(indocs), mapper)
            output = self.wf.channel(mapper, format="message/document")
            return self.wf.write(output, outdocs)
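
These classes only declare tasks, channels, and resources on the underlying `Workflow`; executing the assembled graph is left to a separate runner. A hypothetical driver sketch, where `parse_flags` and `run_workflow` are placeholders for whatever flag parsing and execution entry points the surrounding system provides:

def annotate_silver(language):
    """Assemble IDF building and silver annotation for one language."""
    silver = SilverWorkflow("silver-" + language)
    silver.build_idf(language=language)
    silver.silver_annotation(language=language)
    return silver.wf  # the populated Workflow, ready for execution

# parse_flags()                        # placeholder: set up flags.arg.*
# run_workflow(annotate_silver("en"))  # placeholder runner entry point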
Example #4
class SilverWorkflow:
    """Workflow tasks for silver-labeling documents and training a parser."""

    def __init__(self, name=None, wf=None):
        if wf is None: wf = Workflow(name)
        self.wf = wf
        self.wiki = WikiWorkflow(wf=wf)

    def workdir(self, language=None):
        if language is None:
            return flags.arg.workdir + "/silver"
        else:
            return flags.arg.workdir + "/silver/" + language

    #---------------------------------------------------------------------------
    # IDF table
    #---------------------------------------------------------------------------

    def idftable(self, language=None):
        """Resource for IDF table."""
        if language is None: language = flags.arg.language
        return self.wf.resource("idf.repo",
                                dir=self.workdir(language),
                                format="repository")

    def build_idf(self, language=None):
        """Build IDF table from Wikipedia documents."""
        if language is None: language = flags.arg.language
        documents = self.wiki.wikipedia_documents(language)

        with self.wf.namespace(language + "-idf"):
            # Collect words.
            wordcounts = self.wf.shuffle(
                self.wf.map(documents,
                            "vocabulary-mapper",
                            format="message/count",
                            params={
                                "min_document_length": 200,
                                "only_lowercase": True
                            }))

            # Build IDF table.
            builder = self.wf.task("idf-table-builder",
                                   params={"threshold": 30})
            self.wf.connect(wordcounts, builder)
            builder.attach_output("repository", self.idftable(language))

    #---------------------------------------------------------------------------
    # Silver-labeled training and evaluation documents
    #---------------------------------------------------------------------------

    def training_documents(self, language=None):
        """Resource for silver-labeled training documents."""
        if language is None: language = flags.arg.language
        return self.wf.resource("*****@*****.**",
                                dir=self.workdir(language),
                                format="records/document")

    def evaluation_documents(self, language=None):
        """Resource for silver-labeled evaluation documents."""
        if language is None: language = flags.arg.language
        return self.wf.resource("eval.rec",
                                dir=self.workdir(language),
                                format="records/document")

    def silver_annotation(self, docs=None, language=None):
        """Silver-annotate documents and split them into train/eval corpora."""
        if language is None: language = flags.arg.language
        if docs is None: docs = self.wiki.wikipedia_documents(language)
        train_docs = self.training_documents(language)
        eval_docs = self.evaluation_documents(language)
        phrases = corpora.repository("data/wiki/" + language) + "/phrases.txt"

        split_ratio = 5000
        if flags.arg.silver_corpus_size:
            split_ratio = int(flags.arg.silver_corpus_size / 100)

        with self.wf.namespace(language + "-silver"):
            # Map document through silver annotation pipeline and split corpus.
            mapper = self.wf.task("corpus-split", "labeler")

            mapper.add_annotator("mentions")
            mapper.add_annotator("anaphora")
            #mapper.add_annotator("phrase-structure")
            mapper.add_annotator("relations")
            mapper.add_annotator("types")
            mapper.add_annotator("clear-references")

            mapper.add_param("resolve", True)
            mapper.add_param("language", language)
            mapper.add_param("initial_reference", False)
            mapper.add_param("definite_reference", False)
            mapper.add_param("split_ratio", split_ratio)

            mapper.attach_input("commons", self.wiki.knowledge_base())
            mapper.attach_input("aliases", self.wiki.phrase_table(language))
            mapper.attach_input("dictionary", self.idftable(language))

            config = corpora.repository("data/wiki/" + language +
                                        "/silver.sling")
            if os.path.isfile(config):
                mapper.attach_input(
                    "commons", self.wf.resource(config, format="store/frame"))

            reader_params = None
            if flags.arg.silver_corpus_size:
                reader_params = {
                    "limit":
                    int(flags.arg.silver_corpus_size / length_of(docs))
                }

            self.wf.connect(self.wf.read(docs, params=reader_params),
                            mapper,
                            name="docs")

            train_channel = self.wf.channel(mapper,
                                            name="train",
                                            format="message/document")
            eval_channel = self.wf.channel(mapper,
                                           name="eval",
                                           format="message/document")

            # Write shuffled training documents.
            train_shards = length_of(train_docs)
            train_shuffled = self.wf.shuffle(train_channel,
                                             shards=train_shards,
                                             bufsize=256 * 1024 * 1024)
            self.wf.write(train_shuffled, train_docs, name="train")

            # Write evaluation documents.
            self.wf.write(eval_channel, eval_docs, name="eval")

        return train_docs, eval_docs

    #---------------------------------------------------------------------------
    # Vocabulary
    #---------------------------------------------------------------------------

    def vocabulary(self, language=None):
        """Resource for word vocabulary. This is a text map with (normalized)
        words and counts.
        """
        if language is None: language = flags.arg.language
        return self.wf.resource("vocabulary.map",
                                dir=self.workdir(language),
                                format="textmap/word")

    def subwords(self, language=None):
        """Resource for subword vocabulary. This is a text map with
        (normalized) subwords and counts.
        """
        if language is None: language = flags.arg.language
        return self.wf.resource("subwords.map",
                                dir=self.workdir(language),
                                format="textmap/subword")

    def extract_vocabulary(self, documents=None, output=None, language=None):
        """Extract word (and optionally subword) vocabulary from documents."""
        if language is None: language = flags.arg.language
        if documents is None: documents = self.training_documents(language)
        if output is None: output = self.vocabulary(language)

        with self.wf.namespace(language + "-vocabulary"):
            # Extract words from documents.
            words = self.wf.shuffle(
                self.wf.map(documents,
                            "word-vocabulary-mapper",
                            format="message/word:count",
                            params={
                                "normalization": "l",
                                "skip_section_titles": True,
                            }))

            # Build vocabulary from words in documents.
            vocab = self.wf.reduce(words, output, "word-vocabulary-reducer")
            vocab.add_param("min_freq", 100)
            vocab.add_param("max_words", 100000)

            # Also produce subword vocabulary if requested.
            if flags.arg.subwords:
                vocab.add_param("max_subwords", 50000)
                subwords = self.wf.channel(vocab,
                                           name="subwords",
                                           format="message/word")
                self.wf.write(subwords, self.subwords(language))

        return output

    #---------------------------------------------------------------------------
    # Parser training
    #---------------------------------------------------------------------------

    def parser_model(self, arch, language=None):
        """Resource for parser model."""
        if language is None: language = flags.arg.language
        return self.wf.resource(arch + ".flow",
                                dir=self.workdir(language),
                                format="flow")

    def train_parser(self, language=None):
        """Train a parser model on the silver-labeled corpus."""
        if language is None: language = flags.arg.language
        with self.wf.namespace(language + "-parser"):
            # Parser trainer task.
            params = {
                "normalization": "l",
                "rnn_type": 1,
                "rnn_dim": 128,
                "rnn_highways": True,
                "rnn_layers": 1,
                "rnn_bidir": True,
                "dropout": 0.2,
                "skip_section_titles": True,
                "learning_rate": 0.5,
                "learning_rate_decay": 0.9,
                "clipping": 1,
                "local_clipping": True,
                "optimizer": "sgd",
                "batch_size": 16,
                "warmup": 20 * 60,
                "rampup": 5 * 60,
                "report_interval": 1000,
                "learning_rate_cliff": 90000,
                "epochs": 100000,
                "checkpoint_interval": 10000,
            }

            if flags.arg.subwords:
                params["encoder"] = "subrnn"
                params["subword_dim"] = 64
            else:
                params["encoder"] = "lexrnn"
                params["word_dim"] = 64

            if flags.arg.decoder == "knolex":
                params["decoder"] = "knolex"
                params["link_dim_token"] = 64
                params["ff_l2reg"] = 0.0001
            elif flags.arg.decoder == "bio":
                params["decoder"] = "bio"
                params["ff_dims"] = [128]
            elif flags.arg.decoder == "crf":
                params["decoder"] = "bio"
                params["crf"] = True
                params["ff_dims"] = [128]
            elif flags.arg.decoder == "biaffine":
                params["decoder"] = "biaffine"
                params["ff_dims"] = [64]
                params["ff_dropout"] = 0.2
            else:
                params["decoder"] = flags.arg.decoder

            trainer = self.wf.task("parser-trainer", params=params)

            # Inputs.
            if flags.arg.simple_types:
                kb = self.wf.resource("data/dev/types.sling",
                                      format="store/frame")
            else:
                kb = self.wiki.knowledge_base()

            trainer.attach_input("commons", kb)
            trainer.attach_input("training_corpus",
                                 self.training_documents(language))
            trainer.attach_input("evaluation_corpus",
                                 self.evaluation_documents(language))
            trainer.attach_input("vocabulary", self.vocabulary(language))
            if flags.arg.subwords:
                trainer.attach_input("subwords", self.subwords(language))

            # Parser model.
            model = self.parser_model(flags.arg.decoder, language)
            trainer.attach_output("model", model)

        return model
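
The sizing logic in `silver_annotation` above couples three numbers: the optional `silver_corpus_size` flag, a per-shard reader `limit`, and the `split_ratio` handed to the `corpus-split` task. A small sketch of just that arithmetic; the interpretation of `split_ratio` as "one in every N documents goes to evaluation" is an assumption based on the name:

def plan_split(silver_corpus_size, num_input_shards, default_ratio=5000):
    """Mirror the corpus sizing arithmetic in silver_annotation."""
    split_ratio = default_ratio
    limit = None  # no per-shard reader cap unless a corpus size is given
    if silver_corpus_size:
        split_ratio = int(silver_corpus_size / 100)
        limit = int(silver_corpus_size / num_input_shards)
    return split_ratio, limit

# Assumed example: a 1M-document corpus read from 10 input shards caps
# each reader at 100000 documents and uses a split ratio of 10000.
print(plan_split(1000000, 10))  # -> (10000, 100000)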