Exemple #1
0
 def knowledge_base(self):
     """Return the knowledge base resource.

     The resource is the "kb.sling" SLING frame store in the wiki directory.
     It holds frames for each Wikidata item and property plus additional
     schema information.
     """
     wiki_dir = corpora.wikidir()
     return self.wf.resource("kb.sling", dir=wiki_dir, format="store/frame")
Exemple #2
0
 def wikipedia_items(self):
     """Return the resource with item data from Wikipedia.

     The record file merges the item categories from all Wikipedias.
     """
     wiki_dir = corpora.wikidir()
     return self.wf.resource("wikipedia-items.rec",
                             dir=wiki_dir,
                             format="records/frame")
Exemple #3
0
 def fused_items(self):
     """Return the resource for merged items.

     This is a set of record files where each item is represented as a
     frame.
     """
     wiki_dir = corpora.wikidir()
     # NOTE(review): the file name literal looks masked in this copy; it is
     # kept verbatim -- confirm the real file pattern.
     return self.wf.resource("*****@*****.**",
                             dir=wiki_dir,
                             format="records/frame")
Exemple #4
0
 def kb_input(self, task, kb_dir=None):
     """Attach the knowledge base store to a task as its "kb" input.

     Args:
       task: task object that receives the input via attach_input().
       kb_dir: directory holding "kb.sling"; when None the directory returned
         by corpora.wikidir() is used.
     """
     directory = corpora.wikidir() if kb_dir is None else kb_dir
     store = self.wf.resource(file="kb.sling",
                              dir=directory,
                              format="store/frame")
     task.attach_input("kb", store)
Exemple #5
0
    def wikidata_items(self):
        """Return the resource for Wikidata items.

        This is a set of record files where each Wikidata item is represented
        as a frame:

          <qid>: {
            =<qid>
            :/w/item
            name: "..."
            description: "..."
            alias: {
              name: "..."
              lang: /lang/<lang>
              sources: ...
            }
            ...
            /w/wikipedia: {
              /lang/<lang>: <wid>
              ...
            }
            ... properties
          }

        where
          <qid>: Wikidata item id (Q<item number>, e.g. Q35)
          <pid>: Wikidata property id (P<property number>, e.g. P31)
          <wid>: Wikipedia page id (/wp/<lang>/<pageid>, /wp/en/76972)
        """
        wiki_dir = corpora.wikidir()
        # NOTE(review): the file name literal looks masked in this copy; it is
        # kept verbatim -- confirm the real file pattern.
        return self.wf.resource("*****@*****.**",
                                dir=wiki_dir,
                                format="records/frame")
Exemple #6
0
    def generate_parses(self, language, min_members):
        """Set up the category parse generator task and return its output.

        Inside the "generate-parses" namespace a "category-parse-generator"
        task is created, parameterized with the language and the minimum
        member count, and wired to its inputs (knowledge base, items, and the
        phrase table for the language) and outputs (generated parses and
        rejected categories).

        Args:
          language: language code; also selects the phrase table directory.
          min_members: value passed to the task as "min_members".

        Returns:
          The resource attached as the task's "output".
        """
        with self.wf.namespace("generate-parses"):
            task = self.wf.task("category-parse-generator")
            params = {"language": language, "min_members": min_members}
            task.add_params(params)

            # Inputs: knowledge base and item records from the wiki directory.
            wiki_dir = corpora.wikidir()
            self.kb_input(task, kb_dir=wiki_dir)
            # NOTE(review): the file name literal looks masked in this copy;
            # it is kept verbatim -- confirm the real file pattern.
            item_records = self.wf.resource(file="*****@*****.**",
                                            dir=wiki_dir,
                                            format="records/frame")
            task.attach_input("items", item_records)

            # Input: phrase table from the language sub-directory.
            phrases = self.wf.resource("phrase-table.repo",
                                       dir=wiki_dir + "/" + language,
                                       format="text/frame")
            task.attach_input("phrase-table", phrases)

            # Outputs: generated parses plus the categories that were rejected.
            parses = self.generated_parses_resource()
            task.attach_output("output", parses)
            rejects = self.wf.resource("rejected-categories.rec",
                                       dir=self.outdir,
                                       format="records/text")
            task.attach_output("rejected", rejects)
            return parses
Exemple #7
0
 def name_table(self, language=None):
     """Return the resource for the item name table.

     This is a repository with all the names and the items they are aliases
     for.

     Args:
       language: language code selecting the wiki directory; defaults to
         flags.arg.language when None.
     """
     # Identity comparison is the correct test for the None singleton.
     if language is None:
         language = flags.arg.language
     return self.wf.resource("name-table.repo",
                             dir=corpora.wikidir(language),
                             format="repository")
Exemple #8
0
 def phrase_table(self, language=None):
     """Return the resource for the item name phrase table.

     This is a repository with phrase fingerprints of the item names.

     Args:
       language: language code selecting the wiki directory; defaults to
         flags.arg.language when None.
     """
     # Identity comparison is the correct test for the None singleton.
     if language is None:
         language = flags.arg.language
     return self.wf.resource("phrase-table.repo",
                             dir=corpora.wikidir(language),
                             format="repository")
Exemple #9
0
 def wikipedia_category_documents(self, language=None):
     """Return the resource for parsed Wikipedia category documents.

     Args:
       language: language code selecting the wiki directory; defaults to
         flags.arg.language when None.
     """
     # Identity comparison is the correct test for the None singleton.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the file name literal looks masked in this copy; it is
     # kept verbatim -- confirm the real file pattern.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/document")
Exemple #10
0
 def wikipedia_documents(self, language=None):
     """Return the resource for parsed Wikipedia documents.

     This is a set of record files with one record per article, where the
     text has been extracted from the wiki markup and tokenized. The documents
     also contain additional structured information (e.g. categories) and
     mentions for links to other Wikipedia pages:

       <wid>: {
         =<wid>
         :/wp/page
         /wp/page/pageid: ...
         /wp/page/title: "..."
         lang: /lang/<lang>
         /wp/page/text: "<Wikipedia page in wiki markup format>"
         /wp/page/qid: <qid>
         :document
         url: "http://<lang>.wikipedia.org/wiki/<name>"
         title: "..."
         text: "<clear text extracted from wiki markup>"
         tokens: [...]
         mention: {
           :/wp/link
           begin: ...
           length: ...
           evokes: <qid>
         }
         ...
         /wp/page/category: <qid>
         ...
       }

     Args:
       language: language code selecting the wiki directory; defaults to
         flags.arg.language when None.
     """
     # Identity comparison is the correct test for the None singleton.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the file name literal looks masked in this copy; it is
     # kept verbatim -- confirm the real file pattern.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/document")
Exemple #11
0
 def wikipedia_categories(self, language=None):
     """Return the resource for Wikipedia categories.

     This is a set of record files registered with the "records/frame"
     format. Presumably each record is one Wikipedia category page encoded as
     a SLING frame -- TODO confirm the exact record contents.

     Args:
       language: language code selecting the wiki directory; defaults to
         flags.arg.language when None.
     """
     # Identity comparison is the correct test for the None singleton.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the file name literal looks masked in this copy; it is
     # kept verbatim -- confirm the real file pattern.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/frame")
Exemple #12
0
 def vocabulary(self, language=None):
   """Return the resource for the word embedding vocabulary.

   This is a text map with (normalized) words and counts.

   Args:
     language: language code selecting the wiki directory; defaults to
       flags.arg.language when None.
   """
   # Identity comparison is the correct test for the None singleton.
   if language is None:
     language = flags.arg.language
   return self.wf.resource("word-vocabulary.map",
                           dir=corpora.wikidir(language),
                           format="textmap/word")
Exemple #13
0
 def wikidata_redirects(self):
     """Return the resource for Wikidata redirects.

     This is a set of record files where each Wikidata redirect is
     represented as a frame:

       <qid>: {
         =<qid>
         +<redirect>
       }
     """
     wiki_dir = corpora.wikidir()
     return self.wf.resource("wikidata-redirects.rec",
                             dir=wiki_dir,
                             format="records/frame")
Exemple #14
0
 def wikipedia_mapping(self, language=None):
     """Return the resource for the Wikipedia to Wikidata mapping.

     This is a SLING frame store with one frame per Wikipedia article with
     information for mapping it to Wikidata:

       {
         =<wid>
         /w/item/qid: <qid>
         /w/item/kind: /w/item/kind/...
       }

     Args:
       language: language code selecting the wiki directory; defaults to
         flags.arg.language when None.
     """
     # Identity comparison is the correct test for the None singleton.
     if language is None:
         language = flags.arg.language
     return self.wf.resource("mapping.sling",
                             dir=corpora.wikidir(language),
                             format="store/frame")
Exemple #15
0
 def wikipedia_redirects(self, language=None):
     """Return the resource for Wikipedia redirects.

     This is encoded as a SLING frame store where each redirect is a SLING
     frame:

       {
         =<wid for redirect page>
         :/wp/redirect
         /wp/redirect/pageid: ...
         /wp/redirect/title: "..."
         /wp/redirect/link: <wid for target page>
       }

     Args:
       language: language code selecting the wiki directory; defaults to
         flags.arg.language when None.
     """
     # Identity comparison is the correct test for the None singleton.
     if language is None:
         language = flags.arg.language
     return self.wf.resource("redirects.sling",
                             dir=corpora.wikidir(language),
                             format="store/frame")
Exemple #16
0
 def wikipedia_articles(self, language=None):
     """Return the resource for Wikipedia articles.

     This is a set of record files where each Wikipedia article is encoded as
     a SLING document:

       <wikipedia article title>: {
         :/wp/page
         /wp/page/pageid: ...
         /wp/page/title: "..."
         lang: /lang/<lang>
         /wp/page/text: "<Wikipedia page in Wiki markup format>"
       }

     Args:
       language: language code selecting the wiki directory; defaults to
         flags.arg.language when None.
     """
     # Identity comparison is the correct test for the None singleton.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the file name literal looks masked in this copy; it is
     # kept verbatim -- confirm the real file pattern.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/frame")
Exemple #17
0
 def wikidata_properties(self):
     """Return the resource for Wikidata properties.

     This is a record file where each Wikidata property is represented as a
     frame:

       <pid>: {
         =<pid>
         :/w/property
         name: "..."
         description: "..."
         /w/datatype: ...
         ... properties ...
       }
     """
     wiki_dir = corpora.wikidir()
     return self.wf.resource("properties.rec",
                             dir=wiki_dir,
                             format="records/frame")
Exemple #18
0
 def wikipedia_aliases(self, language=None):
     """Return the resource for Wikipedia aliases.

     The aliases are extracted from the Wikipedia pages from anchors,
     redirects, disambiguation pages etc. This is a set of record files with
     a SLING frame record for each item:

       <qid>: {
         alias: {+"<alias>"@/lang/xx
           sources: ...
           count: ...
         }
         ...
       }

     Args:
       language: language code selecting the wiki directory; defaults to
         flags.arg.language when None.
     """
     # Identity comparison is the correct test for the None singleton.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the file name literal looks masked in this copy; it is
     # kept verbatim -- confirm the real file pattern.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/alias")
Exemple #19
0
 def item_names(self, language=None):
     """Return the resource for item names in a language.

     This is a set of record files with one SLING frame per item:

       <qid>: {
         alias: {
           name: "<alias>"
           lang: /lang/<lang>
           sources: ...
           count: ...
           form: ...
         }
         ...
       }

     Args:
       language: language code selecting the wiki directory; defaults to
         flags.arg.language when None.
     """
     # Identity comparison is the correct test for the None singleton.
     if language is None:
         language = flags.arg.language
     # NOTE(review): the file name literal looks masked in this copy; it is
     # kept verbatim -- confirm the real file pattern.
     return self.wf.resource("*****@*****.**",
                             dir=corpora.wikidir(language),
                             format="records/alias")
Exemple #20
0
 def word_embeddings(self, language=None):
   """Return the resource for word embeddings in word2vec embedding format.

   Args:
     language: language code selecting the wiki directory; defaults to
       flags.arg.language when None.
   """
   # Identity comparison is the correct test for the None singleton.
   if language is None:
     language = flags.arg.language
   return self.wf.resource("word-embeddings.vec",
                           dir=corpora.wikidir(language),
                           format="embeddings")
Exemple #21
0
 def xrefs(self):
   """Return the resource for the store with cross-reference items.

   The store is the "xrefs.sling" frame store in the wiki directory.
   """
   wiki_dir = corpora.wikidir()
   return self.wf.resource("xrefs.sling", dir=wiki_dir, format="store/frame")
Exemple #22
0
 def wikidata_latest(self):
     """Return the resource for the latest Wikidata update.

     This contains the QID and revision of the latest update.
     """
     wiki_dir = corpora.wikidir()
     return self.wf.resource("latest",
                             dir=wiki_dir,
                             format="text")
Exemple #23
0
 def wikipedia_items(self):
     """Return the resource for item data from Wikipedias.

     The data is the "wikipedia-items.rec" record file in the wiki directory.
     """
     wiki_dir = corpora.wikidir()
     return self.wf.resource("wikipedia-items.rec",
                             dir=wiki_dir,
                             format="records/frame")
Exemple #24
0
 def fanin(self):
     """Return the resource for item fan-in.

     The fan-in of an item is the number of times it is a target in a
     relation.
     """
     wiki_dir = corpora.wikidir()
     return self.wf.resource("fanin.rec", dir=wiki_dir, format="records/frame")
Exemple #25
0
 def item_popularity(self):
     """Return the resource for item popularity.

     The data is the "item-popularity.rec" record file in the wiki directory.
     """
     wiki_dir = corpora.wikidir()
     return self.wf.resource("item-popularity.rec",
                             dir=wiki_dir,
                             format="records/frame")
Exemple #26
0
 def fanin(self):
     """Return the resource for link fan-in.

     The data is the "fanin.rec" record file in the wiki directory.
     """
     wiki_dir = corpora.wikidir()
     return self.wf.resource("fanin.rec", dir=wiki_dir, format="records/frame")
Exemple #27
0
 def wikilinks(self):
     """Return the resource for the link graph.

     The records are registered with the "records/frame" format in the wiki
     directory.
     """
     wiki_dir = corpora.wikidir()
     # NOTE(review): the file name literal looks masked in this copy; it is
     # kept verbatim -- confirm the real file pattern.
     return self.wf.resource("*****@*****.**",
                             dir=wiki_dir,
                             format="records/frame")
Exemple #28
0
 def wikipedia_members(self):
     """Return the resource for members of categories.

     The data is the "wikipedia-members.rec" record file in the wiki
     directory.
     """
     wiki_dir = corpora.wikidir()
     return self.wf.resource("wikipedia-members.rec",
                             dir=wiki_dir,
                             format="records/frame")
Exemple #29
0
 def wikipedia_summaries(self, language=None):
   """Return the resource for Wikipedia document summaries.

   Args:
     language: language code selecting the wiki directory; defaults to
       flags.arg.language when None.
   """
   # Every other per-language resource in this file falls back to the
   # configured language; without this fallback a None language was passed
   # straight to corpora.wikidir().
   # NOTE(review): confirm no caller relies on the previous None behavior.
   if language is None:
     language = flags.arg.language
   return self.wf.resource("summaries.rec",
                           dir=corpora.wikidir(language),
                           format="records/document")