Example 1
 def init(self, parses_filename, output_dir):
     self.output_dir = output_dir
     reader = sling.RecordReader(parses_filename)
     self.category_name_to_qid = {}  # category name -> qid
     self.category_frame = {}  # category qid -> frame
     self.category_parses = {}  # category qid -> parses
     self.signature_to_parse = defaultdict(list)  # signature -> parse
     self.store = sling.Store()
     self.num_parses = 0
     for index, (qid, value) in enumerate(reader):
         if (index + 1) % 20000 == 0:
             log.info("%d categories read" % index)
         qid = qid.decode('utf-8')
         frame = self.store.parse(value)
         self.category_name_to_qid[frame.name] = qid
         self.category_frame[qid] = frame
         self.category_parses[qid] = []
         for parse in frame("parse"):
             element = Parse(self.num_parses, qid, frame, parse)
             signature = util.full_parse_signature(parse)
             self.signature_to_parse[signature].append(element)
             self.category_parses[qid].append(element)
             self.num_parses += 1
     self.store.lockgc()
     self.store.freeze()
     self.store.unlockgc()
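
A minimal usage sketch for the reader above, assuming the method lives on a module-level globals object as in Example 27; the category QID below is hypothetical.

# Usage sketch (hypothetical QID): after init(), all parses that share a
# full signature can be enumerated through signature_to_parse.
browser_globals.init(
    "local/data/e/wikicat/parses-with-match-statistics.rec", "/tmp/out")
frame = browser_globals.category_frame.get("Q55415")
if frame is not None:
    for parse in frame("parse"):
        signature = util.full_parse_signature(parse)
        peers = browser_globals.signature_to_parse[signature]
        log.info("%d parses share this signature" % len(peers))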
Example 2
def build_knowledge_base():
    # Merge categories from wikipedias.
    if flags.arg.merge_categories:
        log.info("Merge wikipedia categories")
        wf = wiki.WikiWorkflow("category-merging")
        wf.merge_wikipedia_categories()
        workflow.run(wf.wf)

    # Invert categories.
    if flags.arg.invert_categories:
        log.info("Invert categories")
        wf = wiki.WikiWorkflow("category-inversion")
        wf.invert_wikipedia_categories()
        workflow.run(wf.wf)

    # Extract link graph.
    if flags.arg.extract_wikilinks:
        log.info("Extract link graph")
        wf = wiki.WikiWorkflow("link-graph")
        wf.extract_links()
        workflow.run(wf.wf)

    # Fuse items.
    if flags.arg.fuse_items:
        log.info("Fuse items")
        wf = wiki.WikiWorkflow("fuse-items")
        wf.fuse_items()
        workflow.run(wf.wf)

    # Build knowledge base repository.
    if flags.arg.build_kb:
        log.info("Build knowledge base repository")
        wf = wiki.WikiWorkflow("knowledge-base")
        wf.build_knowledge_base()
        workflow.run(wf.wf)
Example 3
def load_kb(task):
  kb = sling.Store()
  kb.load(task.input("kb").name)
  log.info("Knowledge base read")
  kb.freeze()
  log.info("Knowledge base frozen")
  return kb
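
A short usage sketch, assuming lookups of existing identifiers remain valid on the frozen store (as the other examples here suggest); P31, Q5, and Q42 are standard Wikidata ids.

# Usage sketch: after freezing, the store is read-only and can be shared.
kb = load_kb(task)
p_instance_of = kb["P31"]   # "instance of"
n_human = kb["Q5"]          # "human"
item = kb["Q42"]            # "Douglas Adams"
for value in item(p_instance_of):
    if kb.resolve(value) == n_human:
        log.info(item.name + " is a human")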
Example 4
def parse_wikipedia():
    # Convert wikipedia pages to SLING documents.
    if flags.arg.parse_wikipedia:
        for language in flags.arg.languages:
            log.info("Parse " + language + " wikipedia")
            wf = wiki.WikiWorkflow(language + "-wikipedia-parsing")
            wf.parse_wikipedia(language=language)
            workflow.run(wf.wf)
Example 5
 def run(self):
   log.info("job queue", self.name, "ready to execute jobs")
   while True:
     job = self.pending.get()
     try:
       self.execute(job)
     except Exception as e:
       log.info("Error executing job", job.id, ":", e)
       traceback.print_exc()
     finally:
       self.pending.task_done()
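
run() assumes a pending queue and a worker thread set up elsewhere; a minimal sketch of that scaffolding with the standard queue and threading modules (the JobQueue name and submit() helper are hypothetical; execute() is the method called above).

import queue
import threading

class JobQueue:
  def __init__(self, name):
    self.name = name
    self.pending = queue.Queue()
    # Worker thread runs the loop in run() above.
    self.thread = threading.Thread(target=self.run, daemon=True)
    self.thread.start()

  def submit(self, job):
    # Hand a job to the worker; run() picks it up via self.pending.get().
    self.pending.put(job)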
Example 6
def save_workflow_log(path):
    global active
    if not active: return False
    if path is None or len(path) == 0: return False
    if not os.path.exists(path): return False
    logfn = path + "/" + time.strftime("%Y%m%d-%H%M%S") + ".json"
    logfile = open(logfn, "w")
    logfile.write(statistics())
    logfile.close()
    log.info("workflow stats saved in " + logfn)
    return True
Example 7
def fuse_items():
    # Merge categories from wikipedias.
    if flags.arg.merge_categories:
        log.info("Merge wikipedia categories")
        wf = wiki.WikiWorkflow("category-merging")
        wf.merge_wikipedia_categories()
        workflow.run(wf.wf)

    # Invert categories.
    if flags.arg.invert_categories:
        log.info("Invert categories")
        wf = wiki.WikiWorkflow("category-inversion")
        wf.invert_wikipedia_categories()
        workflow.run(wf.wf)

    # Compute item popularity.
    if flags.arg.compute_item_popularity:
        log.info("Compute item popularity")
        wf = wiki.WikiWorkflow("item-popularity")
        wf.compute_item_popularity()
        workflow.run(wf.wf)

    # Fuse items.
    if flags.arg.fuse_items:
        log.info("Fuse items")
        wf = wiki.WikiWorkflow("fuse-items")
        wf.fuse_items()
        workflow.run(wf.wf)
Example 8
def build_knowledge_base():
    # Build knowledge base repository.
    if flags.arg.build_kb:
        log.info("Build knowledge base repository")
        wf = wiki.WikiWorkflow("knowledge-base")
        wf.build_knowledge_base()
        workflow.run(wf.wf)

    # Extract item names from wikidata and wikipedia.
    if flags.arg.extract_names:
        for language in flags.arg.languages:
            log.info("Extract " + language + " names")
            wf = wiki.WikiWorkflow(language + "-name-extraction")
            wf.extract_names(language=language)
            workflow.run(wf.wf)

    # Build name table.
    if flags.arg.build_nametab:
        for language in flags.arg.languages:
            log.info("Build " + language + " name table")
            wf = wiki.WikiWorkflow(language + "-name-table")
            wf.build_name_table(language=language)
            workflow.run(wf.wf)

    # Build phrase table.
    if flags.arg.build_phrasetab:
        for language in flags.arg.languages:
            log.info("Build " + language + " phrase table")
            wf = wiki.WikiWorkflow(language + "-phrase-table")
            wf.build_phrase_table(language=language)
            workflow.run(wf.wf)
Example 9
def extract_named_entities():
  # Extract Wikipedia link graph.
  if flags.arg.extract_wikilinks:
    log.info("Extract Wikipedia link graph")
    wf = entity.EntityWorkflow("wiki-links")
    wf.extract_wikilinks()
    workflow.run(wf.wf)

  # Extract IDF table.
  if flags.arg.build_idf:
    wf = entity.EntityWorkflow("idf-table")
    for language in flags.arg.languages:
      log.info("Build " + language + " IDF table")
      wf.build_idf(language=language)
    workflow.run(wf.wf)

  # Fuse NER items.
  if flags.arg.fuse_ner_items:
    log.info("Fuse NER items")
    wf = entity.EntityWorkflow("fuse-ner-items")
    wf.fuse_items()
    workflow.run(wf.wf)

  # Build NER knowledge base.
  if flags.arg.build_ner_kb:
    log.info("Build NER knowledge base")
    wf = entity.EntityWorkflow("ner-knowledge-base")
    wf.build_knowledge_base()
    workflow.run(wf.wf)
Example 10
File: run.py Project: savkov/sling
def run_workflow(wf):
    # In dryrun mode the workflow is just dumped without running it.
    if flags.arg.dryrun:
        print(wf.wf.dump())
        return

    # Start workflow.
    log.info("start workflow")
    wf.wf.start()

    # Wait until workflow completes. Poll every second to make the workflow
    # interruptible.
    done = False
    while not done:
        done = wf.wf.wait(1000)
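
Since wait() is given a one-second timeout, the loop can be interrupted from the keyboard; a hedged sketch of a caller that logs the interruption before propagating it.

# Sketch: the one-second wait() timeout above is what makes the workflow
# interruptible; a caller can translate Ctrl-C into a clean log message.
try:
    run_workflow(wf)
except KeyboardInterrupt:
    log.info("workflow interrupted")
    raise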
Example 11
def import_wiki():
    if flags.arg.import_wikidata or flags.arg.import_wikipedia:
        wf = wiki.WikiWorkflow("wiki-import")
        # Import wikidata.
        if flags.arg.import_wikidata:
            log.info("Import wikidata")
            wf.wikidata()

        # Import wikipedia(s).
        if flags.arg.import_wikipedia:
            for language in flags.arg.languages:
                log.info("Import " + language + " wikipedia")
                wf.wikipedia(language=language)

        workflow.run(wf.wf)
Example 12
def silver_annotation():
    # Extract IDF table.
    if flags.arg.build_idf:
        wf = silver.SilverWorkflow("idf-table")
        for language in flags.arg.languages:
            log.info("Build " + language + " IDF table")
            wf.build_idf(language=language)
        workflow.run(wf.wf)

    # Run silver-labeling of Wikipedia documents.
    if flags.arg.silver_annotation:
        for language in flags.arg.languages:
            log.info("Silver-label " + language + " wikipedia")
            wf = silver.SilverWorkflow(language + "-silver")
            wf.silver_annotation(language=language)
            workflow.run(wf.wf)
Example 13
def refresh_task_list():
  global last_task_timestamp, tasks
  ts = os.stat(flags.arg.tasklist).st_mtime
  if ts == last_task_timestamp: return

  try:
    tasklist = {}
    store = sling.Store()
    for t in store.load(flags.arg.tasklist):
      tasklist[t.name] = Task(t)
    tasks = tasklist
  except:
    log.info("Error loading task list")
    traceback.print_exc(file=sys.stdout)
    return
  last_task_timestamp = ts
  log.info("Loaded", len(tasks), "tasks")
Example 14
 def read(self, parses_filename):
   reader = sling.RecordReader(parses_filename)
   self.category_name_to_qid = {}                      # category name -> qid
   self.category_frame = {}                            # category qid -> frame
   self.full_signature_to_parse = defaultdict(list)    # signature -> parse
   self.coarse_signature_to_parse = defaultdict(list)  # signature -> parse
   store = sling.Store()
   for index, (qid, value) in enumerate(reader):
     if index > 0 and index % 20000 == 0:
       log.info("%d categories read" % index)
     frame = store.parse(value)
     self.category_name_to_qid[frame.name] = qid
     self.category_frame[qid] = frame
     for parse in frame("parse"):
       element = (qid, frame, parse)
       full_signature = util.full_parse_signature(parse)
       self.full_signature_to_parse[full_signature].append(element)
       coarse_signature = util.coarse_parse_signature(parse)
       self.coarse_signature_to_parse[coarse_signature].append(element)
Example 15
  def __init__(self, kb, extractor):
    self.kb = kb
    self.extractor = extractor
    self.unique_properties = set()
    self.date_properties = set()

    # Collect unique-valued and date-valued properties.
    # The former will be used to compute CONFLICT counts, and the latter need to
    # be processed in a special manner while matching existing facts.
    constraint_role = kb["P2302"]
    unique = kb["Q19474404"]         # single-value constraint
    w_time = kb["/w/time"]
    for prop in kb["/w/entity"]("role"):
      if prop.target == w_time:
        self.date_properties.add(prop)
      for constraint_type in prop(constraint_role):
        if constraint_type == unique or constraint_type["is"] == unique:
          self.unique_properties.add(prop)
    log.info("%d unique-valued properties" % len(self.unique_properties))
    log.info("%d date-valued properties" % len(self.date_properties))
Example 16
File: run.py Project: yespon/sling
def train_embeddings():
    # Extract vocabulary for word embeddings.
    if flags.arg.extract_vocabulary:
        for language in flags.arg.languages:
            log.info("Extract " + language + " vocabulary")
            wf = embedding.EmbeddingWorkflow(language + "-vocabulary")
            wf.extract_vocabulary(language=language)
            workflow.run(wf.wf)

    # Train word embeddings.
    if flags.arg.train_word_embeddings:
        for language in flags.arg.languages:
            log.info("Train " + language + " word embeddings")
            wf = embedding.EmbeddingWorkflow(language + "-word-embeddings")
            wf.train_word_embeddings(language=language)
            workflow.run(wf.wf)

    # Extract vocabulary for fact and category embeddings.
    if flags.arg.extract_fact_lexicon:
        log.info("Extract fact and category lexicons")
        wf = embedding.EmbeddingWorkflow("fact-lexicon")
        wf.extract_fact_lexicon()
        workflow.run(wf.wf)

    # Extract facts from knowledge base.
    if flags.arg.extract_facts:
        log.info("Extract facts from knowledge base")
        wf = embedding.EmbeddingWorkflow("fact-extraction")
        wf.extract_facts()
        workflow.run(wf.wf)

    # Train fact and category embeddings.
    if flags.arg.train_fact_embeddings:
        log.info("Train fact and category embeddings")
        wf = embedding.EmbeddingWorkflow("fact-embeddings")
        wf.train_fact_embeddings()
        workflow.run(wf.wf)
Example 17
 def run(self, task):
     filename = task.input("input").name
     store = sling.Store()
     log.info("Load store from", filename)
     store.load(filename)
     log.info("Coalesce store")
     store.coalesce()
     log.info("Snapshot store")
     store.snapshot(filename)
Example 18
    def run(self, task):
        # Get task parameters.
        name = task.param("shortname")
        baseurl = task.param("url")
        ratelimit = task.param("ratelimit", 0)
        chunksize = task.param("chunksize", 64 * 1024)
        priority = task.param("priority", 0)
        outputs = task.outputs("output")

        log.info("Download " + name + " from " + baseurl)
        for output in outputs:
            # Make sure directory exists.
            directory = os.path.dirname(output.name)
            if not os.path.exists(directory): os.makedirs(directory)

            # Do not overwrite existing file unless flag is set.
            if not flags.arg.overwrite and os.path.exists(output.name):
                raise Exception("file already exists: " + output.name + \
                                " (use --overwrite to overwrite existing files)")

            # Hold off on low-priority tasks.
            if priority > 0: time.sleep(priority)

            # Wait until we are below the rate limit.
            global download_concurrency
            if ratelimit > 0:
                while download_concurrency >= ratelimit:
                    time.sleep(10)
                download_concurrency += 1

            # Compute url.
            if len(outputs) > 1:
                url = baseurl + "/" + os.path.basename(output.name)
            else:
                url = baseurl

            # Download from url to file.
            if ratelimit > 0: log.info("Start download of " + output.name)
            conn = urlopen(url)
            last_modified = time.mktime(
                time.strptime(conn.headers['last-modified'],
                              "%a, %d %b %Y %H:%M:%S GMT"))
            total_bytes = "bytes_downloaded"
            bytes = name + "_bytes_downloaded"
            with open(output.name, 'wb') as f:
                while True:
                    chunk = conn.read(chunksize)
                    if not chunk: break
                    f.write(chunk)
                    task.increment(total_bytes, len(chunk))
                    task.increment(bytes, len(chunk))
            os.utime(output.name, (last_modified, last_modified))
            if ratelimit > 0: download_concurrency -= 1

        log.info(name + " downloaded")
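
The rate-limiting loop relies on a module-level counter that the excerpt does not show; a minimal sketch of its definition (the real module may protect it with a lock, since += on a shared int is not thread-safe).

# Module-level counter assumed by the rate-limiting loop above.
download_concurrency = 0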
Example 19
def load_kb(task):
    if type(task) is str:
        filename = task  # assume filename
    else:
        filename = task.input("kb").name

    if filename in _kb_cache:
        log.info("Retrieving cached KB")
        return _kb_cache[filename]
    else:
        kb = sling.Store()
        kb.load(filename)
        log.info("Knowledge base read")
        kb.lockgc()
        kb.freeze()
        kb.unlockgc()
        log.info("Knowledge base frozen")
        _kb_cache[filename] = kb
        return kb
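
The cache consulted above is also defined at module level; a minimal sketch.

# Module-level cache assumed by load_kb() above: filename -> frozen store.
_kb_cache = {}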
Example 20
    def __init__(self, kb, extractor):
        self.kb = kb
        self.extractor = extractor
        self.unique_properties = set()
        self.date_properties = set()
        self.location_properties = set()

        # Collect unique-valued, date-valued, and location-valued properties.
        # The first are used to compute CONFLICT counts; the latter two need
        # special handling when matching existing facts.
        constraint_role = kb["P2302"]
        unique = kb["Q19474404"]  # single-value constraint
        w_time = kb["/w/time"]
        w_item = kb["/w/item"]
        p_subproperty_of = kb["P1647"]
        p_location = kb["P276"]
        for prop in kb["/w/entity"]("role"):
            if prop.target == w_time:
                self.date_properties.add(prop)
            if prop.target == w_item:
                for role, value in prop:
                    if role == p_subproperty_of:
                        if kb.resolve(value) == p_location:
                            self.location_properties.add(prop)
            for constraint_type in prop(constraint_role):
                if constraint_type == unique or constraint_type["is"] == unique:
                    self.unique_properties.add(prop)

        log.info("%d unique-valued properties" % len(self.unique_properties))
        log.info("%d date-valued properties" % len(self.date_properties))
        log.info("%d location-valued properties" %
                 len(self.location_properties))

        # Set closure properties.
        self.closure_properties = {}
        self.p_subclass = kb["P279"]
        self.p_parent_org = kb["P749"]
        p_located_in = kb["P131"]
        for p in self.location_properties:
            self.closure_properties[p] = p_located_in

        # 'Educated at' -> 'Part of'.
        self.closure_properties[kb["P69"]] = kb["P361"]
Example 21
def fuse_items():
    # Merge categories from wikipedias.
    if flags.arg.merge_categories:
        log.info("Merge wikipedia categories")
        wf = wiki.WikiWorkflow("category-merging")
        wf.merge_wikipedia_categories()
        run_workflow(wf)

    # Invert categories.
    if flags.arg.invert_categories:
        log.info("Invert categories")
        wf = wiki.WikiWorkflow("category-inversion")
        wf.invert_wikipedia_categories()
        run_workflow(wf)

    # Fuse items.
    if flags.arg.fuse_items:
        log.info("Fuse items")
        wf = wiki.WikiWorkflow("fuse-items")
        wf.fuse_items()
        run_workflow(wf)
Example 22
    def run(self, task):
        # Get task parameters.
        name = task.param("shortname")
        url = task.param("url")
        ratelimit = task.param("ratelimit", 0)
        chunksize = task.param("chunksize", 64 * 1024)
        output = task.output("output")
        log.info("Download " + name + " from " + url)

        # Make sure directory exists.
        directory = os.path.dirname(output.name)
        if not os.path.exists(directory): os.makedirs(directory)

        # Do not overwrite existing file.
        if os.path.exists(output.name):
            raise Exception("file already exists: " + output.name)

        # Wait until we are below the rate limit.
        global download_concurrency
        if ratelimit > 0:
            while download_concurrency >= ratelimit:
                time.sleep(10)
            download_concurrency += 1

        # Download from url to file.
        if ratelimit > 0: log.info("Start download of " + url)
        conn = urllib.request.urlopen(url)
        total_bytes = "bytes_downloaded"
        bytes = name + "_bytes_downloaded"
        with open(output.name, 'wb') as f:
            while True:
                chunk = conn.read(chunksize)
                if not chunk: break
                f.write(chunk)
                task.increment(total_bytes, len(chunk))
                task.increment(bytes, len(chunk))
        if ratelimit > 0: download_concurrency -= 1
        log.info(name + " downloaded")
Example 23
def build_alias_tables():
    # Extract item names from wikidata and wikipedia.
    if flags.arg.extract_names:
        for language in flags.arg.languages:
            log.info("Extract " + language + " names")
            wf = wiki.WikiWorkflow(language + "-name-extraction")
            wf.extract_names(language=language)
            workflow.run(wf.wf)

    # Build name table.
    if flags.arg.build_nametab:
        for language in flags.arg.languages:
            log.info("Build " + language + " name table")
            wf = wiki.WikiWorkflow(language + "-name-table")
            wf.build_name_table(language=language)
            workflow.run(wf.wf)

    # Build phrase table.
    if flags.arg.build_phrasetab:
        for language in flags.arg.languages:
            log.info("Build " + language + " phrase table")
            wf = wiki.WikiWorkflow(language + "-phrase-table")
            wf.build_phrase_table(language=language)
            workflow.run(wf.wf)
Example 24
def extract_wikimedia():
    for language in flags.arg.languages:
        log.info("Extract " + language + " Wikipedia images")
        wf = WikiMediaWorkflow(language + "-wikimedia")
        wf.extract_media(language=language)
        run(wf.wf)
Example 25
def twitter_profiles():
    log.info("Extract twitter profiles")
    wf = TwitterWorkflow("twitter-profiles")
    wf.extract_twitter()
    run(wf.wf)
Example 26
    def run(self, task):
        # Get parameters.
        twitterdb = task.input("twitterdb").name

        # Load knowledge base.
        log.info("Load knowledge base")
        kb = sling.Store()
        kb.load(task.input("kb").name)

        p_id = kb["id"]
        p_is = kb["is"]
        p_twitter = kb["P2002"]
        p_image = kb["P18"]
        p_media = kb["media"]
        p_stated_in = kb["P248"]
        n_twitter = kb["Q918"]

        kb.freeze()

        # Open output file.
        fout = open(task.output("output").name, "w")

        # Find all items with twitter usernames.
        dbsession = requests.session()
        for item in kb:
            # Find twitter username for item.
            task.increment("items")
            imageurls = []
            for twitter in item(p_twitter):
                username = kb.resolve(twitter)
                task.increment("twitter_users")

                # Fetch twitter profile from database.
                dburl = twitterdb + "/" + urllib.parse.quote(username)
                r = dbsession.get(dburl)
                if r.status_code == 404:
                    task.increment("unknown_users")
                    continue
                r.raise_for_status()
                profile = r.json()

                # Ignore if twitter profile does not exist.
                if "error" in profile:
                    task.increment("deleted_users")
                    continue

                # Ignore if there is no profile image.
                if profile["default_profile_image"]:
                    task.increment("missing_profile_images")
                    continue

                # Get profile image url.
                imageurl = profile["profile_image_url"]

                # Get url for original image url by removing "_normal".
                imageurl = ''.join(imageurl.rsplit("_normal", 1))

                # Ignore known bad images.
                if imageurl in bad_images:
                    task.increment("bad_profile_images")
                    continue

                # Add twitter profile image to item.
                imageurls.append(imageurl)

            if len(imageurls) > 0:
                # Create item frame with twitter profile.
                store = sling.Store(kb)
                slots = [(p_id, item.id)]
                for imageurl in imageurls:
                    image = store.frame([(p_is, imageurl),
                                         (p_stated_in, n_twitter)])
                    slots.append((p_media, image))
                frame = store.frame(slots)
                fout.write(frame.data(utf8=True))
                fout.write("\n")

                task.increment("profile_images")
                if p_image not in item: task.increment("imaged_items")

        fout.close()
Example 27
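# (Excerpt: the lines below begin mid-method, in the tail of a parse-rendering
# helper; the server entry point follows.)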
            self._text(" (%0.4f)" % other_parse.score)
            self._br()
        self._end("td")
      self._end("tr")
    self._end("table")


if __name__ == "__main__":
  flags.define("--port",
               help="port number for the HTTP server",
               default=8001,
               type=int,
               metavar="PORT")
  flags.define("--parses",
               help="Recordio of category parses",
               default="local/data/e/wikicat/parses-with-match-statistics.rec",
               type=str,
               metavar="FILE")
  flags.define("--output",
               help="Output dir where Wikibot recordios will be generated.",
               default="local/data/e/wikicat/",
               type=str,
               metavar="DIR")
  flags.parse()
  log.info('Reading parses from %s' % flags.arg.parses)
  browser_globals.init(flags.arg.parses, flags.arg.output)
  server_address = ('', flags.arg.port)
  httpd = HTTPServer(server_address, Browser)
  log.info('Starting HTTP Server on port %d' % flags.arg.port)
  httpd.serve_forever()
Example 28
    def run(self, task):
        # Get parameters.
        language = task.param("language")

        # Load knowledge base.
        log.info("Load knowledge base")
        kb = sling.Store()
        kb.load(task.input("kb").name)

        n_infobox = kb["/wp/infobox"]
        n_page_item = kb["/wp/page/item"]
        n_file = kb["/wp/info/file"]
        n_media = kb["/wp/media"]

        image_fields = [
            (kb["/wp/info/image"], kb["/wp/info/caption"]),
            (kb["/wp/info/cover"], kb["/wp/info/caption"]),
            (kb["/wp/info/logo"], kb["/wp/info/logo_caption"]),
            (kb["/wp/info/photo"], kb["/wp/info/photo_caption"]),
            (kb["/wp/info/flag_image"], kb["/wp/info/flag_caption"]),
        ]

        p_media = kb["media"]
        p_id = kb["id"]
        p_is = kb["is"]
        p_imported_from = kb["P143"]
        p_media_legend = kb["P2096"]

        image_properties = [
            kb["P18"],  # image
            kb["P154"],  # logo image
            kb["P41"],  # flag image
        ]

        lang = kb["/lang/" + language]
        wikipedia_item = lang["/lang/wikilang/wikipedia"]

        docschema = sling.DocumentSchema(kb)

        kb.freeze()

        # Fetch media titles for Wikipedia from yesterday.
        log.info("Fetch local media titles")
        yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
        mediaurl = "https://dumps.wikimedia.org/other/mediatitles/%s/" \
          "%swiki-%s-all-media-titles.gz" % (yesterday, language, yesterday)
        r = urllib.request.urlopen(mediaurl)
        mediatitles = set(gzip.decompress(r.read()).decode().split('\n'))
        task.increment("local_media_files", len(mediatitles))

        # Open output file.
        fout = open(task.output("output").name, "w")

        # Process input articles.
        for res in task.inputs("input"):
            log.info("Extract media files from", res.name)
            for _, data in sling.RecordReader(res.name):
                # Read article into store.
                store = sling.Store(kb)
                doc = store.parse(data)
                task.increment("documents")

                # Find first infobox.
                infobox = None
                for theme in doc(docschema.document_theme):
                    if theme.isa(n_infobox):
                        infobox = theme
                        break
                if infobox is None: continue
                task.increment("infoboxes")

                # Find images in infobox.
                imagelist = []
                for n_image, n_caption in image_fields:
                    image = infobox[n_image]
                    caption = infobox[n_caption]
                    if image is None: continue

                    # Get image for repeated image field.
                    if type(image) is sling.Frame:
                        group = image
                        image = group[n_file]
                        caption = group[n_caption]
                        if image is None: continue

                    if "{" in image or "[" in image:
                        # Structured annotations.
                        annotations = sling.lex(image,
                                                store=store,
                                                schema=docschema)
                        for theme in annotations.themes:
                            if theme.isa(n_media):
                                image = theme[p_is]
                                if image is not None:
                                    imagelist.append((image, None))
                                    task.increment("structured_annotations")
                    else:
                        # Image filename.
                        imagelist.append((image, caption))
                if len(imagelist) == 0: continue

                # Process list of images for item.
                known_images = 0
                image_frames = []
                item = doc[n_page_item]
                if item is None: continue
                for image, caption in imagelist:
                    # Disregard direct URLs for now.
                    if image.startswith("http://") or \
                       image.startswith("https://") or \
                       image.startswith("//"):
                        task.increment("url_images")
                        continue

                    # Trim image name. Remove File: prefix.
                    colon = image.find(':')
                    if colon > 0 and colon < 10: image = image[colon + 1:]
                    image = titlecase(image.strip()).replace('_', ' ')
                    if len(image) == 0 or image in default_images:
                        task.increment("empty_images")
                        continue
                    if image.endswith("&lrm;"): image = image[:-5]
                    frag = image.find('#')
                    if frag > 0: image = image[:frag]
                    image = html.unescape(image)
                    image = urllib.parse.unquote(image)

                    # Discard media files with unknown or ignored extensions.
                    dot = image.rfind('.')
                    ext = image[dot:].lower() if dot > 0 else None
                    if ext in ignored_extensions:
                        task.increment("ignored_image_format")
                        continue
                    if ext not in known_extensions:
                        log.info("unknown format:", item.id, image)
                        task.increment("unknown_image_format")
                        continue

                    # Get item from KB and check if image is already known.
                    task.increment("images")
                    known = False
                    for prop in image_properties:
                        for img in item(prop):
                            img = kb.resolve(img)
                            if img == image: known = True
                            known_images += 1
                    if known:
                        task.increment("known_images")
                        continue
                    task.increment("new_images")

                    # Check if image is in local Wikipedia or Wikimedia Commons.
                    fn = image.replace(' ', '_')
                    if fn in mediatitles:
                        urlbase = "https://upload.wikimedia.org/wikipedia/" + language
                        task.increment("local_images")
                    else:
                        urlbase = "https://upload.wikimedia.org/wikipedia/commons"
                        task.increment("commons_images")
                        if known_images == 0:
                            task.increment("commons_imaged_items")

                    # Compute URL for image.
                    md5 = md5hash(fn)
                    fn = fn.replace("?", "%3F")
                    fn = fn.replace("+", "%2B")
                    fn = fn.replace("&", "%26")
                    url = "%s/%s/%s/%s" % (urlbase, md5[0], md5[0:2], fn)

                    # Create frame for item with media image.
                    slots = [
                        (p_is, url),
                        (p_imported_from, wikipedia_item),
                    ]
                    if caption is not None:
                        capdoc = sling.lex(caption,
                                           store=store,
                                           schema=docschema)
                        captxt = capdoc.phrase(0, len(capdoc.tokens))
                        slots.append((p_media_legend, captxt))
                    image_frames.append(store.frame(slots))

                # Create item frame with extra image info.
                if len(image_frames) == 0: continue
                slots = [(p_id, item.id)]
                for image_frame in image_frames:
                    slots.append((p_media, image_frame))
                frame = store.frame(slots)
                fout.write(frame.data(utf8=True))
                fout.write("\n")
                if known_images == 0: task.increment("imaged_items")

        fout.close()
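
The md5hash helper used to build the URL is not shown; a sketch consistent with Wikimedia's upload path scheme, which shards a file under the first one and two hex digits of the MD5 of its underscore-form name.

import hashlib

# Sketch of the md5hash helper assumed above.
def md5hash(s):
    return hashlib.md5(s.encode("utf-8")).hexdigest()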
Example 29
if __name__ == '__main__':
    # Parse command-line arguments.
    flags.parse()

    if flags.arg.build_wiki:
        flags.arg.import_wikidata = True
        flags.arg.import_wikipedia = True
        flags.arg.parse_wikipedia = True
        flags.arg.merge_categories = True
        flags.arg.invert_categories = True
        flags.arg.compute_item_popularity = True
        flags.arg.fuse_items = True
        flags.arg.build_kb = True
        flags.arg.extract_names = True
        flags.arg.build_nametab = True
        flags.arg.build_phrasetab = True

    # Run workflows.
    workflow.startup()
    download_corpora()
    import_wiki()
    parse_wikipedia()
    fuse_items()
    build_knowledge_base()
    train_embeddings()
    extract_named_entities()
    workflow.shutdown()

    # Done.
    log.info("Done")
Example 30
def main():
    # Parse command-line arguments. Load modules for commands before parsing
    # flags to allow each of these to register more flags.
    for arg in sys.argv:
        if arg.startswith("-"): continue
        for cmd in commands:
            if arg == cmd.name:
                if cmd.package is not None:
                    importlib.import_module(cmd.package)
                if cmd.load is not None:
                    for pkg in cmd.load:
                        importlib.import_module(pkg)
                break
    flags.parse()

    # Output version information.
    if flags.arg.version:
        sling.which()
        sys.exit(0)

    # List commands.
    if flags.arg.list:
        print("commands:")
        for cmd in commands:
            if not cmd.internal:
                print("  %-30s %s" % (cmd.name, cmd.help))
        sys.exit(0)

    # Run command in background if requested.
    if flags.arg.spawn:
        # Build command.
        cmd = []
        for arg in sys.argv:
            if arg != "--spawn": cmd.append(arg)
        cmd.append("--flushlog")

        # Output to log file.
        logfn = flags.arg.logdir + "/" + time.strftime(
            "%Y%m%d-%H%M%S") + ".log"
        logfile = open(logfn, "w")

        # Start background job.
        process = subprocess.Popen(cmd,
                                   stdin=None,
                                   stdout=logfile,
                                   stderr=subprocess.STDOUT,
                                   bufsize=1,
                                   shell=False,
                                   close_fds=True)
        print("Running process", process.pid, "in background logging to",
              logfn)
        sys.exit(0)

    # Start up workflow system.
    workflow.startup()

    # Run commands.
    for cmd in commands:
        if cmd.name not in flags.arg.COMMAND: continue

        if cmd.package:
            # Load module with command.
            module = importlib.import_module(cmd.package)

            # Run command.
            if cmd.function is not None:
                log.info("Execute command " + cmd.name)
                getattr(module, cmd.function)()

        # Add triggered commands.
        if cmd.triggers is not None:
            for trigger in cmd.triggers:
                flags.arg.COMMAND.append(trigger)

    # Done.
    workflow.shutdown()
    log.info("Done")