Example 1
  def __init__(self):
    self.site = pywikibot.Site("wikidata", "wikidata")
    self.repo = self.site.data_repository()

    time_str = datetime.datetime.now().isoformat("-")[:19].replace(":","-")
    if flags.arg.test:
      record_file_name = "local/data/e/wikibot/test-birth-dates.rec"
      time_str = "test-" + time_str
    else:
      record_file_name = "local/data/e/wikibot/birth-dates.rec"
    status_file_name = "local/logs/wikibotlog-" + time_str + ".rec"
    self.record_file = sling.RecordReader(record_file_name)
    self.status_file = sling.RecordWriter(status_file_name)

    self.store = sling.Store()
    self.n_item = self.store["item"]
    self.n_facts = self.store["facts"]
    self.n_provenance = self.store["provenance"]
    self.n_category = self.store["category"]
    self.n_method = self.store["method"]
    self.n_status = self.store["status"]
    self.n_revision = self.store["revision"]
    self.n_url = self.store["url"]
    self.n_skipped = self.store["skipped"]
    self.store.freeze()
    self.rs = sling.Store(self.store)

    self.source_claim = pywikibot.Claim(self.repo, "P3452") # inferred from
    self.time_claim = pywikibot.Claim(self.repo, "P813") # referenced (on)
    today = datetime.date.today()
    time_target = pywikibot.WbTime(year=today.year,
                                   month=today.month,
                                   day=today.day)
    self.time_claim.setTarget(time_target)
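
The constructor above only prepares the "inferred from" (P3452) and "referenced on" (P813) reference claims. A minimal, hypothetical sketch of how they might later be attached to a new statement using pywikibot's standard addClaim/addSources calls; the helper name, item QIDs, and year are made up for illustration:

  def add_birth_date(self, item_qid, year, inferred_from_qid):
    """Hypothetical helper: write a date-of-birth claim and reference it."""
    item = pywikibot.ItemPage(self.repo, item_qid)
    claim = pywikibot.Claim(self.repo, "P569")  # date of birth
    claim.setTarget(pywikibot.WbTime(year=year, precision=9))  # 9 = year
    item.addClaim(claim)
    # Point the prepared "inferred from" reference at its source item,
    # then attach both prepared references to the new claim.
    self.source_claim.setTarget(pywikibot.ItemPage(self.repo, inferred_from_qid))
    claim.addSources([self.source_claim, self.time_claim])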
Example 2
def train(args):
    check_present(
        args,
        ["train_corpus", "output_folder", "dev_corpus", "train_shuffle_seed"])

    train_corpus_path = args.train_corpus
    if args.train_shuffle_seed > 0:
        reader = sling.RecordReader(args.train_corpus)
        items = [(key, value) for key, value in reader]
        reader.close()
        r = random.Random(args.train_shuffle_seed)
        r.shuffle(items)
        train_corpus_path = os.path.join(args.output_folder,
                                         "train_shuffled.rec")
        writer = sling.RecordWriter(train_corpus_path)
        for key, value in items:
            writer.write(key, value)
        writer.close()
        print("Wrote shuffled train corpus to %s using seed %d" % \
              (train_corpus_path, args.train_shuffle_seed))

    # Setting an explicit seed for the sake of determinism.
    torch.manual_seed(1)

    # Make commons store if needed.
    if args.commons == '' or not os.path.exists(args.commons):
        if args.commons == '':
            fname = os.path.join(args.output_folder, "commons")
            print("Will create a commons store at", fname)
            args.commons = fname
        else:
            print("No commons found at", args.commons, ", creating it...")
        _, symbols = commons_builder.build(
            [train_corpus_path, args.dev_corpus], args.commons)
        print("Commons created at", args.commons, "with", len(symbols), \
            "symbols besides the usual ones.")

    # Make the training spec.
    spec = Spec()
    spec.build(args.commons, train_corpus_path)

    # Initialize the model with the spec and any word embeddings.
    caspar = Caspar(spec)
    embeddings_file = args.word_embeddings
    if embeddings_file == '': embeddings_file = None
    caspar.initialize(embeddings_file)

    tmp_folder = os.path.join(args.output_folder, "tmp")
    if not os.path.exists(tmp_folder):
        os.makedirs(tmp_folder)

    evaluator = partial(dev_accuracy, args.dev_corpus, tmp_folder)

    output_file_prefix = os.path.join(args.output_folder, "caspar")
    hyperparams = Hyperparams(args)
    print("Using hyperparameters:", hyperparams)

    trainer = Trainer(caspar, hyperparams, evaluator, output_file_prefix)
    train = Corpora(train_corpus_path, spec.commons, gold=True)
    trainer.train(train)
Example 3
 def init(self, parses_filename, output_dir):
     self.output_dir = output_dir
     reader = sling.RecordReader(parses_filename)
     self.category_name_to_qid = {}  # category name -> qid
     self.category_frame = {}  # category qid -> frame
     self.category_parses = {}  # category qid -> parses
     self.signature_to_parse = defaultdict(list)  # signature -> parse
     self.store = sling.Store()
     self.num_parses = 0
     for index, (qid, value) in enumerate(reader):
         if (index + 1) % 20000 == 0:
             log.info("%d categories read" % (index + 1))
         qid = qid.decode('utf-8')
         frame = self.store.parse(value)
         self.category_name_to_qid[frame.name] = qid
         self.category_frame[qid] = frame
         self.category_parses[qid] = []
         for parse in frame("parse"):
             element = Parse(self.num_parses, qid, frame, parse)
             signature = util.full_parse_signature(parse)
             self.signature_to_parse[signature].append(element)
             self.category_parses[qid].append(element)
             self.num_parses += 1
     reader.close()
     self.store.lockgc()
     self.store.freeze()
     self.store.unlockgc()
Example 4
 def fetch_aliases(self, alias_file_patterns):
     print("Pre-fetching all raw aliases...")
     all_aliases = {}
     for ii in range(10):
         fname = alias_file_patterns % ii
         print("reading from %s..." % fname)
         db = sling.RecordReader(fname)
         for aid, als in db:
             all_aliases[aid.decode("utf-8", errors="ignore")] = als
         db.close()
     return all_aliases
Example 5
 def process_log_data(self, files):
   no_of_files = len(files)
   file_no = 0
   rs = sling.Store(self.store)
   skipped = 0
   updated = 0
   errors = 0
   deleted = 0
   changed = 0
   for r_file in files:
     file_no += 1
     print "Processing file {:4d} of {} ({})".format(file_no,
                                                     no_of_files,
                                                     r_file)
     reader = sling.RecordReader(r_file)
     for item_str, record in reader:
       rec = rs.parse(record)
       status = rec[self.n_status]
       if self.n_skipped in status:
         skipped += 1
         continue
       elif self.n_revision not in status:
         print "ERROR - unknown status"
         errors += 1
         continue
       updated += 1
       wd_item = pywikibot.ItemPage(self.repo, item_str)
       wd_claims = wd_item.get().get('claims')
       facts = rec[self.n_facts]
       for prop, val in facts:
          p_claims = wd_claims.get(str(prop), [])
         if not p_claims:
           deleted += 1
           continue
         for wd_claim in p_claims:
           if wd_claim.type == "time":
             date = sling.Date(val) # parse date from record
             precision = precision_map[date.precision] # sling to wikidata
             target = pywikibot.WbTime(year=date.year, precision=precision)
           elif wd_claim.type == 'wikibase-item':
             target = pywikibot.ItemPage(self.repo, val)
           else:
             # TODO add location and possibly other types
             print "Error: Unknown claim type", claim.type
             continue
           if not wd_claim.target_equals(target):
             changed += 1
     reader.close()
    print(skipped, "skipped,", updated, "updated,", deleted, "deleted,",
          changed, "changed,", errors, "error records in file")
    print("Done processing last file")
Example 6
    def run(self, task):
        self.init(task)

        max_parses = int(task.param("max_parses"))
        reader = sling.RecordReader(task.input("input").name)
        writer = sling.RecordWriter(task.output("output").name)
        for index, (key, value) in enumerate(reader):
            store = sling.Store(self.kb)
            category = store.parse(value)
            document = sling.Document(category.document)

            # Score each parse.
            parse_with_score = self.score(category)

            # Keep only the top-k parses.
            ranked_parses = sorted(parse_with_score, key=lambda x: -x[1])
            if len(ranked_parses) > max_parses:
                dropped = len(ranked_parses) - max_parses
                ranked_parses = ranked_parses[0:max_parses]
                task.increment("parses-dropped", dropped)
                task.increment("categories-with-too-many-parses")

            # Compute signature for each parse and store it in the parse.
            for parse, _ in ranked_parses:
                tokens, span_signature = self.signature(document, parse)
                parse["signature"] = tokens
                for span in parse.spans:
                    if span in span_signature:
                        span["signature"] = span_signature[span]

                # Also compute the coarse signature.
                tokens, span_signature = self.signature(document,
                                                        parse,
                                                        coarse=True)
                parse["coarse_signature"] = tokens
                for span in parse.spans:
                    if span in span_signature:
                        span["coarse_signature"] = span_signature[span]

            # Replace the current set of parses with the ranked list.
            del category["parse"]
            for parse, _ in ranked_parses:
                category.append("parse", parse)
            task.increment("parses-kept", len(ranked_parses))
            writer.write(key, category.data(binary=True))
        reader.close()
        writer.close()
Example 7
def extract_entity_mentions(nq_data, labelled_record):
    """Parse ourput corpus and create map from tokens to entity ids.

  Args:
    nq_data: A python dictionary containint NQ data of 1 train/dev shard
    labelled_record: Sling output document with labelled paragraphs

  Returns:
    nq_data: Original object augmented with entity maps
  """
    recin = sling.RecordReader(labelled_record)
    commons = sling.Store()
    docschema = sling.DocumentSchema(commons)
    commons.freeze()
    cnt = 1

    for key, value in recin:
        store = sling.Store(commons)
        doc = sling.Document(store.parse(value), store, docschema)
        index, ans_type, idx, ans_id = key.decode("utf-8").split("|")
        cnt += 1
        entity_map = {}

        # Parse entity mentions labelled by sling
        for m in doc.mentions:
            e = [i["is"] for i in m.evokes()]
            if not e:
                continue
            if is_sling_entity(e):
                e_val = e[0]["id"]
                if m.begin in entity_map:
                    entity_map[m.begin].append((m.end, e_val))
                else:
                    entity_map[m.begin] = [(m.end, e_val)]

        if ans_type == "annotated_long_answer":
            nq_data[index]["annotations"][int(
                idx)]["long_answer"]["entity_map"] = entity_map
        elif ans_type == "question":
            nq_data[index]["question_entity_map"] = entity_map
        elif ans_type == "annotated_short_answer":
            nq_data[index]["annotations"][int(idx)]["short_answers"][int(
                ans_id)]["entity_map"] = entity_map
        else:
            nq_data[index]["long_answer_candidates"][int(
                idx)]["entity_map"] = entity_map
    return nq_data
Example 8
def create_train_dev_split(input_file: str, train_file: str, dev_file: str, ratio: float):
    reader = sling.RecordReader(input_file)
    frames: List[Tuple[str, str]] = []
    for key, value in reader:
        frames.append((key, value))
    reader.close()

    shuffle(frames)

    total_count = len(frames)
    train_count = int(round((1 - ratio) * total_count))

    files = {'train': train_file, 'dev': dev_file}
    result_frames = {'train': frames[:train_count], 'dev': frames[train_count:]}
    for split in result_frames:
        writer = sling.RecordWriter(files[split])
        for (key, value) in result_frames[split]:
            writer.write(key, value)
        writer.close()
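
A usage sketch for the splitter above; the paths are hypothetical, ratio is the fraction of records held out for the dev split, and shuffle comes from random.shuffle:

    # Hold out 10% of the records for the dev split (paths are made up).
    create_train_dev_split("data/corpus.rec", "data/train.rec", "data/dev.rec", ratio=0.1)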
Example 9
def build(recordio_filenames, output_filename, text=False):
    commons = sling.Store()
    schema = sling.DocumentSchema(commons)
    commons.freeze()

    symbol_names = {}
    symbol_names["thing"] = 1

    # Adds handle's id to 'symbol_names' if it is not already in 'commons'.
    def add(handle):
        if type(handle) is not sling.Frame or handle.id is None: return

        id_str = str(handle.id)
        if commons[id_str] is not None: return

        if id_str not in symbol_names: symbol_names[id_str] = 0
        symbol_names[id_str] += 1

    for filename in recordio_filenames:
        reader = sling.RecordReader(filename)
        for key, value in reader:
            store = sling.Store(commons)
            document = sling.Document(store.parse(value), schema=schema)

            for mention in document.mentions:
                for frame in mention.evokes():
                    for slot_role, slot_value in frame:
                        add(slot_role)
                        add(slot_value)

            for theme in document.themes:
                for slot_role, slot_value in theme:
                    add(slot_role)
                    add(slot_value)

    output = sling.Store()
    schema = sling.DocumentSchema(output)

    for name, count in symbol_names.items():
        output.frame({"id": name})
    output.freeze()
    output.save(output_filename, binary=not text)
    return output, symbol_names
Example 10
 def read(self, parses_filename):
   reader = sling.RecordReader(parses_filename)
   self.category_name_to_qid = {}                      # category name -> qid
   self.category_frame = {}                            # category qid -> frame
   self.full_signature_to_parse = defaultdict(list)    # signature -> parse
   self.coarse_signature_to_parse = defaultdict(list)  # signature -> parse
   store = sling.Store()
   for index, (qid, value) in enumerate(reader):
     if index > 0 and index % 20000 == 0:
       log.info("%d categories read" % index)
     frame = store.parse(value)
     self.category_name_to_qid[frame.name] = qid
     self.category_frame[qid] = frame
     for parse in frame("parse"):
       element = (qid, frame, parse)
       full_signature = util.full_parse_signature(parse)
       self.full_signature_to_parse[full_signature].append(element)
       coarse_signature = util.coarse_parse_signature(parse)
       self.coarse_signature_to_parse[coarse_signature].append(element)
Example 11
 def run(self, task):
   self.init(task)
   reader = sling.RecordReader(task.input("parses").name)
   writer = sling.RecordWriter(task.output("output").name)
   for key, value in reader:
     store = sling.Store(self.kb)
     category = store.parse(value)
     matches = self.matcher.for_parses(category, store, max_evidences=-1)
     frame_cache = {}   # (pid, qid) -> frame containing their match statistics
     for parse, parse_match in zip(category("parse"), matches):
       for span, span_match in zip(parse.spans, parse_match):
         span_key = (span.pids, span.qid)
         if span_key not in frame_cache:
           match_frame = span_match.as_frame(store)
           frame_cache[span_key] = match_frame
         span["fact_matches"] = frame_cache[span_key]
     writer.write(key, category.data(binary=True))
     task.increment("fact-matcher/categories-processed")
   reader.close()
   writer.close()
Example 12
def load(
    record: str,
    load_tokens: bool = True,
    load_mentions: bool = True
) -> Iterable[Tuple[sling.nlp.document.Document, Tuple[int, str, str]]]:
    """load documents from a .rec file.
    Warning: this may take good amount of RAM space (each *.rec file is 5.3GB).
    """
    for k, rec in sling.RecordReader(record):
        store = sling.Store(commons)
        # parse record into frame
        doc_frame = store.parse(rec)
        # instantiate a document
        #parsed_doc = sling.Document(doc_frame, store, DOCSCHEMA)
        parsed_doc = MyDocument(doc_frame,
                                store,
                                DOCSCHEMA,
                                load_tokens=load_tokens,
                                load_mentions=load_mentions)
        metadata = get_metadata(doc_frame)
        yield parsed_doc, metadata
Example 13
def read_corpus(file_pattern):
    docs = []
    if file_pattern.endswith(".zip"):
        with gfile.GFile(file_pattern, 'r') as f:
            buf = io.BytesIO(f.read())
            with zipfile.ZipFile(buf, 'r') as zipreader:
                docs = [None] * len(zipreader.namelist())
                for index, fname in enumerate(zipreader.namelist()):
                    docs[index] = zipreader.read(fname)
    elif file_pattern.endswith(".rec"):
        reader = sling.RecordReader(file_pattern)
        for _, value in reader:
            docs.append(value)
        reader.close()
    else:
        filenames = gfile.Glob(file_pattern)
        docs = [None] * len(filenames)
        for index, name in enumerate(filenames):
            with gfile.GFile(name, 'r') as f:
                docs[index] = f.read()
    print(len(docs), "files in", file_pattern)
    return docs
Example 14
    def __init__(self, recordio, commons, schema=None, gold=False, loop=False):
        self.filename = recordio
        self.commons_owned = False
        if isinstance(commons, str):
            self.commons = sling.Store()
            self.commons.load(commons)
            self.commons_owned = True
        else:
            assert isinstance(commons, sling.Store)
            self.commons = commons

        if schema is None or self.commons_owned:
            schema = sling.DocumentSchema(self.commons)
            if self.commons_owned:
                self.commons.freeze()
        assert schema is not None
        self.schema = schema

        self.reader = sling.RecordReader(recordio)
        self.generator = None
        self.loop = loop
        self.set_gold(gold)
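
For context, Example 2 constructs this class as Corpora(train_corpus_path, spec.commons, gold=True). Since commons may be either a store or a file name, both call styles below would work; the paths are hypothetical:

    import sling

    # Reuse an existing commons store and document schema (built here).
    commons_store = sling.Store()
    doc_schema = sling.DocumentSchema(commons_store)
    commons_store.freeze()
    corpus = Corpora("data/train.rec", commons_store, schema=doc_schema, gold=True)

    # Or let the corpus load and freeze its own commons store from a file.
    corpus = Corpora("data/train.rec", "data/commons")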
Example 15
  def compare(arg):
    base_reader = sling.RecordReader(arg.base)
    expt_reader = sling.RecordReader(arg.expt)

    commons = sling.Store()
    commons.load(arg.commons)
    schema = sling.DocumentSchema(commons)
    commons.freeze()
    
    store = sling.Store(commons)
    index = -1
    for (_, base_val), (_, expt_val) in zip(base_reader, expt_reader):
      index += 1
      base_doc = sling.Document(frame=store.parse(base_val), schema=schema)
      expt_doc = sling.Document(frame=store.parse(expt_val), schema=schema)

      # Basic checks.
      checker = Checker(index, base_doc, expt_doc, arg.diff)
      base = base_doc.frame["trace"]
      expt = expt_doc.frame["trace"]
      if base is None and expt is not None:
        checker.error('No trace in base document at index %d' % index)
      elif base is not None and expt is None:
        checker.error('No trace in expt document at index %d' % index)
      if base is None:
        continue

      # Traces should be over the same token range.
      checker.check_eq(base["begin"], expt["begin"], "Trace Begin")
      checker.check_eq(base["end"], expt["end"], "Trace End")

      # Check LSTM features.
      base_lstm = base["/trace/lstm_features"]
      expt_lstm = expt["/trace/lstm_features"]
      checker.check_eq(len(base_lstm), len(expt_lstm), "LSTM Features Length")
      for i in range(len(base_lstm)):
        checker.frame_eq(base_lstm[i], expt_lstm[i], \
          "LSTM features for token %d (%s)" % (i, base_doc.tokens[i].word))

      # Check steps.
      base_steps = base["/trace/steps"]
      expt_steps = expt["/trace/steps"]
      min_steps = min(len(base_steps), len(expt_steps))
      for i in range(min_steps):
        message = "Step %d's current token index" % i
        checker.check_eq(base_steps[i]["/trace/current"], \
          expt_steps[i]["/trace/current"], message)

        # Check FF features for the step.
        base_ff = base_steps[i]["/trace/ff_features"]
        expt_ff = expt_steps[i]["/trace/ff_features"]
        checker.check_eq(len(base_ff), len(expt_ff), \
          "# of FF features for step %d" % i)

        base_dict = {f["/trace/feature"] : f["/trace/values"] for f in base_ff}
        expt_dict = {f["/trace/feature"] : f["/trace/values"] for f in expt_ff}
        for k, v in base_dict.items():
          checker.check_eq(k in expt_dict, True, \
            "Step %d: FF feature %s not in expt" % (i, k))
          checker.check_eq(v, expt_dict[k], \
            "Step %d: FF feature %s has a different value in expt" % (i, k))
        for k, v in expt_dict.items():
          checker.check_eq(k in base_dict, True, \
            "Step %d: FF feature %s not in base" % (i, k))

        # Check action(s) in the step.
        base_actions = base_steps[i]["/trace/actions"]
        expt_actions = expt_steps[i]["/trace/actions"]
        for idx in range(min(len(base_actions), len(expt_actions))):
          checker.frame_eq(base_actions[idx]["/trace/predicted"], \
            expt_actions[idx]["/trace/predicted"],
            "Step %d, predicted action %d" % (i, idx),
            ["/trace/_str"])
          checker.frame_eq(base_actions[idx]["/trace/final"], \
            expt_actions[idx]["/trace/final"],
            "Step %d, final action %d" % (i, idx),
            ["/trace/_str"])

        # There should be the same number of actions in the step.
        checker.check_eq(len(base_actions), len(expt_actions), \
          "Step %d: # of actions" % i)

      # There should be the same number of steps.
      checker.check_eq(len(base_steps), len(expt_steps), "# of Steps")

    base_reader.close()
    expt_reader.close()
Example 16
    def run(self, task):
        # Get parameters.
        language = task.param("language")

        # Load knowledge base.
        log.info("Load knowledge base")
        kb = sling.Store()
        kb.load(task.input("kb").name)

        n_infobox = kb["/wp/infobox"]
        n_page_item = kb["/wp/page/item"]
        n_file = kb["/wp/info/file"]
        n_media = kb["/wp/media"]

        image_fields = [
            (kb["/wp/info/image"], kb["/wp/info/caption"]),
            (kb["/wp/info/cover"], kb["/wp/info/caption"]),
            (kb["/wp/info/logo"], kb["/wp/info/logo_caption"]),
            (kb["/wp/info/photo"], kb["/wp/info/photo_caption"]),
            (kb["/wp/info/flag_image"], kb["/wp/info/flag_caption"]),
        ]

        p_media = kb["media"]
        p_id = kb["id"]
        p_is = kb["is"]
        p_imported_from = kb["P143"]
        p_media_legend = kb["P2096"]

        image_properties = [
            kb["P18"],  # image
            kb["P154"],  # logo image
            kb["P41"],  # flag image
        ]

        lang = kb["/lang/" + language]
        wikipedia_item = lang["/lang/wikilang/wikipedia"]

        docschema = sling.DocumentSchema(kb)

        kb.freeze()

        # Fetch media titles for Wikipedia from yesterday.
        log.info("Fetch local media titles")
        yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
        mediaurl = "https://dumps.wikimedia.org/other/mediatitles/%s/" \
          "%swiki-%s-all-media-titles.gz" % (yesterday, language, yesterday)
        r = urllib.request.urlopen(mediaurl)
        mediatitles = set(gzip.decompress(r.read()).decode().split('\n'))
        task.increment("local_media_files", len(mediatitles))

        # Open output file.
        fout = open(task.output("output").name, "w")

        # Process input articles.
        for res in task.inputs("input"):
            log.info("Extract media files from", res.name)
            for _, data in sling.RecordReader(res.name):
                # Read article into store.
                store = sling.Store(kb)
                doc = store.parse(data)
                task.increment("documents")

                # Find first infobox.
                infobox = None
                for theme in doc(docschema.document_theme):
                    if theme.isa(n_infobox):
                        infobox = theme
                        break
                if infobox is None: continue
                task.increment("infoboxes")

                # Find images in infobox.
                imagelist = []
                for n_image, n_caption in image_fields:
                    image = infobox[n_image]
                    caption = infobox[n_caption]
                    if image is None: continue

                    # Get image for repeated image field.
                    if type(image) is sling.Frame:
                        group = image
                        image = group[n_file]
                        caption = group[n_caption]
                        if image is None: continue

                    if "{" in image or "[" in image:
                        # Structured annotations.
                        annotations = sling.lex(image,
                                                store=store,
                                                schema=docschema)
                        for theme in annotations.themes:
                            if theme.isa(n_media):
                                image = theme[p_is]
                                if image is not None:
                                    imagelist.append((image, None))
                                    task.increment("structured_annotations")
                    else:
                        # Image filename.
                        imagelist.append((image, caption))
                if len(imagelist) == 0: continue

                # Process list of images for item.
                known_images = 0
                image_frames = []
                item = doc[n_page_item]
                if item is None: continue
                for image, caption in imagelist:
                    # Disregard direct URLs for now.
                    if image.startswith("http://") or \
                       image.startswith("https://") or \
                       image.startswith("//"):
                        task.increment("url_images")
                        continue

                    # Trim image name. Remove File: prefix.
                    colon = image.find(':')
                    if colon > 0 and colon < 10: image = image[colon + 1:]
                    image = titlecase(image.strip()).replace('_', ' ')
                    if len(image) == 0 or image in default_images:
                        task.increment("empty_images")
                        continue
                    if image.endswith("&lrm;"): image = image[:-5]
                    frag = image.find('#')
                    if frag > 0: image = image[:frag]
                    image = html.unescape(image)
                    image = urllib.parse.unquote(image)

                    # Discard media files with unknown or ignored extensions.
                    dot = image.rfind('.')
                    ext = image[dot:].lower() if dot > 0 else None
                    if ext in ignored_extensions:
                        task.increment("ignored_image_format")
                        continue
                    if ext not in known_extensions:
                        log.info("unknown format:", item.id, image)
                        task.increment("unknown_image_format")
                        continue

                    # Get item from KB and check if image is already known.
                    task.increment("images")
                    known = False
                    for prop in image_properties:
                        for img in item(prop):
                            img = kb.resolve(img)
                            if img == image: known = True
                            known_images += 1
                    if known:
                        task.increment("known_images")
                        continue
                    task.increment("new_images")

                    # Check if image is in local Wikipedia or Wikimedia Commons.
                    fn = image.replace(' ', '_')
                    if fn in mediatitles:
                        urlbase = "https://upload.wikimedia.org/wikipedia/" + language
                        task.increment("local_images")
                    else:
                        urlbase = "https://upload.wikimedia.org/wikipedia/commons"
                        task.increment("commons_images")
                        if known_images == 0:
                            task.increment("commons_imaged_items")

                    # Compute URL for image.
                    md5 = md5hash(fn)
                    fn = fn.replace("?", "%3F")
                    fn = fn.replace("+", "%2B")
                    fn = fn.replace("&", "%26")
                    url = "%s/%s/%s/%s" % (urlbase, md5[0], md5[0:2], fn)

                    # Create frame for item with media image.
                    slots = [
                        (p_is, url),
                        (p_imported_from, wikipedia_item),
                    ]
                    if caption is not None:
                        capdoc = sling.lex(caption,
                                           store=store,
                                           schema=docschema)
                        captxt = capdoc.phrase(0, len(capdoc.tokens))
                        slots.append((p_media_legend, captxt))
                    image_frames.append(store.frame(slots))

                # Create item frame with extra image info.
                if len(image_frames) == 0: continue
                slots = [(p_id, item.id)]
                for image_frame in image_frames:
                    slots.append((p_media, image_frame))
                frame = store.frame(slots)
                fout.write(frame.data(utf8=True))
                fout.write("\n")
                if known_images == 0: task.increment("imaged_items")

        fout.close()
Example 17
import random
import sling
import sling.flags as flags

flags.define('--input', help='input file with documents')
flags.define('--output', help='output for shuffled documents')
flags.define('--seed',
             help='seed for shuffling the corpus',
             default=314159,
             type=int,
             metavar='NUM')

if __name__ == '__main__':
    flags.parse()

    # Read input corpus.
    reader = sling.RecordReader(flags.arg.input)
    records = [(key, value) for key, value in reader]
    reader.close()

    # Shuffle documents.
    r = random.Random(flags.arg.seed)
    r.shuffle(records)

    # Write shuffled documents to output.
    writer = sling.RecordWriter(flags.arg.output)
    for key, value in records:
        writer.write(key, value)
    writer.close()
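
Assuming the script above is saved as shuffle.py, it would be invoked along the lines of python shuffle.py --input documents.rec --output shuffled.rec --seed 42 (file names made up). Fixing the seed makes the shuffle reproducible across runs.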
Example 18
    def run(self, task):
        self.init(task)

        writer = sling.RecordWriter(task.output("output").name)
        rejected = sling.RecordWriter(task.output("rejected").name)
        inputs = [t.name for t in task.inputs("items")]

        for filename in inputs:
            reader = sling.RecordReader(filename)
            for index, (key, value) in enumerate(reader):
                store = sling.Store(self.kb)
                frame = store.parse(value)

                # Only process category items.
                if not self.is_category(frame):
                    rejected.write(key, "not_category")
                    continue

                # See if the category should be skipped.
                members = self.get_members(frame)
                reject, reason = self.reject(key, frame, members)
                if reject:
                    task.increment("skipped_categories/" + reason)
                    rejected.write(key, reason)
                    continue

                # First, collect the targets of all facts of all category members.
                qp_counts = self.qid_pid_counts(store, members)

                # Next, tokenize the category title.
                title = self.get_title(frame)
                colon = title.find(':')
                title = title[colon + 1:]
                document = sling.tokenize(title, store)

                # Next, find matches for all spans. These are reported as a list,
                # where ith item = spans that begin at token i (possibly an empty list).
                begin_to_spans = self.compute_spans(document, qp_counts)

                # Construct maximal parses with non-overlapping spans.
                parses = self.construct_parses(begin_to_spans)

                # Post-process parses.
                parses = self.post_process(parses)
                if len(parses) == 0 or (len(parses) == 1 and len(parses[0]) == 0):
                    task.increment("skipped_categories/no_parses")
                    rejected.write(key, "no_parses")
                    continue

                # Write parses as frames.
                frame = store.frame({"name": title, "members": members})
                frame["document"] = document.frame
                for parse in parses:
                    span_array = store.array(len(parse))
                    for i, span in enumerate(parse):
                        span_array[i] = store.frame({
                            "begin": span.begin,
                            "end": span.end,
                            "qid": span.qid,
                            "prior": span.prior,
                            "pids": list(span.pids),
                            "count": span.count
                        })
                    parse_frame = store.frame({"spans": span_array})
                    frame.append("parse", parse_frame)
                writer.write(key, frame.data(binary=True))
                task.increment("categories_accepted")

                # Compute histogram over number of parses.
                for b in self.num_parses_bins:
                    if len(parses) <= b:
                        task.increment("#parses <= %d" % b)
                if self.num_parses_bins[-1] < len(parses):
                    task.increment("#parses > %d" % self.num_parses_bins[-1])

            reader.close()
        writer.close()
        rejected.close()
Example 19
    def __init__(self):
        self.site = pywikibot.Site("wikidata", "wikidata")
        self.repo = self.site.data_repository()

        time_str = datetime.datetime.now().isoformat("-")[:19].replace(
            ":", "-")
        if flags.arg.test:
            record_file_name = "local/data/e/wikibot/test-" + flags.arg.input + ".rec"
            time_str = "test-" + time_str
        else:
            record_file_name = "local/data/e/wikibot/" + flags.arg.input + ".rec"
        status_file_name = "local/logs/wikibotlog-" + time_str + ".rec"
        self.record_file = sling.RecordReader(record_file_name)
        self.status_file = sling.RecordWriter(status_file_name)

        self.store = sling.Store()
        self.store.lockgc()
        print("loading kb")
        self.store.load("local/data/e/wiki/kb.sling")
        print("kb loaded")

        self.page_cat = self.store["/wp/page/category"]

        self.date_of_birth = self.store['P569']
        self.date_of_death = self.store['P570']

        self.n_item = self.store["item"]
        self.n_facts = self.store["facts"]
        self.n_provenance = self.store["provenance"]
        self.n_category = self.store["category"]
        self.n_url = self.store["url"]
        self.n_method = self.store["method"]
        self.n_status = self.store["status"]
        self.n_revision = self.store["revision"]
        self.n_skipped = self.store["skipped"]
        self.store.freeze()
        self.rs = sling.Store(self.store)

        self.wiki = {'fr': 'Q8447',    'en': 'Q328',    'da': 'Q181163', \
                     'pt': 'Q11921',   'fi': 'Q175482', 'es': 'Q8449', \
                     'pl': 'Q1551807', 'de': 'Q48183',  'nl': 'Q10000', \
                     'sv': 'Q169514',  'it': 'Q11920',  'no': 'Q191769'}
        self.languages = self.wiki.keys()
        self.wiki_sources = {}
        for lang, wp in self.wiki.items():
            # P143 means 'imported from Wikimedia project'
            source_claim = pywikibot.Claim(self.repo, "P143")
            target = pywikibot.ItemPage(self.repo, wp)
            source_claim.setTarget(target)
            self.wiki_sources[lang] = source_claim
        self.record_db = {}
        fname = "local/data/e/wiki/{}/[email protected]"
        for lang in self.languages:
            self.record_db[lang] = sling.RecordDatabase(fname.format(lang))

        # inferred from
        self.source_claim = pywikibot.Claim(self.repo, "P3452")
        # Wikimedia import URL
        self.url_source_claim = pywikibot.Claim(self.repo, "P4656")
        # imported from Wikimedia project
        self.wp_source_claim = pywikibot.Claim(self.repo, "P143")
        self.en_wp = pywikibot.ItemPage(self.repo, "Q328")
        self.wp_source_claim.setTarget(self.en_wp)

        # referenced (on)
        self.time_claim = pywikibot.Claim(self.repo, "P813")
        today = datetime.date.today()
        time_target = pywikibot.WbTime(year=today.year,
                                       month=today.month,
                                       day=today.day)
        self.time_claim.setTarget(time_target)

        self.uniq_prop = {self.date_of_birth, self.date_of_death}
        kb = self.store
        # Collect unique-valued properties.
        # They will be used to update claims in Wikidata accordingly.
        constraint_role = kb["P2302"]
        unique = kb["Q19474404"]  # single-value constraint
        for prop in kb["/w/entity"]("role"):
            for constraint_type in prop(constraint_role):
                if kb.resolve(constraint_type) == unique:
                    self.uniq_prop.add(prop)
Example 20
 def process_log_data(self, files):
   no_of_files = len(files)
   file_no = 0
   rs = sling.Store(self.store)
   skipped = 0
   updated = 0
   errors = 0
   deleted = 0
   changed = 0
   redirected = 0
   updates = {}
   for r_file in files:
     file_no += 1
     print "Processing file {:4d} of {} ({})".format(file_no,
                                                     no_of_files,
                                                     r_file)
     print r_file
     reader = sling.RecordReader(r_file)
     last_updated = updated
     for item_str, record in reader:
       rec = rs.parse(record)
       status = rec[self.n_status]
       if self.n_skipped in status:
         skipped += 1
         continue
       elif self.n_revision not in status:
         print "ERROR - unknown status"
         errors += 1
         continue
       updated += 1
       wd_item = pywikibot.ItemPage(self.repo, item_str)
       if wd_item.isRedirectPage():
         redirected += 1
         continue
       wd_claims = wd_item.get().get('claims')
       facts = rec[self.n_facts]
       for prop, val in facts:
          p_claims = wd_claims.get(str(prop), [])
         if not p_claims:
           deleted += 1
           continue
         for wd_claim in p_claims:
           if wd_claim.type == "time":
             date = sling.Date(val) # parse date from record
             precision = precision_map[date.precision] # sling to wikidata
             target = pywikibot.WbTime(year=date.year, precision=precision)
           elif wd_claim.type == 'wikibase-item':
             target = pywikibot.ItemPage(self.repo, val)
           else:
             # TODO add location and possibly other types
             print "Error: Unknown claim type", claim.type
             continue
           if not wd_claim.target_equals(target):
              print(item_str, target, wd_claim.target)
             changed += 1
     reader.close()
      print(updated - last_updated)
     f = r_file.split("-")
     date = int(f[1] + f[2] + f[3])
     if date not in updates: updates[date] = 0
     updates[date] += (updated - last_updated)
    print(skipped, "skipped,", updated, "updated,", deleted, "deleted,",
          changed, "changed,", errors, "error records in file")
    print("Done processing last file")
    # Print number of accumulated updates over time.
    first = min(updates)
    acc_upd = 0
    d = datetime.date(first // 10000, (first % 10000) // 100, first % 100)
    while d <= datetime.date.today():
      num = d.year * 10000 + d.month * 100 + d.day
      if num in updates: acc_upd += updates[num]
      print(d.strftime("%Y-%m-%d") + "," + str(acc_upd))
      d += datetime.timedelta(days=1)
Example 21
 def __init__(self, filename, commons=None):
     self.input = sling.RecordReader(filename)
     self.iter = iter(self.input)
     self.commons = sling.Store() if commons is None else commons
     self.docschema = sling.DocumentSchema(self.commons)
     if commons is None: self.commons.freeze()
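
The constructor above only sets up the reader and schema. A minimal sketch of how the wrapped corpus might be iterated, reusing the per-record store pattern from the other examples; the next_document helper is hypothetical:

 def next_document(self):
     """Hypothetical helper: parse the next record into a Document."""
     key, value = next(self.iter)  # raises StopIteration at end of input
     store = sling.Store(self.commons)
     return key, sling.Document(store.parse(value), store, self.docschema)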
Example 22
    def run(self):
        month = "(" + "|".join(self.months.keys()) + ")"
        day = "(\d{1,2})"
        year = "(\d{4})"
        date = "(?:(?:" + day + " " + month + " " + year + ")|"
        date += "(?:" + month + " " + day + ", " + year + "))"
        date += "(?:[^)]+?)?"
        dates = date + u"\s*-+\s*" + date
        dates = u"(?:(?:(?:born|b\.|n\xe9e),? ([^0-9)]*?)" + date + \
          "(?:(?:died|d\.),? [^0-9)]*?" + date + ")?)|(?:" + dates + "))"
        pat = "(?:[^(]|\([^0-9]*\))*?\([^0-9)]*?" + dates + "\s*\)"
        rec = re.compile(pat)

        self.out_file = "local/data/e/wikibot/birth-death-dates.rec"
        record_file = sling.RecordWriter(self.out_file)
        records = 0
        store = sling.Store(self.kb)

        for i in range(10):
            i_file = "local/data/e/wiki/en/documents-0000" + str(
                i) + "-of-00010.rec"
            print(i_file, records)
            for (item_id, record) in sling.RecordReader(i_file):
                item = self.kb[item_id]
                if self.human not in item(self.instanceof): continue
                if self.precise_date(item(self.date_of_birth)) and \
                   self.precise_date(item(self.date_of_death)):
                    continue
                parsed_record = sling.Store().parse(record)
                doc = sling.Document(parsed_record)
                raw_text = parsed_record['text']
                if len(raw_text) == 0: continue
                start_index = raw_text.find("<b>") + len("<b>")
                first = 1
                while first < len(doc.tokens) and \
                      doc.tokens[first].start <= start_index:
                    first += 1
                last = first
                while last < len(doc.tokens) and doc.tokens[last].brk < 3:
                    last += 1
                text = doc.phrase(max(0, first - 1),
                                  min(len(doc.tokens), last + 15))
                m = rec.match(text)
                if m is None: continue
                if text.find("(baptised") >= 0 or text.find("throne") >= 0:
                    continue
                if text.find("(baptized") >= 0 or text.find("partner") >= 0:
                    continue
                if m.group(2) or m.group(5):
                    first = self.date_from_match(1, m)
                    if first.year < 1753:
                        continue  # possibly Julian calendar date
                    if m.group(8) or m.group(11):
                        second = self.date_from_match(7, m)
                        if second.year < 1753:
                            continue  # possibly Julian calendar date
                        facts = store.frame({
                            self.date_of_birth: first.value(),
                            self.date_of_death: second.value()
                        })
                    else:
                        # Only one date match
                        mg1 = m.group(1)
                        dob = item(self.date_of_birth)
                        dod = item(self.date_of_death)
                        if mg1 and max(mg1.find("died"), mg1.find("d.")) >= 0:
                            # death date only
                            if self.precise_date(dod): continue
                            if self.same_year(first.year, dob):
                                continue  # b&d too close
                            facts = store.frame({
                                self.date_of_death:
                                first.value(),
                            })
                        else:
                            # birth date only
                            if self.precise_date(dob): continue
                            if self.same_year(first.year, dod):
                                continue  # b&d too close
                            facts = store.frame({
                                self.date_of_birth:
                                first.value(),
                            })
                else:
                    first = self.date_from_match(13, m)
                    second = self.date_from_match(19, m)
                    if min(first.year, second.year) < 1753:
                        continue  # possibly Julian
                    facts = store.frame({
                        self.date_of_birth: first.value(),
                        self.date_of_death: second.value()
                    })
                records += 1
                provenance = store.frame({
                    self.url:
                    parsed_record['url'],
                    self.method:
                    "English Wikipedia dates for '" + str(item.name) + "'"
                })
                fact = store.frame({
                    self.item: item,
                    self.facts: facts,
                    self.provenance: provenance
                })
                record_file.write(item.id, fact.data(binary=True))
        record_file.close()
        print(records, "birth/death date records written to file:", self.out_file)
Example 23

import sys
import sling

# Bind the document schema symbols below into a commons store.
commons = sling.Store()
commons.parse("""
{=text =/s/document/text}
{=tokens =/s/document/tokens}
{=mention =/s/document/mention}
{=theme =/s/document/theme}
{=token =/s/token}
{=index =/s/token/index}
{=start =/s/token/start}
{=size =/s/token/length}
{=break =/s/token/break}
{=word =/s/token/text}
{=phrase =/s/phrase}
{=begin =/s/phrase/begin}
{=length =/s/phrase/length}
{=evokes =/s/phrase/evokes}
""")
commons.freeze()

# Convert documents.
num_docs = 0
fin = sling.RecordReader(sys.argv[1])
fout = sling.RecordWriter(sys.argv[2])
for key, value in fin:
    store = sling.Store(commons)
    f = store.parse(value)
    fout.write(key, f.data(binary=True))
    num_docs += 1

fin.close()
fout.close()
print(num_docs, "documents converted")