Exemple #1
0
def validate(commons, recordio_filename, output_recordio='', options=None):
  """Validates every document of a recordio file.

  Args:
    commons: either a sling.Store, or the filename (str) of a commons store,
      which is then loaded into a new store and frozen here.
    recordio_filename: recordio file holding the documents to validate.
    output_recordio: if non-empty, documents that pass validation are written
      to this recordio file.
    options: validation options. A fresh Options() is created when omitted, so
      calls never share a default instance.

  Returns:
    Tuple (aggregate, count, written): the aggregated results, the number of
    documents counted, and the number of documents written. A bad document
    that stops the run early is part of the aggregate but not of the count.
  """
  if options is None:
    options = Options()

  if not isinstance(commons, sling.Store):
    assert type(commons) is str
    filename = commons
    commons = sling.Store()
    commons.load(filename)
    # The schema is created on the store before the store is frozen.
    schema = sling.DocumentSchema(commons)
    commons.freeze()
  else:
    schema = sling.DocumentSchema(commons)

  corpus = corpora.Corpora(recordio_filename, commons, schema)
  aggregate = Results(options)
  count = 0
  written = 0
  writer = None
  if output_recordio != '':
    writer = sling.RecordWriter(output_recordio)

  try:
    for document in corpus:
      results = _validate(count, document, options)
      aggregate.add(results)
      if not results.ok() and options.stop_on_first_bad_document:
        print("Stopping after first bad document as requested")
        break
      count += 1
      if writer is not None and results.ok():
        writer.write('', document.frame.data(binary=True))
        written += 1
  finally:
    # Close the output file even when validation raises midway.
    if writer is not None:
      writer.close()

  return aggregate, count, written
Exemple #2
0
 def __init__(self, filename, commons=None):
     """Opens a record database and sets up the store and document schema.

     Args:
       filename: path handed to sling.RecordDatabase.
       commons: optional store to use. When omitted, a private store is
         created, a document schema is bound to it, and it is frozen.
         When given, a schema is only created if the store contains a
         "document" entry; otherwise self.docschema is None.
     """
     self.input = sling.RecordDatabase(filename)
     self.iter = iter(self.input)
     if commons is None:
         self.commons = sling.Store()
         self.docschema = sling.DocumentSchema(self.commons)
         self.commons.freeze()
     else:
         self.commons = commons
         if "document" in commons:
             self.docschema = sling.DocumentSchema(commons)
         else:
             self.docschema = None
Exemple #3
0
    def __init__(self, commons, spec):
        """Binds a document schema to `commons` and creates the analyzer.

        Args:
          commons: store handed to sling.DocumentSchema and to the analyzer.
          spec: analyzer specification, passed on to sling.api.Analyzer.
        """
        self.commons = commons

        # The schema is set up on the store before the analyzer is built.
        doc_schema = sling.DocumentSchema(commons)
        self.schema = doc_schema

        analyzer = sling.api.Analyzer(commons, spec)
        self.analyzer = analyzer
Exemple #4
0
 def load_corpus(self, corpus_file):
     """Open `corpus_file` as self.corpus and prepare a frozen commons store.

     Also sets self.commons (a new store) and self.docschema (the document
     schema created on that store before it is frozen).
     """
     print("loading the corpus...")
     self.corpus = sling.Corpus(corpus_file)

     # Create the store, attach the schema, then freeze.
     self.commons = store = sling.Store()
     self.docschema = sling.DocumentSchema(store)
     store.freeze()
 def __init__(self, commons, summary, options, schema=None):
     """Stores the inputs and builds the schema helpers.

     Args:
       commons: store used for the document and constituency schemas.
       summary: summary object; its `statistics` feed the head finder.
       options: options object, stored as-is.
       schema: optional document schema; one is created on `commons` when
         omitted.
     """
     self.commons = commons
     self.schema = sling.DocumentSchema(commons) if schema is None else schema
     self.constituency_schema = ConstituencySchema(commons)
     self.summary, self.options = summary, options
     self.head_finder = HeadFinder(summary.statistics)
Exemple #6
0
 def load_corpus(
         self,
         corpus_file="local/data/e/wiki/en/documents-00000-of-00010.rec"):
     """Open `corpus_file` as self.corpus and prepare a frozen commons store.

     Also sets self.commons (a new store) and self.docschema (the document
     schema created on that store before it is frozen).
     """
     print("loading the corpus...")
     self.corpus = sling.Corpus(corpus_file)

     # Create the store, attach the schema, then freeze.
     self.commons = store = sling.Store()
     self.docschema = sling.DocumentSchema(store)
     store.freeze()
Exemple #7
0
def build(recordio_filenames, output_filename, text=False):
    """Builds and saves a store with the symbols used in the given corpora.

    Every frame-valued slot role and slot value found in evoked frames and
    themes is recorded by id, unless that id already resolves in the (empty,
    schema-only) commons store.

    Args:
      recordio_filenames: iterable of recordio files with documents.
      output_filename: file the resulting store is saved to.
      text: if true the store is saved in text format, otherwise in binary.

    Returns:
      Tuple (output store, dict mapping symbol name to occurrence count).
    """
    commons = sling.Store()
    schema = sling.DocumentSchema(commons)
    commons.freeze()

    # "thing" is always part of the output.
    symbol_names = {"thing": 1}

    # Adds handle's id to 'symbol_names' if it is already not in 'commons'.
    def add(handle):
        if not isinstance(handle, sling.Frame) or handle.id is None: return

        id_str = str(handle.id)
        if commons[id_str] is not None: return

        symbol_names[id_str] = symbol_names.get(id_str, 0) + 1

    for filename in recordio_filenames:
        reader = sling.RecordReader(filename)
        try:
            for _, value in reader:
                store = sling.Store(commons)
                document = sling.Document(store.parse(value), schema=schema)

                for mention in document.mentions:
                    for frame in mention.evokes():
                        for slot_role, slot_value in frame:
                            add(slot_role)
                            add(slot_value)

                for theme in document.themes:
                    for slot_role, slot_value in theme:
                        add(slot_role)
                        add(slot_value)
        finally:
            # Release the input file even if a record fails to parse.
            reader.close()

    output = sling.Store()
    schema = sling.DocumentSchema(output)

    # Plain dict iteration works on both Python 2 and 3; counts are not needed
    # to create the frames.
    for name in symbol_names:
        output.frame({"id": name})
    output.freeze()
    output.save(output_filename, binary=not text)
    return output, symbol_names
Exemple #8
0
    def __init__(self, filename, store=None):
        """Loads a parser from `filename` into a commons store.

        Args:
          filename: parser file handed to sling.api.Parser.
          store: optional commons store to load the parser into. When
            omitted, a private store is created and frozen at the end.
        """
        # Create new commons store for parser if needed.
        private_commons = store is None
        self.commons = sling.Store() if private_commons else store

        # Load parser.
        self.parser = sling.api.Parser(self.commons, filename)

        # Initialize document schema in commons.
        self.schema = sling.DocumentSchema(self.commons)

        # Freeze store if it is a private commons store for the parser.
        if private_commons:
            self.commons.freeze()
Exemple #9
0
def extract_entity_mentions(nq_data, labelled_record):
    """Parse output corpus and create map from tokens to entity ids.

    Each record key has the form "index|ans_type|idx|ans_id"; ans_type selects
    where in nq_data[index] the entity map of that record is stored.

    Args:
      nq_data: A python dictionary containing NQ data of 1 train/dev shard
      labelled_record: Sling output document with labelled paragraphs

    Returns:
      nq_data: Original object augmented with entity maps. Each entity map
        maps a mention's begin token to a list of (end token, entity id).
    """
    commons = sling.Store()
    docschema = sling.DocumentSchema(commons)
    commons.freeze()

    recin = sling.RecordReader(labelled_record)
    try:
        for key, value in recin:
            store = sling.Store(commons)
            doc = sling.Document(store.parse(value), store, docschema)
            index, ans_type, idx, ans_id = key.decode("utf-8").split("|")
            entity_map = {}

            # Parse entity mentions labelled by sling
            for m in doc.mentions:
                e = [i["is"] for i in m.evokes()]
                if not e:
                    continue
                if is_sling_entity(e):
                    e_val = e[0]["id"]
                    entity_map.setdefault(m.begin, []).append((m.end, e_val))

            if ans_type == "annotated_long_answer":
                nq_data[index]["annotations"][int(
                    idx)]["long_answer"]["entity_map"] = entity_map
            elif ans_type == "question":
                nq_data[index]["question_entity_map"] = entity_map
            elif ans_type == "annotated_short_answer":
                nq_data[index]["annotations"][int(idx)]["short_answers"][int(
                    ans_id)]["entity_map"] = entity_map
            else:
                # Any other type is stored on the long answer candidate.
                nq_data[index]["long_answer_candidates"][int(
                    idx)]["entity_map"] = entity_map
    finally:
        # Release the input file even if a record is malformed.
        recin.close()
    return nq_data
Exemple #10
0
def convert(brat_dir_path: str, output_file_path: str, verbose: bool = False):
    """Convert every document of a brat repository into one SLING record file.

    Args:
        brat_dir_path: directory of the brat repository to read.
        output_file_path: record file the converted documents are written to.
        verbose: if true, print how many documents were loaded.
    """
    # load the brat repository
    repo = RepoModel(brat_dir_path)
    if verbose:
        print('Loaded {} document(s) from {}'.format(len(repo.documents),
                                                     brat_dir_path))

    # load the SLING commons store and the document schema
    commons = load_commons_store()
    schema = sling.DocumentSchema(commons)
    commons.freeze()

    writer = sling.RecordWriter(output_file_path)
    try:
        for document_name in repo.documents:
            document = repo.documents[document_name]
            reader = DocReader(document)
            converter = DocConverter(commons, schema, document_name)
            converter.convert(reader, writer)
    finally:
        # Close the output file even when a document fails to convert.
        writer.close()
Exemple #11
0
    def build(self, commons_path, corpora_path):
        # Prepare lexical dictionaries.
        self.words = Lexicon(self.words_normalize_digits)
        self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)

        # Initialize training corpus.
        corpora = Corpora(corpora_path, commons_path)

        # Collect word and affix lexicons.
        for document in corpora:
            for token in document.tokens:
                word = token.word
                self.words.add(word)
                for s in self.get_suffixes(word):
                    assert type(s) is str
                    self.suffix.add(s)
        print "Words:", self.words.size(), "items in lexicon, including OOV"
        print "Suffix:", self.suffix.size(), "items in lexicon"

        # Load common store, but not freeze it yet. We will add the action table
        # and cascade specification to it.
        self.commons_path = commons_path
        self.commons = sling.Store()
        self.commons.load(commons_path)
        schema = sling.DocumentSchema(self.commons)

        # Prepare action table and cascade.
        self._build_action_table(corpora)
        self.cascade = cascade.ShiftMarkCascade(self.actions)
        print self.cascade

        # Save cascade specification in commons.
        _ = self.cascade.as_frame(self.commons,
                                  delegate_cell_prefix="delegate")

        # Freeze the common store.
        self.commons.freeze()

        # Add feature specs.
        self._specify_features()
Exemple #12
0
    def __init__(self, recordio, commons, schema=None, gold=False, loop=False):
        """Sets up a reader over `recordio` with a commons store and schema.

        Args:
          recordio: recordio file to read.
          commons: either a sling.Store, or the filename of a commons store,
            which is then loaded into a store owned (and frozen) by this
            object.
          schema: optional document schema; only used when the commons store
            is not owned. Otherwise a new one is created.
          gold: forwarded to set_gold().
          loop: stored as self.loop.
        """
        self.filename = recordio

        # A string means "load the store from this file and own it".
        self.commons_owned = False
        if not isinstance(commons, str):
            assert isinstance(commons, sling.Store)
            self.commons = commons
        else:
            self.commons = sling.Store()
            self.commons.load(commons)
            self.commons_owned = True

        # An owned store always gets a new schema, and is frozen after that.
        if self.commons_owned or schema is None:
            schema = sling.DocumentSchema(self.commons)
        if self.commons_owned:
            self.commons.freeze()
        assert schema is not None
        self.schema = schema

        self.reader = sling.RecordReader(recordio)
        self.loop = loop
        self.generator = None
        self.set_gold(gold)
            flags.define('--' + option,
                         help=option,
                         default=value,
                         type=type(value))
    flags.parse()
    for option, value in flags.arg.__dict__.items():
        if option in options.__dict__:
            options.__dict__[option] = value
            print("Setting option", option, "to", value)

    if options.doc_per_sentence:
        assert options.skip_coref, \
          "Per-sentence documents can only be output without coreference."

    commons = sling.Store()
    schema = sling.DocumentSchema(commons)

    if not flags.arg.omit_constituents:
        assert os.path.exists(flags.arg.constituency_schema)
        commons.load(flags.arg.constituency_schema)

    commons.freeze()
    writer = sling.RecordWriter(flags.arg.output)

    # Read allowed ids, if provided.
    allowed_ids = set()
    if len(options.allowed_ids_file) > 0:
        with open(options.allowed_ids_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line.endswith('.gold_conll'):
Exemple #14
0
import logging
from typing import NamedTuple, Tuple, List, Iterable
from pathlib import Path
from tqdm import tqdm
import sling
from sling.nlp.document import DocumentSchema, Token, Mention

LOGGER = logging.getLogger(__name__)
# NOTE(review): hard-coded absolute path into one user's home directory.
ANNOTATED_DIR = Path("/home/hiroakih/tir3/sling/local/data/e/ner/en")

# Module-level SLING setup, run once at import time: create an empty commons
# store, create the shared document schema on it, then freeze the store.
# The schema is created before the freeze -- presumably required; confirm.
commons = sling.Store()
DOCSCHEMA = sling.DocumentSchema(commons)
commons.freeze()


class MyDocument(object):
    def __init__(self,
                 frame=None,
                 store=None,
                 schema=None,
                 load_tokens=True,
                 load_mentions=True):
        # Create store, frame, and schema if missing.
        if frame != None:
            store = frame.store()
        if store == None:
            store = sling.Store()
        if schema == None:
            schema = DocumentSchema(store)
        if frame == None:
Exemple #15
0
  def compare(arg):
    """Compares the traces of base and experiment documents pairwise.

    Documents are read in lockstep from arg.base and arg.expt, parsed into a
    store on top of the commons loaded from arg.commons, and their "trace"
    frames are compared via a Checker (created with arg.diff): token range,
    LSTM features, and per-step current token, FF features and actions.
    Document pairs where a trace is missing on either side are reported
    (if only one side lacks it) and skipped.
    """
    base_reader = sling.RecordReader(arg.base)
    expt_reader = sling.RecordReader(arg.expt)
    try:
      commons = sling.Store()
      commons.load(arg.commons)
      schema = sling.DocumentSchema(commons)
      commons.freeze()

      store = sling.Store(commons)
      pairs = zip(base_reader, expt_reader)
      for index, ((_, base_val), (_, expt_val)) in enumerate(pairs):
        base_doc = sling.Document(frame=store.parse(base_val), schema=schema)
        expt_doc = sling.Document(frame=store.parse(expt_val), schema=schema)

        # The checker is needed already for reporting a missing trace, so it
        # is created before the basic checks.
        checker = Checker(index, base_doc, expt_doc, arg.diff)

        # Basic checks.
        base = base_doc.frame["trace"]
        expt = expt_doc.frame["trace"]
        if base is None and expt is not None:
          checker.error('No trace in base document at index %d' % index)
        elif base is not None and expt is None:
          checker.error('No trace in expt document at index %d' % index)
        if base is None or expt is None:
          # Nothing to compare without both traces.
          continue

        # Traces should be over the same token range.
        checker.check_eq(base["begin"], expt["begin"], "Trace Begin")
        checker.check_eq(base["end"], expt["end"], "Trace End")

        # Check LSTM features. Only the common prefix is compared pairwise so
        # that a length mismatch is reported instead of raising IndexError.
        base_lstm = base["/trace/lstm_features"]
        expt_lstm = expt["/trace/lstm_features"]
        checker.check_eq(len(base_lstm), len(expt_lstm), "LSTM Features Length")
        for i in range(min(len(base_lstm), len(expt_lstm))):
          checker.frame_eq(base_lstm[i], expt_lstm[i], \
            "LSTM features for token %d (%s)" % (i, base_doc.tokens[i].word))

        # Check steps.
        base_steps = base["/trace/steps"]
        expt_steps = expt["/trace/steps"]
        min_steps = min(len(base_steps), len(expt_steps))
        for i in range(min_steps):
          message = "Step %d's current token index" % i
          checker.check_eq(base_steps[i]["/trace/current"], \
            expt_steps[i]["/trace/current"], message)

          # Check FF features for the step.
          base_ff = base_steps[i]["/trace/ff_features"]
          expt_ff = expt_steps[i]["/trace/ff_features"]
          checker.check_eq(len(base_ff), len(expt_ff), \
            "# of FF features for step %d" % i)

          base_dict = {f["/trace/feature"] : f["/trace/values"] for f in base_ff}
          expt_dict = {f["/trace/feature"] : f["/trace/values"] for f in expt_ff}
          for k, v in base_dict.items():
            checker.check_eq(k in expt_dict, True, \
              "Step %d: FF feature %s not in expt" % (i, k))
            # Values can only be compared when the feature exists on both
            # sides; its absence has already been reported above.
            if k in expt_dict:
              checker.check_eq(v, expt_dict[k], \
                "Step %d: FF feature %s has a different value in expt" % (i, k))
          for k in expt_dict:
            checker.check_eq(k in base_dict, True, \
              "Step %d: FF feature %s not in base" % (i, k))

          # Check action(s) in the step.
          base_actions = base_steps[i]["/trace/actions"]
          expt_actions = expt_steps[i]["/trace/actions"]
          for idx in range(min(len(base_actions), len(expt_actions))):
            checker.frame_eq(base_actions[idx]["/trace/predicted"], \
              expt_actions[idx]["/trace/predicted"],
              "Step %d, predicted action %d" % (i, idx),
              ["/trace/_str"])
            checker.frame_eq(base_actions[idx]["/trace/final"], \
              expt_actions[idx]["/trace/final"],
              "Step %d, final action %d" % (i, idx),
              ["/trace/_str"])

          # There should be the same number of actions in the step.
          checker.check_eq(len(base_actions), len(expt_actions), \
            "Step %d: # of actions" % i)

        # There should be the same number of steps.
        checker.check_eq(len(base_steps), len(expt_steps), "# of Steps")
    finally:
      # Release both input files even when a check raises.
      base_reader.close()
      expt_reader.close()
Exemple #16
0
    def run(self, task):
        """Extracts infobox images from articles and writes media frames.

        Loads the knowledge base from task input "kb", downloads yesterday's
        list of media titles for the wiki of the task's "language" parameter,
        and then reads every record of the task inputs named "input". For
        each article with an infobox and a page item, the images found in the
        infobox that are not already present on the item are turned into
        media frames, and one frame per item is written as a line of text to
        the task output "output". Progress is reported through task counters.
        """
        # Get parameters.
        language = task.param("language")

        # Load knowledge base.
        log.info("Load knowledge base")
        kb = sling.Store()
        kb.load(task.input("kb").name)

        n_infobox = kb["/wp/infobox"]
        n_page_item = kb["/wp/page/item"]
        n_file = kb["/wp/info/file"]
        n_media = kb["/wp/media"]

        # Pairs of (image field, caption field) looked up in the infobox.
        image_fields = [
            (kb["/wp/info/image"], kb["/wp/info/caption"]),
            (kb["/wp/info/cover"], kb["/wp/info/caption"]),
            (kb["/wp/info/logo"], kb["/wp/info/logo_caption"]),
            (kb["/wp/info/photo"], kb["/wp/info/photo_caption"]),
            (kb["/wp/info/flag_image"], kb["/wp/info/flag_caption"]),
        ]

        p_media = kb["media"]
        p_id = kb["id"]
        p_is = kb["is"]
        p_imported_from = kb["P143"]
        p_media_legend = kb["P2096"]

        # Item properties checked for images that are already known.
        image_properties = [
            kb["P18"],  # image
            kb["P154"],  # logo image
            kb["P41"],  # flag image
        ]

        lang = kb["/lang/" + language]
        wikipedia_item = lang["/lang/wikilang/wikipedia"]

        # All symbol lookups and the schema are set up before the freeze.
        docschema = sling.DocumentSchema(kb)

        kb.freeze()

        # Fetch media titles for Wikipedia from yesterday.
        log.info("Fetch local media titles")
        yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
        mediaurl = "https://dumps.wikimedia.org/other/mediatitles/%s/" \
          "%swiki-%s-all-media-titles.gz" % (yesterday, language, yesterday)
        r = urllib.request.urlopen(mediaurl)
        mediatitles = set(gzip.decompress(r.read()).decode().split('\n'))
        task.increment("local_media_files", len(mediatitles))

        # Open output file.
        # NOTE(review): the file is only closed at the end of the method, so
        # it stays open if processing raises -- consider a with-block.
        fout = open(task.output("output").name, "w")

        # Process input articles.
        for res in task.inputs("input"):
            log.info("Extract media files from", res.name)
            for _, data in sling.RecordReader(res.name):
                # Read article into store.
                store = sling.Store(kb)
                doc = store.parse(data)
                task.increment("documents")

                # Find first infobox.
                infobox = None
                for theme in doc(docschema.document_theme):
                    if theme.isa(n_infobox):
                        infobox = theme
                        break
                if infobox is None: continue
                task.increment("infoboxes")

                # Find images in infobox.
                imagelist = []
                for n_image, n_caption in image_fields:
                    image = infobox[n_image]
                    caption = infobox[n_caption]
                    if image is None: continue

                    # Get image for repeated image field.
                    if type(image) is sling.Frame:
                        group = image
                        image = group[n_file]
                        caption = group[n_caption]
                        if image is None: continue

                    if "{" in image or "[" in image:
                        # Structured annotations.
                        annotations = sling.lex(image,
                                                store=store,
                                                schema=docschema)
                        for theme in annotations.themes:
                            if theme.isa(n_media):
                                image = theme[p_is]
                                if image is not None:
                                    imagelist.append((image, None))
                                    task.increment("structured_annotations")
                    else:
                        # Image filename.
                        imagelist.append((image, caption))
                if len(imagelist) == 0: continue

                # Process list of images for item.
                known_images = 0
                image_frames = []
                item = doc[n_page_item]
                if item is None: continue
                for image, caption in imagelist:
                    # Disregard direct URLs for now.
                    if image.startswith("http://") or \
                       image.startswith("https://") or \
                       image.startswith("//"):
                        task.increment("url_images")
                        continue

                    # Trim image name. Remove File: prefix.
                    # Only a colon within the first ten characters is treated
                    # as the end of a prefix.
                    colon = image.find(':')
                    if colon > 0 and colon < 10: image = image[colon + 1:]
                    image = titlecase(image.strip()).replace('_', ' ')
                    if len(image) == 0 or image in default_images:
                        task.increment("empty_images")
                        continue
                    if image.endswith("&lrm;"): image = image[:-5]
                    frag = image.find('#')
                    if frag > 0: image = image[:frag]
                    image = html.unescape(image)
                    image = urllib.parse.unquote(image)

                    # Discard media files with unknown or ignored extensions.
                    # ext is None when the name has no dot after its first
                    # character.
                    dot = image.rfind('.')
                    ext = image[dot:].lower() if dot > 0 else None
                    if ext in ignored_extensions:
                        task.increment("ignored_image_format")
                        continue
                    if ext not in known_extensions:
                        log.info("unknown format:", item.id, image)
                        task.increment("unknown_image_format")
                        continue

                    # Get item from KB and check if image is already known.
                    # NOTE(review): known_images is incremented for every
                    # existing image value on the item, whether or not it
                    # matches this image -- confirm that is intended.
                    task.increment("images")
                    known = False
                    for prop in image_properties:
                        for img in item(prop):
                            img = kb.resolve(img)
                            if img == image: known = True
                            known_images += 1
                    if known:
                        task.increment("known_images")
                        continue
                    task.increment("new_images")

                    # Check if image is in local Wikipedia or Wikimedia Commons.
                    fn = image.replace(' ', '_')
                    if fn in mediatitles:
                        urlbase = "https://upload.wikimedia.org/wikipedia/" + language
                        task.increment("local_images")
                    else:
                        urlbase = "https://upload.wikimedia.org/wikipedia/commons"
                        task.increment("commons_images")
                        if known_images == 0:
                            task.increment("commons_imaged_items")

                    # Compute URL for image.
                    # The path is <base>/<1st hash char>/<first 2 hash chars>/<name>;
                    # the hash is taken before the characters below are escaped.
                    md5 = md5hash(fn)
                    fn = fn.replace("?", "%3F")
                    fn = fn.replace("+", "%2B")
                    fn = fn.replace("&", "%26")
                    url = "%s/%s/%s/%s" % (urlbase, md5[0], md5[0:2], fn)

                    # Create frame for item with media image.
                    slots = [
                        (p_is, url),
                        (p_imported_from, wikipedia_item),
                    ]
                    if caption != None:
                        # The caption is parsed as a document and its plain
                        # phrase over all tokens is used as the media legend.
                        capdoc = sling.lex(caption,
                                           store=store,
                                           schema=docschema)
                        captxt = capdoc.phrase(0, len(capdoc.tokens))
                        slots.append((p_media_legend, captxt))
                    image_frames.append(store.frame(slots))

                # Create item frame with extra image info.
                if len(image_frames) == 0: continue
                slots = [(p_id, item.id)]
                for image_frame in image_frames:
                    slots.append((p_media, image_frame))
                frame = store.frame(slots)
                fout.write(frame.data(utf8=True))
                fout.write("\n")
                if known_images == 0: task.increment("imaged_items")

        fout.close()
Exemple #17
0
 def __init__(self, filename, commons=None):
     """Opens a record reader and sets up the store and document schema.

     Args:
       filename: path handed to sling.RecordReader.
       commons: optional store to use. When omitted, a private store is
         created and frozen after the document schema has been bound to it.
     """
     self.input = sling.RecordReader(filename)
     self.iter = iter(self.input)
     private_commons = commons is None
     self.commons = sling.Store() if private_commons else commons
     self.docschema = sling.DocumentSchema(self.commons)
     # Only a store created here is frozen; a caller's store is left as is.
     if private_commons:
         self.commons.freeze()
Exemple #18
0
    def from_flow(self, fl):
        blob = fl.blob("spec")
        temp_dict = pickle.loads(blob.data)
        self.__dict__.update(temp_dict)

        # Read non-pickled fields.
        # Read common store.
        self.commons = sling.Store()
        temp_file = tempfile.NamedTemporaryFile(delete=False)
        filename = temp_file.name
        with open(filename, "wb") as f:
            f.write(fl.blob("commons").data)
        temp_file.close()
        self.commons.load(filename)
        _ = sling.DocumentSchema(self.commons)
        self.commons.freeze()
        os.unlink(filename)

        # Read action table from the commons.
        self.actions = Actions()
        self.actions.decode(self.commons["/table"])

        # Read cascade specification. This is done by calling eval()
        # on the class constructor. The classname is stored in the cascade frame.
        frame = self.commons["/cascade"]
        self.cascade = eval(frame["name"])(self.actions)
        print self.cascade

        # Read word lexicon.
        blob = fl.blob("lexicon")
        self.words = Lexicon(self.words_normalize_digits)
        self.words.read(blob.data.tobytes(),
                        chr(int(blob.get_attr("delimiter"))))
        print self.words.size(), "words read from flow's lexicon"

        # Read suffix table.
        self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)
        data = fl.blob("suffixes").data

        def read_int(mview):
            output = 0
            shift_bits = 0
            index = 0
            while index < len(mview):
                part = ord(mview[index])
                index += 1
                output |= (part & 127) << shift_bits
                shift_bits += 7
                if part & 128 == 0:
                    break
            return output, mview[index:]

        affix_type, data = read_int(data)  # affix type
        assert affix_type == 1

        max_length, data = read_int(data)  # max length
        assert max_length == self.suffixes_max_length

        num, data = read_int(data)  # num affixes
        for _ in xrange(num):
            num_bytes, data = read_int(data)
            word = data[0:num_bytes].tobytes()
            self.suffix.add(word)
            data = data[num_bytes:]
            num_chars, data = read_int(data)
            if num_chars > 0:
                shorter_index, data = read_int(data)
        print self.suffix.size(), "suffixes read from flow's affix table"