def validate(commons, recordio_filename, output_recordio='', options=Options()):
  schema = None
  if not isinstance(commons, sling.Store):
    assert type(commons) is str
    filename = commons
    commons = sling.Store()
    commons.load(filename)
    schema = sling.DocumentSchema(commons)
    commons.freeze()
  else:
    schema = sling.DocumentSchema(commons)

  corpus = corpora.Corpora(recordio_filename, commons, schema)
  aggregate = Results(options)
  count = 0
  writer = None
  written = 0
  if output_recordio != '':
    writer = sling.RecordWriter(output_recordio)

  for document in corpus:
    results = _validate(count, document, options)
    aggregate.add(results)
    if not results.ok() and options.stop_on_first_bad_document:
      print("Stopping after first bad document as requested")
      break
    count += 1
    if writer and results.ok():
      writer.write('', document.frame.data(binary=True))
      written += 1

  if writer:
    writer.close()
  return aggregate, count, written
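
# A minimal usage sketch for validate() above (paths are illustrative, not
# from the source). Passing a filename as `commons` makes validate() load and
# freeze its own commons store; passing output_recordio keeps only the
# documents that pass validation.
aggregate, count, written = validate(
    "local/data/commons",           # hypothetical commons store file
    "local/data/documents.rec",     # hypothetical input corpus
    output_recordio="local/data/validated.rec")
print("Checked", count, "documents, wrote", written, "good ones")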
def __init__(self, filename, commons=None):
  self.input = sling.RecordDatabase(filename)
  self.iter = iter(self.input)
  if commons is None:
    self.commons = sling.Store()
    self.docschema = sling.DocumentSchema(self.commons)
    self.commons.freeze()
  else:
    self.commons = commons
    if "document" in commons:
      self.docschema = sling.DocumentSchema(commons)
    else:
      self.docschema = None
def __init__(self, commons, spec):
  # Initialize document schema in commons.
  self.commons = commons
  self.schema = sling.DocumentSchema(self.commons)

  # Load analyzer.
  self.analyzer = sling.api.Analyzer(commons, spec)
def load_corpus(self, corpus_file):
  """Load self.corpus."""
  print("loading the corpus...")
  self.corpus = sling.Corpus(corpus_file)
  self.commons = sling.Store()
  self.docschema = sling.DocumentSchema(self.commons)
  self.commons.freeze()
def __init__(self, commons, summary, options, schema=None):
  self.commons = commons
  if schema is None:
    schema = sling.DocumentSchema(commons)
  self.schema = schema
  self.constituency_schema = ConstituencySchema(commons)
  self.summary = summary
  self.options = options
  self.head_finder = HeadFinder(summary.statistics)
def load_corpus(
    self, corpus_file="local/data/e/wiki/en/documents-00000-of-00010.rec"):
  """Load self.corpus."""
  print("loading the corpus...")
  self.corpus = sling.Corpus(corpus_file)
  self.commons = sling.Store()
  self.docschema = sling.DocumentSchema(self.commons)
  self.commons.freeze()
def build(recordio_filenames, output_filename, text=False):
  commons = sling.Store()
  schema = sling.DocumentSchema(commons)
  commons.freeze()

  symbol_names = {}
  symbol_names["thing"] = 1

  # Adds handle's id to 'symbol_names' if it is not already in 'commons'.
  def add(handle):
    if type(handle) is not sling.Frame or handle.id is None:
      return
    id_str = str(handle.id)
    if commons[id_str] is not None:
      return
    if id_str not in symbol_names:
      symbol_names[id_str] = 0
    symbol_names[id_str] += 1

  for filename in recordio_filenames:
    reader = sling.RecordReader(filename)
    for key, value in reader:
      store = sling.Store(commons)
      document = sling.Document(store.parse(value), schema=schema)
      for mention in document.mentions:
        for frame in mention.evokes():
          for slot_role, slot_value in frame:
            add(slot_role)
            add(slot_value)
      for theme in document.themes:
        for slot_role, slot_value in theme:
          add(slot_role)
          add(slot_value)

  output = sling.Store()
  schema = sling.DocumentSchema(output)
  for name, count in symbol_names.items():
    output.frame({"id": name})
  output.freeze()
  output.save(output_filename, binary=not text)
  return output, symbol_names
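
# A minimal usage sketch for build() above (file names are illustrative). It
# collects the symbol names evoked by two document corpora and saves them as a
# store; text=True writes the store in textual rather than binary form.
output_store, names = build(
    ["local/data/train.rec", "local/data/dev.rec"],  # hypothetical inputs
    "local/data/symbols.sling",                      # hypothetical output
    text=True)
print(len(names), "distinct symbol names collected")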
def __init__(self, filename, store=None):
  # Create new commons store for parser if needed.
  if store is None:
    self.commons = sling.Store()
  else:
    self.commons = store

  # Load parser.
  self.parser = sling.api.Parser(self.commons, filename)

  # Initialize document schema in commons.
  self.schema = sling.DocumentSchema(self.commons)

  # Freeze store if it is a private commons store for the parser.
  if store is None:
    self.commons.freeze()
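
# A hedged usage sketch for the parser wrapper above. Its class name is not
# shown in the source; "Parser" below is a placeholder, and the model file
# path is illustrative.
p1 = Parser("local/data/parser.flow")           # private commons store, frozen
shared = sling.Store()
p2 = Parser("local/data/parser.flow", shared)   # caller's store, left unfrozen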
def extract_entity_mentions(nq_data, labelled_record):
  """Parse output corpus and create map from tokens to entity ids.

  Args:
    nq_data: A python dictionary containing NQ data of 1 train/dev shard.
    labelled_record: Sling output document with labelled paragraphs.

  Returns:
    nq_data: Original object augmented with entity maps.
  """
  recin = sling.RecordReader(labelled_record)
  commons = sling.Store()
  docschema = sling.DocumentSchema(commons)
  commons.freeze()
  cnt = 1
  for key, value in recin:
    store = sling.Store(commons)
    doc = sling.Document(store.parse(value), store, docschema)
    index, ans_type, idx, ans_id = key.decode("utf-8").split("|")
    cnt += 1
    entity_map = {}

    # Parse entity mentions labelled by sling.
    for m in doc.mentions:
      e = [i["is"] for i in m.evokes()]
      if not e:
        continue
      if is_sling_entity(e):
        e_val = e[0]["id"]
        if m.begin in entity_map:
          entity_map[m.begin].append((m.end, e_val))
        else:
          entity_map[m.begin] = [(m.end, e_val)]

    if ans_type == "annotated_long_answer":
      nq_data[index]["annotations"][int(
          idx)]["long_answer"]["entity_map"] = entity_map
    elif ans_type == "question":
      nq_data[index]["question_entity_map"] = entity_map
    elif ans_type == "annotated_short_answer":
      nq_data[index]["annotations"][int(idx)]["short_answers"][int(
          ans_id)]["entity_map"] = entity_map
    else:
      nq_data[index]["long_answer_candidates"][int(
          idx)]["entity_map"] = entity_map
  return nq_data
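
# A hedged usage sketch for extract_entity_mentions() above. The NQ shard is
# assumed to be a JSON object keyed by example index, as the function's
# indexing implies; both paths are illustrative.
import json

with open("local/data/nq-train-00.json") as f:  # hypothetical shard file
  nq_data = json.load(f)
nq_data = extract_entity_mentions(nq_data, "local/data/labelled.rec")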
def convert(brat_dir_path: str, output_file_path: str, verbose: bool = False):
  # Load the brat repository.
  repo = RepoModel(brat_dir_path)
  if verbose:
    print('Loaded {} document(s) from {}'.format(len(repo.documents),
                                                 brat_dir_path))

  # Load the SLING commons store and the document schema.
  commons = load_commons_store()
  schema = sling.DocumentSchema(commons)
  commons.freeze()

  writer = sling.RecordWriter(output_file_path)
  for document_name in repo.documents:
    document = repo.documents[document_name]
    reader = DocReader(document)
    converter = DocConverter(commons, schema, document_name)
    converter.convert(reader, writer)
  writer.close()
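
# A minimal usage sketch for convert() above, assuming a brat annotation
# directory laid out as RepoModel expects (paths are illustrative).
convert("local/data/brat_corpus", "local/data/brat_corpus.rec", verbose=True)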
def build(self, commons_path, corpora_path):
  # Prepare lexical dictionaries.
  self.words = Lexicon(self.words_normalize_digits)
  self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)

  # Initialize training corpus.
  corpora = Corpora(corpora_path, commons_path)

  # Collect word and affix lexicons.
  for document in corpora:
    for token in document.tokens:
      word = token.word
      self.words.add(word)
      for s in self.get_suffixes(word):
        assert type(s) is str
        self.suffix.add(s)
  print("Words:", self.words.size(), "items in lexicon, including OOV")
  print("Suffix:", self.suffix.size(), "items in lexicon")

  # Load commons store, but do not freeze it yet. We will add the action table
  # and cascade specification to it.
  self.commons_path = commons_path
  self.commons = sling.Store()
  self.commons.load(commons_path)
  schema = sling.DocumentSchema(self.commons)

  # Prepare action table and cascade.
  self._build_action_table(corpora)
  self.cascade = cascade.ShiftMarkCascade(self.actions)
  print(self.cascade)

  # Save cascade specification in commons.
  _ = self.cascade.as_frame(self.commons, delegate_cell_prefix="delegate")

  # Freeze the commons store.
  self.commons.freeze()

  # Add feature specs.
  self._specify_features()
def __init__(self, recordio, commons, schema=None, gold=False, loop=False):
  self.filename = recordio
  self.commons_owned = False
  if isinstance(commons, str):
    self.commons = sling.Store()
    self.commons.load(commons)
    self.commons_owned = True
  else:
    assert isinstance(commons, sling.Store)
    self.commons = commons

  if schema is None or self.commons_owned:
    schema = sling.DocumentSchema(self.commons)
  if self.commons_owned:
    self.commons.freeze()
  assert schema is not None
  self.schema = schema

  self.reader = sling.RecordReader(recordio)
  self.generator = None
  self.loop = loop
  self.set_gold(gold)
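
# A usage sketch for this constructor, which (per its call sites elsewhere in
# this file, e.g. corpora.Corpora(recordio_filename, commons, schema)) appears
# to belong to the Corpora class. Passing a commons file path makes the
# instance load and freeze its own store; paths are illustrative.
corpus = Corpora("local/data/documents.rec", "local/data/commons", gold=True)
for document in corpus:
  print(len(document.tokens), "tokens")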
  flags.define('--' + option, help=option, default=value, type=type(value))
flags.parse()

for option, value in flags.arg.__dict__.items():
  if option in options.__dict__:
    options.__dict__[option] = value
    print("Setting option", option, "to", value)

if options.doc_per_sentence:
  assert options.skip_coref, \
    "Per-sentence documents can only be output without coreference."

commons = sling.Store()
schema = sling.DocumentSchema(commons)
if not flags.arg.omit_constituents:
  assert os.path.exists(flags.arg.constituency_schema)
  commons.load(flags.arg.constituency_schema)
commons.freeze()

writer = sling.RecordWriter(flags.arg.output)

# Read allowed ids, if provided.
allowed_ids = set()
if len(options.allowed_ids_file) > 0:
  with open(options.allowed_ids_file, 'r') as f:
    for line in f:
      line = line.strip()
      if not line.endswith('.gold_conll'):
import logging
from typing import NamedTuple, Tuple, List, Iterable
from pathlib import Path

from tqdm import tqdm

import sling
from sling.nlp.document import DocumentSchema, Token, Mention

LOGGER = logging.getLogger(__name__)
ANNOTATED_DIR = Path("/home/hiroakih/tir3/sling/local/data/e/ner/en")

# Set up a shared commons store with the document schema and freeze it.
commons = sling.Store()
DOCSCHEMA = sling.DocumentSchema(commons)
commons.freeze()


class MyDocument(object):
  def __init__(self, frame=None, store=None, schema=None,
               load_tokens=True, load_mentions=True):
    # Create store, frame, and schema if missing.
    if frame is not None:
      store = frame.store()
    if store is None:
      store = sling.Store()
    if schema is None:
      schema = DocumentSchema(store)
    if frame is None:
def compare(arg):
  base_reader = sling.RecordReader(arg.base)
  expt_reader = sling.RecordReader(arg.expt)
  commons = sling.Store()
  commons.load(arg.commons)
  schema = sling.DocumentSchema(commons)
  commons.freeze()
  store = sling.Store(commons)
  index = -1
  for (_, base_val), (_, expt_val) in zip(base_reader, expt_reader):
    index += 1
    base_doc = sling.Document(frame=store.parse(base_val), schema=schema)
    expt_doc = sling.Document(frame=store.parse(expt_val), schema=schema)
    checker = Checker(index, base_doc, expt_doc, arg.diff)

    # Basic checks.
    base = base_doc.frame["trace"]
    expt = expt_doc.frame["trace"]
    if base is None and expt is not None:
      checker.error('No trace in base document at index %d' % index)
    elif base is not None and expt is None:
      checker.error('No trace in expt document at index %d' % index)
    if base is None:
      continue

    # Traces should be over the same token range.
    checker.check_eq(base["begin"], expt["begin"], "Trace Begin")
    checker.check_eq(base["end"], expt["end"], "Trace End")

    # Check LSTM features.
    base_lstm = base["/trace/lstm_features"]
    expt_lstm = expt["/trace/lstm_features"]
    checker.check_eq(len(base_lstm), len(expt_lstm), "LSTM Features Length")
    for i in range(len(base_lstm)):
      checker.frame_eq(base_lstm[i], expt_lstm[i],
                       "LSTM features for token %d (%s)" %
                       (i, base_doc.tokens[i].word))

    # Check steps.
    base_steps = base["/trace/steps"]
    expt_steps = expt["/trace/steps"]
    min_steps = min(len(base_steps), len(expt_steps))
    for i in range(min_steps):
      message = "Step %d's current token index" % i
      checker.check_eq(base_steps[i]["/trace/current"],
                       expt_steps[i]["/trace/current"], message)

      # Check FF features for the step.
      base_ff = base_steps[i]["/trace/ff_features"]
      expt_ff = expt_steps[i]["/trace/ff_features"]
      checker.check_eq(len(base_ff), len(expt_ff),
                       "# of FF features for step %d" % i)

      base_dict = {f["/trace/feature"]: f["/trace/values"] for f in base_ff}
      expt_dict = {f["/trace/feature"]: f["/trace/values"] for f in expt_ff}
      for k, v in base_dict.items():
        checker.check_eq(k in expt_dict, True,
                         "Step %d: FF feature %s not in expt" % (i, k))
        checker.check_eq(v, expt_dict[k],
                         "Step %d: FF feature %s has a different value in expt"
                         % (i, k))
      for k, v in expt_dict.items():
        checker.check_eq(k in base_dict, True,
                         "Step %d: FF feature %s not in base" % (i, k))

      # Check action(s) in the step.
      base_actions = base_steps[i]["/trace/actions"]
      expt_actions = expt_steps[i]["/trace/actions"]
      for idx in range(min(len(base_actions), len(expt_actions))):
        checker.frame_eq(base_actions[idx]["/trace/predicted"],
                         expt_actions[idx]["/trace/predicted"],
                         "Step %d, predicted action %d" % (i, idx),
                         ["/trace/_str"])
        checker.frame_eq(base_actions[idx]["/trace/final"],
                         expt_actions[idx]["/trace/final"],
                         "Step %d, final action %d" % (i, idx),
                         ["/trace/_str"])

      # There should be the same number of actions in the step.
      checker.check_eq(len(base_actions), len(expt_actions),
                       "Step %d: # of actions" % i)

    # There should be the same number of steps.
    checker.check_eq(len(base_steps), len(expt_steps), "# of Steps")

  base_reader.close()
  expt_reader.close()
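
# A hedged usage sketch for compare() above: it only needs an object with
# base, expt, commons, and diff attributes, so an argparse namespace works
# (flag names mirror the attributes; paths and defaults are illustrative).
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--base", default="local/data/base.rec")
parser.add_argument("--expt", default="local/data/expt.rec")
parser.add_argument("--commons", default="local/data/commons")
parser.add_argument("--diff", default=None)
compare(parser.parse_args())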
def run(self, task):
  # Get parameters.
  language = task.param("language")

  # Load knowledge base.
  log.info("Load knowledge base")
  kb = sling.Store()
  kb.load(task.input("kb").name)

  n_infobox = kb["/wp/infobox"]
  n_page_item = kb["/wp/page/item"]
  n_file = kb["/wp/info/file"]
  n_media = kb["/wp/media"]

  image_fields = [
    (kb["/wp/info/image"], kb["/wp/info/caption"]),
    (kb["/wp/info/cover"], kb["/wp/info/caption"]),
    (kb["/wp/info/logo"], kb["/wp/info/logo_caption"]),
    (kb["/wp/info/photo"], kb["/wp/info/photo_caption"]),
    (kb["/wp/info/flag_image"], kb["/wp/info/flag_caption"]),
  ]

  p_media = kb["media"]
  p_id = kb["id"]
  p_is = kb["is"]
  p_imported_from = kb["P143"]
  p_media_legend = kb["P2096"]

  image_properties = [
    kb["P18"],   # image
    kb["P154"],  # logo image
    kb["P41"],   # flag image
  ]

  lang = kb["/lang/" + language]
  wikipedia_item = lang["/lang/wikilang/wikipedia"]

  docschema = sling.DocumentSchema(kb)

  kb.freeze()

  # Fetch media titles for Wikipedia from yesterday.
  log.info("Fetch local media titles")
  yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
  mediaurl = "https://dumps.wikimedia.org/other/mediatitles/%s/" \
             "%swiki-%s-all-media-titles.gz" % (yesterday, language, yesterday)
  r = urllib.request.urlopen(mediaurl)
  mediatitles = set(gzip.decompress(r.read()).decode().split('\n'))
  task.increment("local_media_files", len(mediatitles))

  # Open output file.
  fout = open(task.output("output").name, "w")

  # Process input articles.
  for res in task.inputs("input"):
    log.info("Extract media files from", res.name)
    for _, data in sling.RecordReader(res.name):
      # Read article into store.
      store = sling.Store(kb)
      doc = store.parse(data)
      task.increment("documents")

      # Find first infobox.
      infobox = None
      for theme in doc(docschema.document_theme):
        if theme.isa(n_infobox):
          infobox = theme
          break
      if infobox is None: continue
      task.increment("infoboxes")

      # Find images in infobox.
      imagelist = []
      for n_image, n_caption in image_fields:
        image = infobox[n_image]
        caption = infobox[n_caption]
        if image is None: continue

        # Get image for repeated image field.
        if type(image) is sling.Frame:
          group = image
          image = group[n_file]
          caption = group[n_caption]
          if image is None: continue

        if "{" in image or "[" in image:
          # Structured annotations.
          annotations = sling.lex(image, store=store, schema=docschema)
          for theme in annotations.themes:
            if theme.isa(n_media):
              image = theme[p_is]
              if image is not None:
                imagelist.append((image, None))
                task.increment("structured_annotations")
        else:
          # Image filename.
          imagelist.append((image, caption))
      if len(imagelist) == 0: continue

      # Process list of images for item.
      known_images = 0
      image_frames = []
      item = doc[n_page_item]
      if item is None: continue
      for image, caption in imagelist:
        # Disregard direct URLs for now.
        if image.startswith("http://") or \
           image.startswith("https://") or \
           image.startswith("//"):
          task.increment("url_images")
          continue

        # Trim image name. Remove File: prefix.
        colon = image.find(':')
        if colon > 0 and colon < 10: image = image[colon + 1:]
        image = titlecase(image.strip()).replace('_', ' ')
        if len(image) == 0 or image in default_images:
          task.increment("empty_images")
          continue
        if image.endswith("&lrm;"): image = image[:-5]
        frag = image.find('#')
        if frag > 0: image = image[:frag]
        image = html.unescape(image)
        image = urllib.parse.unquote(image)

        # Discard media files with unknown or ignored extensions.
        dot = image.rfind('.')
        ext = image[dot:].lower() if dot > 0 else None
        if ext in ignored_extensions:
          task.increment("ignored_image_format")
          continue
        if ext not in known_extensions:
          log.info("unknown format:", item.id, image)
          task.increment("unknown_image_format")
          continue

        # Get item from KB and check if image is already known.
        task.increment("images")
        known = False
        for prop in image_properties:
          for img in item(prop):
            img = kb.resolve(img)
            if img == image:
              known = True
              known_images += 1
        if known:
          task.increment("known_images")
          continue
        task.increment("new_images")

        # Check if image is in local Wikipedia or Wikimedia Commons.
        fn = image.replace(' ', '_')
        if fn in mediatitles:
          urlbase = "https://upload.wikimedia.org/wikipedia/" + language
          task.increment("local_images")
        else:
          urlbase = "https://upload.wikimedia.org/wikipedia/commons"
          task.increment("commons_images")
          if known_images == 0: task.increment("commons_imaged_items")

        # Compute URL for image.
        md5 = md5hash(fn)
        fn = fn.replace("?", "%3F")
        fn = fn.replace("+", "%2B")
        fn = fn.replace("&", "%26")
        url = "%s/%s/%s/%s" % (urlbase, md5[0], md5[0:2], fn)

        # Create frame for item with media image.
        slots = [
          (p_is, url),
          (p_imported_from, wikipedia_item),
        ]
        if caption is not None:
          capdoc = sling.lex(caption, store=store, schema=docschema)
          captxt = capdoc.phrase(0, len(capdoc.tokens))
          slots.append((p_media_legend, captxt))
        image_frames.append(store.frame(slots))

      # Create item frame with extra image info.
      if len(image_frames) == 0: continue
      slots = [(p_id, item.id)]
      for image_frame in image_frames:
        slots.append((p_media, image_frame))
      frame = store.frame(slots)
      fout.write(frame.data(utf8=True))
      fout.write("\n")
      if known_images == 0: task.increment("imaged_items")

  fout.close()
def __init__(self, filename, commons=None):
  self.input = sling.RecordReader(filename)
  self.iter = iter(self.input)
  self.commons = sling.Store() if commons is None else commons
  self.docschema = sling.DocumentSchema(self.commons)
  if commons is None:
    self.commons.freeze()
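
# A usage sketch for this reader constructor (the enclosing class name is not
# shown; "CorpusReader" below is a placeholder, and the path is illustrative).
# Note the asymmetry: the store is frozen only when the reader creates it, so
# a caller-supplied store stays mutable and gets the document schema added.
reader = CorpusReader("local/data/documents.rec")
for key, value in reader.input:
  pass  # records can be consumed via reader.input or reader.iter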
def from_flow(self, fl):
  blob = fl.blob("spec")
  temp_dict = pickle.loads(blob.data)
  self.__dict__.update(temp_dict)

  # Read non-pickled fields.
  # Read commons store.
  self.commons = sling.Store()
  temp_file = tempfile.NamedTemporaryFile(delete=False)
  filename = temp_file.name
  with open(filename, "wb") as f:
    f.write(fl.blob("commons").data)
  temp_file.close()
  self.commons.load(filename)
  _ = sling.DocumentSchema(self.commons)
  self.commons.freeze()
  os.unlink(filename)

  # Read action table from the commons.
  self.actions = Actions()
  self.actions.decode(self.commons["/table"])

  # Read cascade specification. This is done by calling eval() on the class
  # constructor. The class name is stored in the cascade frame.
  frame = self.commons["/cascade"]
  self.cascade = eval(frame["name"])(self.actions)
  print(self.cascade)

  # Read word lexicon.
  blob = fl.blob("lexicon")
  self.words = Lexicon(self.words_normalize_digits)
  self.words.read(blob.data.tobytes(), chr(int(blob.get_attr("delimiter"))))
  print(self.words.size(), "words read from flow's lexicon")

  # Read suffix table.
  self.suffix = Lexicon(self.words_normalize_digits, oov_item=None)
  data = fl.blob("suffixes").data

  # Decode a varint from the head of a memoryview; return the value and the
  # remaining bytes.
  def read_int(mview):
    output = 0
    shift_bits = 0
    index = 0
    while index < len(mview):
      part = mview[index]  # memoryview indexing yields an int in Python 3
      index += 1
      output |= (part & 127) << shift_bits
      shift_bits += 7
      if part & 128 == 0:
        break
    return output, mview[index:]

  affix_type, data = read_int(data)   # affix type
  assert affix_type == 1
  max_length, data = read_int(data)   # max length
  assert max_length == self.suffixes_max_length
  num, data = read_int(data)          # num affixes
  for _ in range(num):
    num_bytes, data = read_int(data)
    word = data[0:num_bytes].tobytes()
    self.suffix.add(word)
    data = data[num_bytes:]
    num_chars, data = read_int(data)
    if num_chars > 0:
      shorter_index, data = read_int(data)
  print(self.suffix.size(), "suffixes read from flow's affix table")