def find_date_mentions(self) -> Iterable:
    for n, (doc_wid, doc_raw) in enumerate(self.corpus.input):
        doc_wid = str(doc_wid, 'utf-8')
        store = sling.Store(self.commons)
        frame = store.parse(doc_raw)
        document = sling.Document(frame, store, self.docschema)
        doc_title = get_metadata(frame)[1]
        if len(document.tokens) == 0:
            continue
        sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
        position2time: Dict[int, int] = {}
        for ii, mention in enumerate(sorted_mentions):
            linked_entity = self.get_linked_entity(mention)
            if type(linked_entity) is not int:
                continue
            for i in range(mention.begin, mention.end):
                position2time[i] = linked_entity
        colored_tokens: List[str] = []
        for i, tok in enumerate(document.tokens):
            if i in position2time:
                colored_tokens.append(
                    colored('{}:{}'.format(tok.word, position2time[i]), 'green'))
            elif tok.word.isnumeric():
                colored_tokens.append(colored(tok.word, 'red'))
            else:
                colored_tokens.append(tok.word)
        colored_text = ' '.join(colored_tokens)
        yield doc_wid, doc_title, colored_text
def find_all_mentions(self):
    for n, (doc_wid, doc_raw) in enumerate(self.corpus.input):
        doc_wid = str(doc_wid, 'utf-8')
        store = sling.Store(self.commons)
        frame = store.parse(doc_raw)
        document = sling.Document(frame, store, self.docschema)
        doc_title = get_metadata(frame)[1]
        if len(document.tokens) == 0:
            continue
        sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
        all_mentions: List[Tuple[int, int, Any]] = []
        for ii, mention in enumerate(sorted_mentions):
            linked_entity = self.get_linked_entity(mention)
            all_mentions.append((mention.begin, mention.end, linked_entity))
        tokens = [t.word for t in document.tokens]
        prev_e = 0
        colored_tokens: List[str] = []
        for s, e, wid in all_mentions:
            colored_tokens.append(' '.join(tokens[prev_e:s]))
            colored_tokens.append(
                colored('{}||{}'.format(' '.join(tokens[s:e]), wid), 'green'))
            prev_e = e
        colored_text = ' '.join(colored_tokens)
        yield doc_wid, doc_title, colored_text
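# Minimal usage sketch for the two generators above; `annotator` is a
# hypothetical instance of their enclosing class, with `corpus`, `commons`,
# and `docschema` already loaded.
import itertools

for doc_wid, doc_title, colored_text in itertools.islice(
        annotator.find_all_mentions(), 5):
    # Each yield is one document: its WikiData id, title, and the token
    # stream with linked mentions highlighted as "surface||QID".
    print(doc_wid, doc_title)
    print(colored_text)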
def _end_document(self, callback=None):
    if len(self.tokens.spans) == 0:
        return

    # Save SRL annotations for the last sentence.
    self._save_srl_annotations()

    # Add tokens as leaf constituents.
    self._add_token_constituents()

    # Find heads of all constituents.
    for node in self.constituents.spans:
        self.converter.head_finder.find(node)

    # Generate input statistics.
    self._summarize_input()

    # Sanity check: all annotations should be complete.
    assert len(self.current_srl) == 0, self.current_srl
    assert self.ner.all_ended()
    assert self.coref.all_ended()
    assert self.constituents.all_ended()
    for s in self.srl:
        assert s.all_ended()

    # Write the SLING document and invoke the callback.
    if callback is not None:
        store = sling.Store(self.converter.commons)
        document = sling.Document(None, store, self.converter.schema)
        self.write(document)
        callback(document)
def end_document():
    global document, brk, begin, kind
    if document is not None:
        end_span()
        document.update()
        fout.write(None, document.frame.data(binary=True))
    document = sling.Document(store=store)
    brk = sling.NO_BREAK
    begin = None
    kind = None
def lex(text, store=None, schema=None):
    # Initialize tokenizer if needed.
    global tokenizer
    if tokenizer is None:
        tokenizer = sling.api.Tokenizer()

    # Create store for document if needed.
    if store is None:
        store = sling.Store()

    # Parse LEX-encoded text.
    frame = tokenizer.lex(store, text)

    # Return document with annotations.
    return sling.Document(frame, store, schema)
def tokenize(text, store=None, schema=None):
    # Initialize tokenizer if needed.
    global tokenizer
    if tokenizer is None:
        tokenizer = sling.api.Tokenizer()

    # Create store for document if needed.
    if store is None:
        store = sling.Store()

    # Tokenize text.
    frame = tokenizer.tokenize(store, text)

    # Return document with tokens.
    return sling.Document(frame, store, schema)
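# Minimal usage sketch for the helpers above: `tokenize` splits plain text,
# while `lex` additionally parses LEX-encoded annotation markup.
doc = tokenize("John loves Mary.")
print([t.word for t in doc.tokens])  # expected: ['John', 'loves', 'Mary', '.']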
def extract_entity_mentions(nq_data, labelled_record):
    """Parse output corpus and create map from tokens to entity ids.

    Args:
      nq_data: A python dictionary containing NQ data of 1 train/dev shard
      labelled_record: Sling output document with labelled paragraphs

    Returns:
      nq_data: Original object augmented with entity maps
    """
    recin = sling.RecordReader(labelled_record)
    commons = sling.Store()
    docschema = sling.DocumentSchema(commons)
    commons.freeze()
    cnt = 1
    for key, value in recin:
        store = sling.Store(commons)
        doc = sling.Document(store.parse(value), store, docschema)
        index, ans_type, idx, ans_id = key.decode("utf-8").split("|")
        cnt += 1
        entity_map = {}
        # Parse entity mentions labelled by sling.
        for m in doc.mentions:
            e = [i["is"] for i in m.evokes()]
            if not e:
                continue
            if is_sling_entity(e):
                e_val = e[0]["id"]
                if m.begin in entity_map:
                    entity_map[m.begin].append((m.end, e_val))
                else:
                    entity_map[m.begin] = [(m.end, e_val)]
        if ans_type == "annotated_long_answer":
            nq_data[index]["annotations"][int(
                idx)]["long_answer"]["entity_map"] = entity_map
        elif ans_type == "question":
            nq_data[index]["question_entity_map"] = entity_map
        elif ans_type == "annotated_short_answer":
            nq_data[index]["annotations"][int(idx)]["short_answers"][int(
                ans_id)]["entity_map"] = entity_map
        else:
            nq_data[index]["long_answer_candidates"][int(
                idx)]["entity_map"] = entity_map
    return nq_data
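# Shape of the entity_map built above: mention begin token -> list of
# (end token, entity id) pairs. A hedged sketch of reading it back, with
# made-up entity ids:
entity_map = {3: [(5, 'Q42')], 7: [(8, 'Q1')]}
for begin, candidates in sorted(entity_map.items()):
    for end, entity_id in candidates:
        print('tokens [%d, %d) -> %s' % (begin, end, entity_id))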
def run(self, task):
    self.init(task)

    max_parses = int(task.param("max_parses"))
    reader = sling.RecordReader(task.input("input").name)
    writer = sling.RecordWriter(task.output("output").name)
    for index, (key, value) in enumerate(reader):
        store = sling.Store(self.kb)
        category = store.parse(value)
        document = sling.Document(category.document)

        # Score each parse.
        parse_with_score = self.score(category)

        # Keep only the top-k parses.
        ranked_parses = sorted(parse_with_score, key=lambda x: -x[1])
        if len(ranked_parses) > max_parses:
            dropped = len(ranked_parses) - max_parses
            ranked_parses = ranked_parses[0:max_parses]
            task.increment("parses-dropped", dropped)
            task.increment("categories-with-too-many-parses")

        # Compute signature for each parse and store it in the parse.
        for parse, _ in ranked_parses:
            tokens, span_signature = self.signature(document, parse)
            parse["signature"] = tokens
            for span in parse.spans:
                if span in span_signature:
                    span["signature"] = span_signature[span]

            # Also compute the coarse signature.
            tokens, span_signature = self.signature(document, parse, coarse=True)
            parse["coarse_signature"] = tokens
            for span in parse.spans:
                if span in span_signature:
                    span["coarse_signature"] = span_signature[span]

        # Replace the current set of parses with the ranked list.
        del category["parse"]
        for parse, _ in ranked_parses:
            category.append("parse", parse)
        task.increment("parses-kept", len(ranked_parses))

        writer.write(key, category.data(binary=True))
    reader.close()
    writer.close()
def iter_mentions_position(
        self, wid_set: Set[str] = None
) -> Iterable[Tuple[str, Dict[str, List[int]]]]:
    for n, (doc_wid, doc_raw) in enumerate(self.corpus.input):
        doc_wid = str(doc_wid, 'utf-8')
        if wid_set is not None and doc_wid not in wid_set:
            continue
        store = sling.Store(self.commons)
        frame = store.parse(doc_raw)
        document = sling.Document(frame, store, self.docschema)
        sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
        m2pos: Dict[str, List[int]] = defaultdict(list)
        for ii, mention in enumerate(sorted_mentions):
            linked_entity = self.get_linked_entity(mention)
            m2pos[linked_entity].append(mention.begin)
        yield (doc_wid, m2pos)
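# Sketch: invert iter_mentions_position into an entity -> documents index;
# `annotator` is again a hypothetical instance of the enclosing class.
from collections import defaultdict

entity_to_docs = defaultdict(set)
for doc_wid, m2pos in annotator.iter_mentions_position():
    for entity, positions in m2pos.items():
        entity_to_docs[entity].add(doc_wid)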
def iter_mentions(self, wid_set: Set[str] = None, only_entity: bool = False,
                  split_by: str = None) -> \
        Iterable[Tuple[str, List[str], List[Tuple[str, int, int]]]]:
    assert split_by in {'sentence', None}, 'not supported split_by'
    split_by = {'sentence': 3, None: None}[split_by]
    for n, (doc_wid, doc_raw) in enumerate(self.corpus.input):
        doc_wid = str(doc_wid, 'utf-8')
        if wid_set is not None and doc_wid not in wid_set:
            continue
        store = sling.Store(self.commons)
        frame = store.parse(doc_raw)
        document = sling.Document(frame, store, self.docschema)
        sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
        tokens = [t.word for t in document.tokens]
        split_start = [0] + [
            i for i, t in enumerate(document.tokens) if t.brk == split_by
        ]
        split_ind = 0
        mentions: List[Tuple[str, int, int]] = []
        for mention in sorted_mentions:
            # Emit the current split once the mention starts past its end.
            while (len(split_start) > split_ind + 1
                   and mention.begin >= split_start[split_ind + 1]):
                if len(mentions) > 0:
                    yield (doc_wid,
                           tokens[split_start[split_ind]:split_start[split_ind + 1]],
                           mentions)
                    mentions = []
                split_ind += 1
            if (len(split_start) > split_ind + 1
                    and mention.end > split_start[split_ind + 1]):
                # Skip mentions that cross the split boundary.
                continue
            linked_entity = self.get_linked_entity(mention)
            if only_entity and (type(linked_entity) is not str
                                or not linked_entity.startswith('Q')):
                continue
            mentions.append((linked_entity,
                             mention.begin - split_start[split_ind],
                             mention.end - split_start[split_ind]))
        if len(mentions) > 0:
            yield (doc_wid, tokens[split_start[split_ind]:], mentions)
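# Sketch: iterate sentence-sized splits, keeping only WikiData-linked
# mentions (ids starting with 'Q'); offsets are relative to the split.
for doc_wid, sent_tokens, mentions in annotator.iter_mentions(
        only_entity=True, split_by='sentence'):
    for entity, begin, end in mentions:
        print(doc_wid, entity, ' '.join(sent_tokens[begin:end]))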
def build(recordio_filenames, output_filename, text=False):
    commons = sling.Store()
    schema = sling.DocumentSchema(commons)
    commons.freeze()

    symbol_names = {}
    symbol_names["thing"] = 1

    # Adds handle's id to 'symbol_names' if it is not already in 'commons'.
    def add(handle):
        if type(handle) is not sling.Frame or handle.id is None:
            return
        id_str = str(handle.id)
        if commons[id_str] is not None:
            return
        if id_str not in symbol_names:
            symbol_names[id_str] = 0
        symbol_names[id_str] += 1

    for filename in recordio_filenames:
        reader = sling.RecordReader(filename)
        for key, value in reader:
            store = sling.Store(commons)
            document = sling.Document(store.parse(value), schema=schema)
            for mention in document.mentions:
                for frame in mention.evokes():
                    for slot_role, slot_value in frame:
                        add(slot_role)
                        add(slot_value)
            for theme in document.themes:
                for slot_role, slot_value in theme:
                    add(slot_role)
                    add(slot_value)

    output = sling.Store()
    schema = sling.DocumentSchema(output)
    for name, count in symbol_names.items():
        output.frame({"id": name})
    output.freeze()
    output.save(output_filename, binary=not text)
    return output, symbol_names
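# Hedged usage sketch for build(): collect symbols from annotated recordio
# files into a new commons store; the file names here are placeholders.
output_store, counts = build(["local/annotated-00000-of-00010.rec"],
                             "/tmp/custom-commons.sling")
print(len(counts), "distinct symbols collected")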
def parse(self, obj):
    if type(obj) is sling.Document:
        # Parse document.
        obj.update()
        self.parser.parse(obj.frame)
        obj.refresh_annotations()
        return obj
    elif type(obj) is sling.Frame:
        # Parse document frame and return parsed document.
        self.parser.parse(obj)
        return sling.Document(obj)
    else:
        # Create local store for new document.
        store = sling.Store(self.commons)

        # Tokenize text.
        doc = tokenize(str(obj), store=store, schema=self.schema)

        # Parse document.
        self.parser.parse(doc.frame)
        doc.refresh_annotations()
        return doc
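# Usage sketch: parse() accepts a sling.Document, a document frame, or raw
# text; `analyzer` is a hypothetical instance of the enclosing class.
doc = analyzer.parse("John loves Mary.")
for mention in doc.mentions:
    print(doc.phrase(mention.begin, mention.end))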
def link_documents(self, N=None, out_file="/tmp/linked.rec",
                   add_negatives=False, filter_subjects=None):
    """Load N documents and link them to facts."""
    start = time.time()
    fout = open(out_file, "w")
    for n, (doc_id, doc_raw) in enumerate(self.corpus.input):
        if n == N:
            break
        if n % 1000 == 0:
            print("processed", n, "items in %.1f" % (time.time() - start),
                  "seconds")

        # Get KB items.
        doc_id = str(doc_id, "utf-8")
        if filter_subjects is not None and doc_id not in filter_subjects:
            continue
        kb_item = self.kb[doc_id]
        tail_entities = {}
        all_properties = []
        for prop, tail in kb_item:
            tup = self.get_canonical_property(prop, tail)
            if tup is None:
                tup = self.get_date_property(prop, tail)
            if tup is None:
                continue
            tail_entities[tup[1]] = tup[0]
            all_properties.append(tup[0])

        store = sling.Store(self.commons)
        document = sling.Document(store.parse(doc_raw), store, self.docschema)
        if len(document.tokens) == 0:
            print("Skipping %s No tokens." % (doc_id))
            continue

        # Build token maps.
        tok_to_sent_id, tok_to_para_id = {}, {}
        sent_to_span, para_to_span = {}, {}
        tok_to_char_offset = {}
        offset = 0
        sent_begin = para_begin = 0
        sent_id = para_id = 0
        for ii, token in enumerate(document.tokens):
            if ii > 0 and token.brk == 4:  # paragraph break
                para_to_span[para_id] = (para_begin, ii)
                sent_to_span[sent_id] = (sent_begin, ii)
                para_id += 1
                sent_id += 1
                sent_begin = para_begin = ii
            elif ii > 0 and token.brk == 3:  # sentence break
                sent_to_span[sent_id] = (sent_begin, ii)
                sent_id += 1
                sent_begin = ii
            tok_to_sent_id[ii] = sent_id
            tok_to_para_id[ii] = para_id
            tok_to_char_offset[ii] = offset
            offset += len(token.word) + 1
        para_to_span[para_id] = (para_begin, ii + 1)
        sent_to_span[sent_id] = (sent_begin, ii + 1)

        # Find subjects.
        sent_to_subj, para_to_subj = defaultdict(list), defaultdict(list)
        mentid_to_linked_entity = {}
        sorted_mentions = sorted(document.mentions, key=lambda m: m.begin)
        for ii, mention in enumerate(sorted_mentions):
            if tok_to_sent_id[mention.begin] != tok_to_sent_id[mention.end - 1]:
                continue
            linked_entity = self.get_linked_entity(mention)
            mentid_to_linked_entity[ii] = linked_entity
            if linked_entity == doc_id:
                sent_id = tok_to_sent_id[mention.begin]
                sent_to_subj[sent_id].append(mention)
                para_id = tok_to_para_id[mention.begin]
                para_to_subj[para_id].append(mention)

        # Find tails.
        relations = []
        seen_properties = {}
        for ii, mention in enumerate(sorted_mentions):
            # First look for sentence matches.
            linked_entity = mentid_to_linked_entity[ii]
            if linked_entity == doc_id:
                continue
            if linked_entity in tail_entities:
                if tail_entities[linked_entity] in seen_properties:
                    continue
                my_sent = tok_to_sent_id[mention.begin]
                if my_sent in sent_to_subj:
                    my_para = tok_to_para_id[mention.begin]
                    para_span = para_to_span[my_para]
                    # sent_span = sent_to_span[my_sent]
                    fout.write(
                        self.serialize_relation(
                            document, tok_to_char_offset, para_span, doc_id,
                            sent_to_subj[my_sent],
                            tail_entities[linked_entity], linked_entity,
                            mention, "sentence") + "\n")
                    seen_properties[tail_entities[linked_entity]] = my_para
                    self.relation_stats["sentences"][
                        tail_entities[linked_entity]] += 1
        for ii, mention in enumerate(sorted_mentions):
            # Next look for paragraph matches.
            linked_entity = mentid_to_linked_entity[ii]
            if linked_entity == doc_id:
                continue
            if linked_entity in tail_entities:
                if tail_entities[linked_entity] in seen_properties:
                    continue
                my_para = tok_to_para_id[mention.begin]
                if my_para in para_to_subj:
                    para_span = para_to_span[my_para]
                    fout.write(
                        self.serialize_relation(
                            document, tok_to_char_offset, para_span, doc_id,
                            para_to_subj[my_para],
                            tail_entities[linked_entity], linked_entity,
                            mention, "paragraph") + "\n")
                    seen_properties[tail_entities[linked_entity]] = my_para
                    self.relation_stats["paragraphs"][
                        tail_entities[linked_entity]] += 1

        # Add negatives.
        if add_negatives:
            max_neg = len(seen_properties)
            num_neg = 0
            all_para_id = list(para_to_subj.keys())
            if not all_para_id:
                continue
            for tail, prop in tail_entities.items():
                if num_neg == max_neg:
                    break
                if prop in seen_properties:
                    continue
                random_para_id = random.choice(all_para_id)
                random_para_span = para_to_span[random_para_id]
                fout.write(
                    self.serialize_relation(
                        document, tok_to_char_offset, random_para_span,
                        doc_id, para_to_subj[random_para_id], prop, None,
                        None, "entity negative") + "\n")
                num_neg += 1
                seen_properties[prop] = None
                self.relation_stats["entity negatives"][prop] += 1

    fout.close()
    print("Sentences -- Total ",
          sum(self.relation_stats["sentences"].values()))
    print(" :: ".join("%s:%d" % (k, v)
                      for k, v in self.relation_stats["sentences"].items()))
    print("Paragraphs -- Total ",
          sum(self.relation_stats["paragraphs"].values()))
    print(" :: ".join("%s:%d" % (k, v)
                      for k, v in self.relation_stats["paragraphs"].items()))
def run(self):
    month = "(" + "|".join(self.months.keys()) + ")"
    day = r"(\d{1,2})"
    year = r"(\d{4})"
    date = "(?:(?:" + day + " " + month + " " + year + ")|"
    date += "(?:" + month + " " + day + ", " + year + "))"
    date += "(?:[^)]+?)?"
    dates = date + r"\s*-+\s*" + date
    dates = r"(?:(?:(?:born|b\.|n\xe9e),? ([^0-9)]*?)" + date + \
        r"(?:(?:died|d\.),? [^0-9)]*?" + date + ")?)|(?:" + dates + "))"
    pat = r"(?:[^(]|\([^0-9]*\))*?\([^0-9)]*?" + dates + r"\s*\)"
    rec = re.compile(pat)

    self.out_file = "local/data/e/wikibot/birth-death-dates.rec"
    record_file = sling.RecordWriter(self.out_file)
    records = 0
    store = sling.Store(self.kb)

    for i in range(10):
        i_file = "local/data/e/wiki/en/documents-0000" + str(i) + "-of-00010.rec"
        print(i_file, records)
        for (item_id, record) in sling.RecordReader(i_file):
            item = self.kb[item_id]
            if self.human not in item(self.instanceof):
                continue
            if self.precise_date(item(self.date_of_birth)) and \
               self.precise_date(item(self.date_of_death)):
                continue
            parsed_record = sling.Store().parse(record)
            doc = sling.Document(parsed_record)
            raw_text = parsed_record['text']
            if len(raw_text) == 0:
                continue
            start_index = raw_text.find("<b>") + len("<b>")
            first = 1
            while first < len(doc.tokens) and \
                  doc.tokens[first].start <= start_index:
                first += 1
            last = first
            while last < len(doc.tokens) and doc.tokens[last].brk < 3:
                last += 1
            text = doc.phrase(max(0, first - 1), min(len(doc.tokens), last + 15))
            m = rec.match(text)
            if m is None:
                continue
            if text.find("(baptised") >= 0 or text.find("throne") >= 0:
                continue
            if text.find("(baptized") >= 0 or text.find("partner") >= 0:
                continue
            if m.group(2) or m.group(5):
                first = self.date_from_match(1, m)
                if first.year < 1753:
                    continue  # possibly Julian calendar date
                if m.group(8) or m.group(11):
                    second = self.date_from_match(7, m)
                    if second.year < 1753:
                        continue  # possibly Julian calendar date
                    facts = store.frame({
                        self.date_of_birth: first.value(),
                        self.date_of_death: second.value()
                    })
                else:
                    # Only one date match.
                    mg1 = m.group(1)
                    dob = item(self.date_of_birth)
                    dod = item(self.date_of_death)
                    if mg1 and max(mg1.find("died"), mg1.find("d.")) >= 0:
                        # Death date only.
                        if self.precise_date(dod):
                            continue
                        if self.same_year(first.year, dob):
                            continue  # b&d too close
                        facts = store.frame({
                            self.date_of_death: first.value(),
                        })
                    else:
                        # Birth date only.
                        if self.precise_date(dob):
                            continue
                        if self.same_year(first.year, dod):
                            continue  # b&d too close
                        facts = store.frame({
                            self.date_of_birth: first.value(),
                        })
            else:
                first = self.date_from_match(13, m)
                second = self.date_from_match(19, m)
                if min(first.year, second.year) < 1753:
                    continue  # possibly Julian
                facts = store.frame({
                    self.date_of_birth: first.value(),
                    self.date_of_death: second.value()
                })
            records += 1
            provenance = store.frame({
                self.url: parsed_record['url'],
                self.method: "English Wikipedia dates for '" + str(item.name) + "'"
            })
            fact = store.frame({
                self.item: item,
                self.facts: facts,
                self.provenance: provenance
            })
            record_file.write(item.id, fact.data(binary=True))
    record_file.close()
    print(records, "birth/death date records written to file:", self.out_file)
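# Hedged sanity check for the date pattern pieces above, using a stand-in
# months table; the full `pat` additionally anchors on the parenthesized
# birth/death clause of an article's first sentence.
import re

months = {"January": 1, "June": 6}
month = "(" + "|".join(months.keys()) + ")"
day, year = r"(\d{1,2})", r"(\d{4})"
date = ("(?:(?:" + day + " " + month + " " + year + ")|"
        "(?:" + month + " " + day + ", " + year + "))")
assert re.match(date, "12 June 1950")
assert re.match(date, "June 12, 1950")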
def compare(arg):
    base_reader = sling.RecordReader(arg.base)
    expt_reader = sling.RecordReader(arg.expt)

    commons = sling.Store()
    commons.load(arg.commons)
    schema = sling.DocumentSchema(commons)
    commons.freeze()
    store = sling.Store(commons)

    index = -1
    for (_, base_val), (_, expt_val) in zip(base_reader, expt_reader):
        index += 1
        base_doc = sling.Document(frame=store.parse(base_val), schema=schema)
        expt_doc = sling.Document(frame=store.parse(expt_val), schema=schema)
        checker = Checker(index, base_doc, expt_doc, arg.diff)

        # Basic checks.
        base = base_doc.frame["trace"]
        expt = expt_doc.frame["trace"]
        if base is None and expt is not None:
            checker.error('No trace in base document at index %d' % index)
        elif base is not None and expt is None:
            checker.error('No trace in expt document at index %d' % index)
        if base is None:
            continue

        # Traces should be over the same token range.
        checker.check_eq(base["begin"], expt["begin"], "Trace Begin")
        checker.check_eq(base["end"], expt["end"], "Trace End")

        # Check LSTM features.
        base_lstm = base["/trace/lstm_features"]
        expt_lstm = expt["/trace/lstm_features"]
        checker.check_eq(len(base_lstm), len(expt_lstm), "LSTM Features Length")
        for i in range(len(base_lstm)):
            checker.frame_eq(base_lstm[i], expt_lstm[i],
                             "LSTM features for token %d (%s)" %
                             (i, base_doc.tokens[i].word))

        # Check steps.
        base_steps = base["/trace/steps"]
        expt_steps = expt["/trace/steps"]
        min_steps = min(len(base_steps), len(expt_steps))
        for i in range(min_steps):
            message = "Step %d's current token index" % i
            checker.check_eq(base_steps[i]["/trace/current"],
                             expt_steps[i]["/trace/current"], message)

            # Check FF features for the step.
            base_ff = base_steps[i]["/trace/ff_features"]
            expt_ff = expt_steps[i]["/trace/ff_features"]
            checker.check_eq(len(base_ff), len(expt_ff),
                             "# of FF features for step %d" % i)
            base_dict = {f["/trace/feature"]: f["/trace/values"] for f in base_ff}
            expt_dict = {f["/trace/feature"]: f["/trace/values"] for f in expt_ff}
            for k, v in base_dict.items():
                checker.check_eq(k in expt_dict, True,
                                 "Step %d: FF feature %s not in expt" % (i, k))
                checker.check_eq(v, expt_dict[k],
                                 "Step %d: FF feature %s has a different value in expt" % (i, k))
            for k, v in expt_dict.items():
                checker.check_eq(k in base_dict, True,
                                 "Step %d: FF feature %s not in base" % (i, k))

            # Check action(s) in the step.
            base_actions = base_steps[i]["/trace/actions"]
            expt_actions = expt_steps[i]["/trace/actions"]
            for idx in range(min(len(base_actions), len(expt_actions))):
                checker.frame_eq(base_actions[idx]["/trace/predicted"],
                                 expt_actions[idx]["/trace/predicted"],
                                 "Step %d, predicted action %d" % (i, idx),
                                 ["/trace/_str"])
                checker.frame_eq(base_actions[idx]["/trace/final"],
                                 expt_actions[idx]["/trace/final"],
                                 "Step %d, final action %d" % (i, idx),
                                 ["/trace/_str"])

            # There should be the same number of actions in the step.
            checker.check_eq(len(base_actions), len(expt_actions),
                             "Step %d: # of actions" % i)

        # There should be the same number of steps.
        checker.check_eq(len(base_steps), len(expt_steps), "# of Steps")

    base_reader.close()
    expt_reader.close()
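# Sketch of invoking compare() with an argparse-style namespace; the paths
# are placeholders for real base/experiment record files and a commons store.
import argparse

arg = argparse.Namespace(base="/tmp/base.rec", expt="/tmp/expt.rec",
                         commons="/tmp/commons", diff=True)
compare(arg)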
def handle_category(self, qid, form):
    def is_on(name):
        return form.getvalue("main_form_" + name) == "on"

    # Various options.
    show_span_qid = is_on("show_span_qid")
    show_prelim_parse_scores = is_on("show_prelim_parse_scores")
    show_span_scores = is_on("show_span_scores")
    show_fact_matches = is_on("show_fact_matching_statistics")
    show_span_fact_matches = is_on("show_span_fact_match_stats")
    show_similar_categories = is_on("show_similar_categories")
    signature_type = form.getvalue("main_form_signature_type")
    metric = form.getvalue("main_form_sort_metric")
    fact_weights = self.fact_match_weights(form)

    frame = browser_globals.category_frame[qid]
    document = sling.Document(frame=frame.document)
    num = len([p for p in frame("parse")])
    self._tag("div", "<b>%s = %s</b>: %d members, %d parses" %
              (qid, frame.name, len(frame.members), num))
    self._br()

    # Write the parses in a tabular format.
    show_prelim_parse_scores &= metric != "prelim_parse_score"
    self.write_main_table_header(
        "Signature",
        [t.word for t in document.tokens],
        "Metric",
        "Prelim Scores" if show_prelim_parse_scores else None,
        [t.name for t in FactMatchType] if show_fact_matches else None,
        "Matching Categories" if show_similar_categories else None)

    # Each parse is written as one row.
    parses = [(parse, self.parse_score(frame, parse, metric, fact_weights))
              for parse in frame("parse")]
    parses.sort(key=lambda x: -x[1])
    for parse, metric_value in parses:
        signature = util.parse_signature(parse, signature_type)
        self._begin("tr")
        self._begin("td")
        self._form_anchor(signature, signature)
        self._end("td")
        self._separator(header=False)

        prev_span_end = -1
        for span in parse.spans:
            for index in range(prev_span_end + 1, span.begin):
                self._empty_cell()
            self._begin("td", colspan=span.end - span.begin, align='middle')
            text = util.span_signature(span, signature_type)
            if show_span_qid:
                text += " (" + str(span.qid) + ")"
            title = '.'.join([str(p) for p in span.pids]) + ' = ' + str(span.qid)
            if "name" in span.qid:
                title += " (" + span.qid["name"] + ")"
            self._tag("span", text, title=title)
            if show_span_scores and "prior" in span:
                self._br()
                self._text("%s = %0.4f" % ("prior", span.prior))
            if show_span_fact_matches:
                local_counts = util.fact_matches_for_span(span)
                self._br()
                self._begin("table class='span_fact_match'")
                self._begin("thead")
                for t in FactMatchType:
                    self._tag("th", t.name)
                self._end("thead")
                self._begin("tr")
                self.write_fact_match_counts(local_counts)
                self._end(["tr", "table"])
            self._end("td")
            prev_span_end = span.end - 1
        for index in range(prev_span_end + 1, len(document.tokens)):
            self._empty_cell()

        self._separator(header=False)
        if type(metric_value) is int:
            self._cell(metric_value)
        else:
            self._cell("%.4f" % metric_value)

        if show_prelim_parse_scores:
            self._separator(header=False)
            self._begin("td class='numeric'")
            for score_type in ["prior", "member_score", "cover"]:
                if score_type in parse:
                    self._text("%s = %0.4f" % (score_type, parse[score_type]))
                    self._br()
            if "score" in parse:
                self._color_text("Overall = %0.4f" % parse.score, "blue")
            self._end("td")

        if show_fact_matches:
            self._separator(header=False)
            total_fact_counts = util.fact_matches_for_parse(parse)
            self.write_fact_match_counts(total_fact_counts)

        if show_similar_categories:
            self._separator(header=False)
            self._begin("td")
            limit = 5
            signature_mapping = browser_globals.full_signature_to_parse
            if signature_type == "coarse":
                signature_mapping = browser_globals.coarse_signature_to_parse
            seen = set()
            for (other_qid, other_category, other_parse) in \
                    signature_mapping[signature]:
                if len(seen) >= limit:
                    break
                if other_qid != qid and other_qid not in seen:
                    seen.add(other_qid)
                    self._text(other_category.name)
                    self._form_anchor(" (= %s)" % other_qid, other_qid)
                    self._text(" (%0.4f)" % other_parse.score)
                    self._br()
            self._end("td")
        self._end("tr")
    self._end("table")
def next(self):
    _, data = self.input.next()
    f = sling.Store(self.commons).parse(data)
    return sling.Document(f, schema=self.docschema)
def __getitem__(self, key):
    data = self.input.lookup(key)
    f = sling.Store(self.commons).parse(data)
    return sling.Document(f, schema=self.docschema)
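# Sketch of both accessors, assuming `corpus` is an instance of the
# enclosing class and the record file is keyed by WikiData ids:
doc = corpus['Q42']    # random access via __getitem__
nxt = corpus.next()    # sequential access
print(len(doc.tokens), len(doc.mentions))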
def __init__(self, commons_store: sling.Store, schema: sling.DocumentSchema,
             doc_name: str):
    self.store = sling.Store(commons_store)
    self.schema = schema
    self.doc_name = doc_name
    self.doc = sling.Document(None, self.store, self.schema)
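# Hedged sketch of populating an empty document like the one created above,
# assuming SLING's document API (add_token / add_mention / evoke / update);
# the entity id is made up.
commons = sling.Store()
docschema = sling.DocumentSchema(commons)
commons.freeze()
store = sling.Store(commons)
doc = sling.Document(None, store, docschema)
for word in ["John", "loves", "Mary"]:
    doc.add_token(word)
mention = doc.add_mention(0, 1)            # tokens [0, 1) evoke a frame
mention.evoke(store.frame({"id": "Q42"}))  # assumed entity id
doc.update()                               # write annotations to the frame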
def link_documents(self, max_n=None, fact_out_file="/tmp/facts.json", qry_out_file="/tmp/queries.json", para_out_file="/tmp/paragraphs.json", filter_subjects=None, exclude_subjects=None): """Load n documents and link them to facts.""" start = time.time() fout = open(fact_out_file, "w") fq_out, fp_out = open(qry_out_file, "w"), open(para_out_file, "w") seen_articles = set() total_paras = 0 for n, (doc_id, doc_raw) in enumerate(self.corpus.input): doc_id = str(doc_id, "utf-8") if n % 1000 == 0: print("processed", n, "items in %.1f" % (time.time() - start), "sec") if max_n is not None and random.uniform( 0, 1) > (float(max_n) / 550000): continue if filter_subjects is not None and doc_id not in filter_subjects: continue if exclude_subjects is not None and doc_id in exclude_subjects: continue # get kb items seen_articles.add(doc_id) kb_item = self.kb[doc_id] tail_entities = {} all_properties = [] for prop, tail in kb_item: tup = self.get_canonical_property(prop, tail) if tup is None: tup = self.get_date_property(prop, tail) if tup is None: continue tail_entities[tup[1]] = tup[0] all_properties.append(tup[0]) store = sling.Store(self.commons) document = sling.Document(store.parse(doc_raw), store, self.docschema) if not document.tokens: print("Skipping %s No tokens." % (doc_id)) continue # build token maps tok_to_sent_id, tok_to_para_id, sent_to_span, para_to_span = {}, {}, {}, {} tok_to_char_offset = {} offset = 0 sent_begin = para_begin = 0 sent_id = para_id = 0 for ii, token in enumerate(document.tokens): if ii > 0 and token.brk == 4: para_to_span[para_id] = (para_begin, ii) sent_to_span[sent_id] = (sent_begin, ii) para_id += 1 sent_id += 1 sent_begin = para_begin = ii elif ii > 0 and token.brk == 3: sent_to_span[sent_id] = (sent_begin, ii) sent_id += 1 sent_begin = ii tok_to_sent_id[ii] = sent_id tok_to_para_id[ii] = para_id tok_to_char_offset[ii] = offset offset += len(token.word) + 1 para_to_span[para_id] = (para_begin, len(document.tokens)) sent_to_span[sent_id] = (sent_begin, len(document.tokens)) # find subjects sent_to_subj, para_to_subj = (collections.defaultdict(list), collections.defaultdict(list)) para_to_ment = collections.defaultdict(list) ment_to_para_index = {} mentid_to_linked_entity = {} sorted_mentions = sorted(document.mentions, key=lambda m: m.begin) for ii, mention in enumerate(sorted_mentions): if tok_to_sent_id[mention.begin] != tok_to_sent_id[mention.end - 1]: continue linked_entity = self.get_linked_entity(mention) mentid_to_linked_entity[ii] = linked_entity para_id = tok_to_para_id[mention.begin] para_to_ment[para_id].append((mention, linked_entity)) ment_to_para_index[ii] = len(para_to_ment[para_id]) - 1 if linked_entity == doc_id: sent_id = tok_to_sent_id[mention.begin] sent_to_subj[sent_id].append(ii) para_to_subj[para_id].append(ii) # save paragraphs local_to_global_para = {} for para_id, para_span in para_to_span.items(): if para_span[1] - para_span[0] < MIN_LEN: continue if len(para_to_ment[para_id]) <= 1: continue local_to_global_para[para_id] = total_paras fp_out.write( self.serialize_para(document, tok_to_char_offset, para_span, para_to_ment[para_id], doc_id, total_paras) + "\n") total_paras += 1 # find tails seen_properties = {} for ii, mention in enumerate(sorted_mentions): # first look for sentence matches linked_entity = mentid_to_linked_entity[ii] if linked_entity == doc_id: continue if linked_entity in tail_entities: if tail_entities[linked_entity] in seen_properties: continue my_sent = tok_to_sent_id[mention.begin] my_para = 
tok_to_para_id[mention.begin] para_span = para_to_span[my_para] if my_para not in local_to_global_para: continue if my_sent in sent_to_subj: # sent_span = sent_to_span[my_sent] fq_out.write( self.serialize_query(local_to_global_para[my_para], ment_to_para_index, doc_id, sent_to_subj[my_sent], tail_entities[linked_entity], linked_entity, ii, "sentence") + "\n") subj_mentions = [ para_to_ment[my_para][ment_to_para_index[mm]][0] for mm in sent_to_subj[my_sent] ] fout.write( self.serialize_fact( document, tok_to_char_offset, para_span, doc_id, subj_mentions, tail_entities[linked_entity], linked_entity, mention, "sentence") + "\n") seen_properties[tail_entities[linked_entity]] = my_para self.relation_stats["sentences"][ tail_entities[linked_entity]] += 1 for ii, mention in enumerate(sorted_mentions): # next look for paragraph matches linked_entity = mentid_to_linked_entity[ii] if linked_entity == doc_id: continue if linked_entity in tail_entities: if tail_entities[linked_entity] in seen_properties: continue my_para = tok_to_para_id[mention.begin] para_span = para_to_span[my_para] if my_para not in local_to_global_para: continue if my_para in para_to_subj: fq_out.write( self.serialize_query(local_to_global_para[my_para], ment_to_para_index, doc_id, para_to_subj[my_para], tail_entities[linked_entity], linked_entity, ii, "paragraph") + "\n") subj_mentions = [ para_to_ment[my_para][ment_to_para_index[mm]][0] for mm in para_to_subj[my_para] ] fout.write( self.serialize_fact( document, tok_to_char_offset, para_span, doc_id, subj_mentions, tail_entities[linked_entity], linked_entity, mention, "paragraph") + "\n") seen_properties[tail_entities[linked_entity]] = my_para self.relation_stats["paragraphs"][ tail_entities[linked_entity]] += 1 # add negatives max_neg = len(seen_properties) num_neg = 0 all_para_id = list(para_to_subj.keys()) if not all_para_id: continue for tail, prop in tail_entities.items(): if num_neg == max_neg: break if prop in seen_properties: continue random_para_id = random.choice(all_para_id) random_para_span = para_to_span[random_para_id] subj_mentions = [ para_to_ment[random_para_id][ment_to_para_index[mm]][0] for mm in para_to_subj[random_para_id] ] fout.write( self.serialize_fact( document, tok_to_char_offset, random_para_span, doc_id, subj_mentions, prop, None, None, "entity negative") + "\n") num_neg += 1 seen_properties[prop] = None self.relation_stats["entity negatives"][prop] += 1 fout.close() fq_out.close() fp_out.close() print("Sentences -- ", sum(self.relation_stats["sentences"].values())) print(" :: ".join( "%s:%d" % (k, v) for k, v in self.relation_stats["sentences"].items())) print("Paragraphs -- ", sum(self.relation_stats["paragraphs"].values())) print(" :: ".join( "%s:%d" % (k, v) for k, v in self.relation_stats["paragraphs"].items()))
    'Q57652',    # Helle Thorning-Schmidt
    'Q1636974',  # Danske Bank
    'Q186285',   # University of Copenhagen
    'Q1687170',  # Jens Christian Skou
]

articles = sling.RecordDatabase("data/e/wiki/en/[email protected]")
output = sling.RecordWriter("/tmp/chunked.rec")

for docid in documentids:
    # Read document from article database.
    store = sling.Store(commons)
    if docid.startswith("Q"):
        record = articles.lookup(docid)
        article = store.parse(record)
        document = sling.Document(article, schema=docschema)
        document.remove_annotations()
        document.update()
    else:
        document = sling.tokenize(docid, store=store, schema=docschema)
    print(document.frame["title"])

    begin = 0
    while begin < len(document.tokens):
        # Find next sentence.
        end = begin + 1
        while end < len(document.tokens) and \
              document.tokens[end].brk < sling.SENTENCE_BREAK:
            end += 1
        print("s:", document.phrase(begin, end))
        begin = end