def prepare_sling_input_corpus(nq_data, sling_input_corpus):
  """Parses each paragraph in NQ (LA candidate, LA, SA, question).

  Prepares a sling corpus to do entity linking.

  Args:
    nq_data: A python dictionary containing NQ data of one train/dev shard.
    sling_input_corpus: A filename string to write the sling format documents
      into.
  """
  corpus = sling.RecordWriter(sling_input_corpus)
  for i in nq_data.keys():
    tokens = nq_data[i]["document_tokens"]
    if ARGS.annotate_candidates:
      for idx, la_cand in enumerate(nq_data[i]["long_answer_candidates"]):
        answer, answer_map, doc = extract_and_tokenize_text(la_cand, tokens)
        if answer:
          nq_data[i]["long_answer_candidates"][idx]["text_answer"] = answer
          nq_data[i]["long_answer_candidates"][idx]["answer_map"] = answer_map
          key = i + "|candidate|" + str(idx) + "|i"
          corpus.write(key, doc.frame.data(binary=True))
    if ARGS.annotate_short_answers:
      for idx, ann in enumerate(nq_data[i]["annotations"]):
        short_ans = ann["short_answers"]
        if not short_ans:
          continue
        for sid in range(len(short_ans)):
          ans = short_ans[sid]
          answer, answer_map, doc = extract_and_tokenize_text(ans, tokens)
          if answer:
            nq_data[i]["annotations"][idx]["short_answers"][sid][
                "text_answer"] = answer
            nq_data[i]["annotations"][idx]["short_answers"][sid][
                "answer_map"] = answer_map
            key = i + "|annotated_short_answer|" + str(idx) + "|" + str(sid)
            corpus.write(key, doc.frame.data(binary=True))
    if ARGS.annotate_long_answers:
      for idx, ann in enumerate(nq_data[i]["annotations"]):
        long_ans = ann["long_answer"]
        answer, answer_map, doc = extract_and_tokenize_text(long_ans, tokens)
        if answer:
          nq_data[i]["annotations"][idx]["long_answer"]["text_answer"] = answer
          nq_data[i]["annotations"][idx]["long_answer"]["answer_map"] = (
              answer_map)
          key = i + "|annotated_long_answer|" + str(idx) + "|i"
          corpus.write(key, doc.frame.data(binary=True))
    if ARGS.annotate_question:
      doc = sling.tokenize(str(nq_data[i]["question_text"]))
      key = i + "|question|i|i"
      corpus.write(key, doc.frame.data(binary=True))
  corpus.close()
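# Usage sketch (hypothetical helper, not part of the original pipeline): after
# prepare_sling_input_corpus has run, the record keys encode which NQ field
# each document came from, so a quick scan of the corpus is enough to
# sanity-check the output before running entity linking.
def inspect_sling_corpus(path):
  """Prints the keys written by prepare_sling_input_corpus."""
  reader = sling.RecordReader(path)
  for key, _ in reader:
    # Keys look like "<example id>|candidate|<idx>|i",
    # "<example id>|annotated_short_answer|<idx>|<sid>", or
    # "<example id>|question|i|i".
    print(key)
  reader.close()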
def __init__(self, cats, catid):
  # Get category from knowledge base.
  self.cats = cats
  self.store = sling.Store(cats.commons)
  self.category = cats.commons[catid]
  if self.category is None:
    raise Exception("unknown item")
  if not cats.is_category(self.category):
    raise Exception("not a category")

  # Tokenize category title.
  # TODO(ringgaard): Use language-dependent category title.
  name = self.category.name
  colon = name.find(':')
  if colon != -1:
    name = name[colon + 1:]
  self.title = name
  self.doc = sling.tokenize(name, store=self.store)

  # Read members.
  self.members = self.store.parse(cats.member_db.lookup(catid))

  # Build fact distribution for members.
  self.facts = {}
  self.targets = {}
  self.num_members = 0
  for member in self.members(cats.n_item_member):
    if cats.is_category(member):
      continue
    member_facts = self.cats.extractor.facts(self.store, member)
    member_targets = set()
    for fact in member_facts:
      target = fact[-1]
      increment_key(self.facts, fact)
      member_targets.add(target)
    for target in member_targets:
      increment_key(self.targets, target)
    self.num_members += 1
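# A minimal sketch (hypothetical helper, not part of the class above): rank the
# fact distribution built in __init__ to see which facts are shared by the most
# category members. `category` is an instance of the class whose __init__ is
# shown above; `facts` maps fact tuples to member counts via increment_key.
def top_facts(category, n=10):
  """Returns the n most frequent member facts as (fact, count) pairs."""
  ranked = sorted(category.facts.items(), key=lambda kv: kv[1], reverse=True)
  return ranked[:n]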
def extract_and_tokenize_text(item, tokens):
  """Extracts the tokens of a passage and tokenizes them with the sling tokenizer."""
  start_token = item["start_token"]
  end_token = item["end_token"]
  if start_token >= 0 and end_token >= 0:
    # Drop HTML tokens and join the remaining token texts into a string.
    non_html_tokens = [
        x for x in tokens[start_token:end_token] if not x["html_token"]
    ]
    answer = " ".join([x["token"] for x in non_html_tokens])
    # answer_map enumerates the retained (non-HTML) tokens.
    answer_map = [idx for idx, x in enumerate(non_html_tokens)]
    doc = sling.tokenize(str(answer))
    return answer, answer_map, doc
  return "", [], None
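# Example (hypothetical NQ-style token list): extract the text between two
# token offsets, skipping HTML tokens. Offsets of -1 follow the NQ convention
# for "no span" and yield an empty result.
example_tokens = [
    {"token": "<p>", "html_token": True},
    {"token": "Sling", "html_token": False},
    {"token": "tokenizer", "html_token": False},
]
span = {"start_token": 0, "end_token": 3}
answer, answer_map, doc = extract_and_tokenize_text(span, example_tokens)
# answer == "Sling tokenizer", answer_map == [0, 1], doc is a sling document.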
def run(self, task):
  self.init(task)

  writer = sling.RecordWriter(task.output("output").name)
  rejected = sling.RecordWriter(task.output("rejected").name)
  inputs = [t.name for t in task.inputs("items")]

  for filename in inputs:
    reader = sling.RecordReader(filename)
    for index, (key, value) in enumerate(reader):
      store = sling.Store(self.kb)
      frame = store.parse(value)

      # Only process category items.
      if not self.is_category(frame):
        rejected.write(key, "not_category")
        continue

      # See if the category should be skipped.
      members = self.get_members(frame)
      reject, reason = self.reject(key, frame, members)
      if reject:
        task.increment("skipped_categories/" + reason)
        rejected.write(key, reason)
        continue

      # First, collect the targets of all facts of all category members.
      qp_counts = self.qid_pid_counts(store, members)

      # Next, tokenize the category title.
      title = self.get_title(frame)
      colon = title.find(':')
      title = title[colon + 1:]
      document = sling.tokenize(title, store)

      # Next, find matches for all spans. These are reported as a list,
      # where ith item = spans that begin at token i (possibly an empty list).
      begin_to_spans = self.compute_spans(document, qp_counts)

      # Construct maximal parses with non-overlapping spans.
      parses = self.construct_parses(begin_to_spans)

      # Post-process parses.
      parses = self.post_process(parses)
      if not parses or (len(parses) == 1 and not parses[0]):
        task.increment("skipped_categories/no_parses")
        rejected.write(key, "no_parses")
        continue

      # Write parses as frames.
      frame = store.frame({"name": title, "members": members})
      frame["document"] = document.frame
      for parse in parses:
        span_array = store.array(len(parse))
        for i, span in enumerate(parse):
          span_array[i] = store.frame({
              "begin": span.begin,
              "end": span.end,
              "qid": span.qid,
              "prior": span.prior,
              "pids": list(span.pids),
              "count": span.count
          })
        parse_frame = store.frame({"spans": span_array})
        frame.append("parse", parse_frame)
      writer.write(key, frame.data(binary=True))
      task.increment("categories_accepted")

      # Compute histogram over number of parses.
      for b in self.num_parses_bins:
        if len(parses) <= b:
          task.increment("#parses <= %d" % b)
      if self.num_parses_bins[-1] < len(parses):
        task.increment("#parses > %d" % self.num_parses_bins[-1])
    reader.close()

  writer.close()
  rejected.close()
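# Reader sketch (hypothetical debug helper, assumed to run against the
# "output" records written above): walks the accepted categories and prints
# each parse, using only the frame slots ("name", "parse", "spans", and the
# per-span fields) created in run().
def dump_parses(path, kb):
  """Prints the span annotations of every parse of each accepted category."""
  reader = sling.RecordReader(path)
  for key, value in reader:
    store = sling.Store(kb)
    frame = store.parse(value)
    print("%s %s" % (key, frame["name"]))
    for parse in frame("parse"):
      for span in parse["spans"]:
        print("  [%d:%d) qid=%s count=%d" %
              (span["begin"], span["end"], span["qid"], span["count"]))
  reader.close()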
          subevokes.append(subframe)
          self.evoke(subframe, b, b + l)
          break

    # Match subspans.
    for subitem in subevokes:
      b, e = self.evokes[subitem]
      for i in xrange(b, e):
        self.covered[i] = False
      self.match(subitem, b, e)


while True:
  text = raw_input("name: ")
  doc = sling.tokenize(text, store=sling.Store(commons))
  name = Name(doc)
  name.match(None, 0, len(doc.tokens))

  print
  print "Analysis:"
  print doc.tolex()
  print

  phrases = {}
  for m in doc.mentions:
    for f in m.evokes():
      phrases[f] = doc.phrase(m.begin, m.end)

  print "Structure:"
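  # Plausible continuation (assumption: the snippet is truncated here and the
  # original goes on to print each evoked frame together with the phrase that
  # evoked it, mirroring the "Analysis:" section above).
  for f, phrase in phrases.items():
    print("%s: %s" % (phrase, f.data(pretty=True)))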
]

# English Wikipedia document database (sharded SLING record files).
articles = sling.RecordDatabase("data/e/wiki/en/documents@10.rec")
output = sling.RecordWriter("/tmp/chunked.rec")

for docid in documentids:
  # Read document from article database.
  store = sling.Store(commons)
  if docid.startswith("Q"):
    record = articles.lookup(docid)
    article = store.parse(record)
    document = sling.Document(article, schema=docschema)
    document.remove_annotations()
    document.update()
  else:
    document = sling.tokenize(docid, store=store, schema=docschema)

  print document.frame["title"]

  begin = 0
  while begin < len(document.tokens):
    # Find next sentence.
    end = begin + 1
    while end < len(document.tokens) and \
          document.tokens[end].brk < sling.SENTENCE_BREAK:
      end += 1
    print "s:", document.phrase(begin, end)

    length = end - begin

    # Find punctuations and case forms.
    punct = [False] * length
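# Standalone sketch (hypothetical helper, same logic as the sentence loop
# above): yield (begin, end) token ranges for each sentence in a document,
# using the break level the tokenizer stored on each token.
def sentences(document):
  begin = 0
  while begin < len(document.tokens):
    end = begin + 1
    while (end < len(document.tokens) and
           document.tokens[end].brk < sling.SENTENCE_BREAK):
      end += 1
    yield begin, end
    begin = end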