def upload_task(self, passage):
    # Upload the passage text, then create and submit a tokenization task
    # followed by an annotation task that builds on its output.
    passage_out = self.create_passage(text=to_text(passage, sentences=False)[0], type="PUBLIC", source=self.source)
    task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project, user=self.user,
                   passage=passage_out, manager_comment=passage.ID, user_comment=passage.ID,
                   parent=None, is_demo=False, is_active=True)
    tok_task_out = self.create_tokenization_task(**task_in)
    tok_user_task_in = dict(tok_task_out)
    tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
    tok_user_task_out = self.submit_tokenization_task(**tok_user_task_in)
    task_in.update(parent=tok_task_out, type="ANNOTATION")
    ann_user_task_in = self.create_annotation_task(**task_in)
    ann_user_task_in.update(to_json(passage, return_dict=True, tok_task=tok_user_task_out,
                                    all_categories=self.layer["categories"]))
    return self.submit_annotation_task(**ann_user_task_in)
def __init__(self, enabled=True):
    # DBpedia Spotlight settings, overridable via environment variables.
    self.address = os.environ.get("SPOTLIGHT_ADDRESS", "http://model.dbpedia-spotlight.org/en/annotate")
    self.confidence = float(os.environ.get("SPOTLIGHT_CONFIDENCE", 0.3))
    self.text = None
    self.spots = ()
    self.passage_texts = keydefaultdict(lambda passage: to_text(passage, sentences=False)[0])
    self.enabled = enabled
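# A minimal configuration sketch for the initializer above, assuming it belongs
# to a Spotlight annotator class; the class name SpotlightAnnotator and the
# local endpoint URL are assumptions for illustration only.
import os

os.environ["SPOTLIGHT_ADDRESS"] = "http://localhost:2222/rest/annotate"  # assumed self-hosted endpoint
os.environ["SPOTLIGHT_CONFIDENCE"] = "0.5"  # stricter threshold than the 0.3 default
annotator = SpotlightAnnotator(enabled=True)  # hypothetical class wrapping the __init__ above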
def test_parser(config, model_type, formats, default_setting, text=True):
    filename = "test_files/models/%s_%s%s" % ("_".join(formats), model_type, default_setting.suffix())
    remove_existing(filename)
    config.update(default_setting.dict())
    scores = []
    params = []
    passages = list(map(load_passage, passage_files(*formats)))
    evaluate = ("amr" not in formats)
    for mode in "train", "load":
        print("-- %sing %s" % (mode, model_type))
        config.update(dict(classifier=model_type, copy_shared=None))
        p = Parser(model_files=filename, config=config)
        p.save_init = True
        list(p.train(passages if mode == "train" else None, dev=passages, test=True, iterations=2))
        assert p.model.is_finalized, "Model should be finalized after %sing" % mode
        assert not getattr(p.model.feature_extractor, "node_dropout", 0), p.model.feature_extractor.node_dropout
        all_params = p.model.all_params()
        params.append(all_params)
        param1, param2 = [d.get("W") for d in (all_params, p.model.feature_extractor.params)]
        if param1 is not None and param2 and param2.init is not None and not config.args.update_word_vectors:
            assert_allclose(param1, weight_decay(p.model) * param2.init, rtol=1e-6)
        text_results = results = list(p.parse(passages, evaluate=evaluate))
        if text:
            print("Converting to text and parsing...")
            text_results = list(p.parse(
                [p3 for p1 in passages for p2 in convert.to_text(p1, sentences=False)
                 for p3 in convert.from_text(p2, p1.ID, extra_format=p1.extra.get("format"))]))
            assert len(results) == len(text_results)
        if evaluate:
            scores.append(Scores(tuple(zip(*results))[1]).average_f1())
            if text:
                for t, (r, s) in zip(text_results, results):
                    print(" %s F1=%.3f" % (r.ID, s.average_f1()))
        assert not list(p.parse(()))  # parsing nothing returns nothing
        print()
    assert_all_params_equal(*params)
    if evaluate:
        print("-- average f1: %.3f, %.3f\n" % tuple(scores))
        assert scores[0] == pytest.approx(scores[1], 0.1)
def upload_task(self, passage, log=None, submit=True, ids=None, upload=True):
    if ids:  # reuse previously created passage and tasks
        passage_id, tok_id, ann_id = ids[passage.ID]
        passage_out = self.get_passage(passage_id)
        tok_user_task_out = tok_task_out = self.get_user_task(tok_id)
        ann_user_task_in = self.get_user_task(ann_id)
    else:
        passage_out = self.create_passage(text=to_text(passage, sentences=False)[0], type="PUBLIC",
                                          source=self.source, external_id=passage.ID) if upload else passage
        task_in = dict(type="TOKENIZATION", status="ONGOING", project=self.project, user=self.user,
                       passage=passage_out, manager_comment=passage.ID, user_comment=passage.ID,
                       parent=None, is_demo=False, is_active=True)
        tok_task_out = self.create_task(**task_in) if upload else task_in
        tok_user_task_in = dict(tok_task_out)
        tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
        tok_user_task_out = self.submit_task(**tok_user_task_in) if upload else tok_user_task_in
        task_in.update(parent=tok_task_out, type="ANNOTATION")
        ann_user_task_in = self.create_task(**task_in) if upload else task_in
    ann_user_task_in.update(to_json(passage, return_dict=True, tok_task=tok_user_task_out,
                                    all_categories=self.layer["categories"]))
    ann_user_task_out = self.submit_task(**ann_user_task_in, submit=submit) if upload else ann_user_task_in
    if log:
        print(passage.ID, passage_out["id"], tok_task_out["id"], ann_user_task_out["id"],
              file=log, sep="\t", flush=True)
    return ann_user_task_out
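# A hedged helper sketch: rebuild the `ids` mapping that upload_task expects
# from a log previously written with log=...; each line holds the external
# passage ID, passage id, tokenization task id and annotation task id,
# tab-separated. The name read_ids is an assumption, not part of the API.
def read_ids(log_path):
    ids = {}
    with open(log_path, encoding="utf-8") as f:
        for line in f:
            external_id, passage_id, tok_id, ann_id = line.rstrip("\n").split("\t")
            ids[external_id] = (passage_id, tok_id, ann_id)
    return ids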
def train_test(self, model_type, compare=True):
    scores = []
    p = None
    for mode in "train", "load":
        print("-- %sing %s" % (mode, model_type))
        p = Parser(model_file="test_files/models/%s" % model_type, model_type=model_type)
        p.train(self.load_passages() if mode == "train" else None, iterations=200)
        score = evaluation.Scores.aggregate([s for _, s in p.parse(self.load_passages(), evaluate=True)])
        scores.append(score.average_f1())
        print()
    print("-- average labeled f1: %.3f, %.3f\n" % tuple(scores))
    if compare:
        self.assertAlmostEqual(*scores)
    p.parse(convert.to_text(self.load_passages()[0]))
    self.assertFalse(list(p.parse(())))  # parsing nothing returns nothing
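# A minimal train-then-parse sketch reusing only the calls seen in train_test
# above; the model path, model type, and iteration count are illustrative
# assumptions, and the passages are assumed to be provided by the caller.
def demo_train_and_parse(passages):
    p = Parser(model_file="models/sparse", model_type="sparse")
    p.train(passages, iterations=10)
    for passage, score in p.parse(passages, evaluate=True):
        print(passage.ID, score.average_f1())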
def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        sentences = split2sentences(passage)
        for i, sen in enumerate(sentences):
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))
            compounds = []
            for node in sen.nodes:
                if sen.nodes[node].layer.ID == '0':  # terminal (word) layer
                    l = sen.nodes[node]
                    if l.parents[0].ftag == 'C':  # terminal is part of a compound
                        if l.parents[0].ID not in compounds:
                            compounds.append(l.parents[0].ID)
                            tmp_c = [n.text for n in l.parents[0].children]
                            path = [' '.join(tmp_c)]
                            path = find_path(sen.nodes[l.parents[0].ID], path)
                            print(' '.join(path))
                            print('-------')
                    else:
                        path = find_path(sen.nodes[l.ID], [])
                        print(' '.join(path))
                        print('-------')
        print('------------------------------------------------------------------')
def show_query(fnode):
    print('\n'.join(convert.to_text(fnode.root)))
    print(str(fnode.get_top_scene()))
    print(str(fnode))
def write_text(passage, f, sentences, lang, prepend_id=False):
    for line in to_text(passage, sentences=sentences, lang=lang):
        fields = [passage.ID, line] if prepend_id else [line]
        print(*fields, file=f, sep="\t")
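# A usage sketch for write_text, assuming the ucca package is installed;
# convert.from_text builds a minimal passage from pre-tokenized text, and the
# sample sentence is arbitrary.
import sys
from ucca import convert

passage = next(convert.from_text(["The dog barked ."], passage_id="1"))
write_text(passage, sys.stdout, sentences=False, lang="en", prepend_id=True)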
def test_to_text(self):
    passage = convert.from_standard(TestUtil.load_xml("test_files/standard3.xml"))
    self.assertEqual(convert.to_text(passage, False)[0],
                     "1 2 3 4 . 6 7 8 9 10 . 12 13 14 15")
    self.assertSequenceEqual(convert.to_text(passage, True),
                             ["1 2 3 4 .", "6 7 8 9 10 .", "12 13 14 15"])
def test_to_text():
    passage = loaded()
    assert convert.to_text(passage, False)[0] == "1 2 3 4 . 6 7 8 9 10 . 12 13 14 15"
    assert convert.to_text(passage, True) == ["1 2 3 4 .", "6 7 8 9 10 .", "12 13 14 15"]
def test_to_text(self):
    passage = convert.from_standard(self._load_xml('./standard3.xml'))
    self.assertEqual(convert.to_text(passage, False)[0],
                     '1 2 3 4 . 6 7 8 9 10 . 12 13 14 15')
    self.assertSequenceEqual(convert.to_text(passage, True),
                             ['1 2 3 4 .', '6 7 8 9 10 .', '12 13 14 15'])
def main(args):
    with open(args.outfile, "w", encoding="utf-8") as f:
        for passage in get_passages_with_progress_bar(sorted(args.filenames, key=numeric),
                                                      desc="Converting to text"):
            for line in to_text(passage, lang=args.lang):
                print(line, file=f)
    print("Wrote '%s'." % args.outfile)
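# Hypothetical CLI wiring for main() above, assuming argparse; the positional
# and option names mirror the attributes used (filenames, outfile, lang) but
# are otherwise assumptions.
import argparse

argparser = argparse.ArgumentParser(description="Convert UCCA passages to plain text")
argparser.add_argument("filenames", nargs="+", help="passage files to convert")
argparser.add_argument("-o", "--outfile", default="text.txt", help="output text file")
argparser.add_argument("-l", "--lang", default="en", help="language for tokenization")
main(argparser.parse_args())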
def write_text(passage, f, lang):
    for line in to_text(passage, lang=lang):
        print(line, file=f)
def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        xmltoconll(passage)
        sentences = split2sentences(passage)
        for i, sen in enumerate(sentences):
            print('sentence %d\n\n%s\n' % (i, convert.to_text(sen)))
        while True:
            word = input('\nType the word below\n\n')
            path = []  # stays empty if the word is not found in the passage
            for node in passage.nodes:
                t = passage.nodes[node]
                if re.match(rf'\b{word}\b', t.text, re.IGNORECASE):
                    path = find_path(passage.nodes[t.ID], path)
                    break
            print(' '.join(path))
def main(args):
    for passage in get_passages_with_progress_bar(args.passages):
        sentences = split2sentences(passage)
        for sen_no, sen in enumerate(sentences):
            print('sentence %d\n\n%s\n' % (sen_no, convert.to_text(sen)))
            root = sen.nodes['1.1']
            first = True
            tab_len = {0: len('1.1')}  # printed width reached at each depth
            pstr = ''
            for child in root.children:
                print('\n')
                level = 1
                path = [(child.ftag, child.ID, level, False)]
                path = find_children(child, path, level)
                end = False
                if first:
                    pstr = root.ID
                    first = False
                else:
                    pstr += ' ' * tab_len[0]
                for j in path:
                    if j == 'End':  # a branch is complete: flush the line
                        print(pstr)
                        pstr = ''
                        end = True
                        continue
                    rel, nd, tab, remote = j[0], j[1], int(j[2]), j[3]
                    if end:
                        # pad back to the parent's column, marking open branches with dots
                        q_mark = 0
                        for k in range(tab_len[tab - 1]):
                            if k == tab_len[q_mark]:
                                pstr += '.'
                                q_mark += 1
                            else:
                                pstr += ' '
                        end = False
                    rel_desc = rel + ':' + descr[rel] if rel in descr else rel
                    if remote:
                        pstr += '|-->Remote(' + rel_desc + ')-->' + nd
                    else:
                        pstr += '|-->(' + rel_desc + ')-->' + nd
                    tab_len[tab] = len(pstr)
            print('-----------------------------------\n')