def upload_passage(self, external_id, tokens):
    assert external_id, "Missing external ID for passage %s" % tokens
    assert tokens, "Empty passage %s" % external_id
    passage_out = self.create_passage(text=" ".join(tokens), external_id=external_id,
                                      type="PUBLIC", source=self.source)
    task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project,
                   user=self.user, passage=passage_out,
                   manager_comment="External ID: " + external_id,
                   user_comment="", parent=None, is_demo=False, is_active=True)
    tok_task_out = self.create_tokenization_task(**task_in)
    tok_user_task_in = dict(tok_task_out)
    passage = list(from_text(tokens, tokenized=True))[0]
    tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
    self.submit_tokenization_task(**tok_user_task_in)
    task_in = dict(type="ANNOTATION", status="NOT_STARTED", project=self.project,
                   user=self.annotation_user, passage=tok_task_out["passage"],
                   manager_comment="External ID: " + external_id,
                   user_comment="", parent=tok_task_out, is_demo=False, is_active=True)
    self.create_annotation_task(**task_in)
    print("Uploaded passage " + external_id + " successfully")
def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence, model_path):
    # Bypass the UCCA tokenizer: each sentence is normalized and pre-tokenized
    # by splitting on spaces, then wrapped in a passage with tokenized=True.
    text = [next(from_text(normalize_sentence(val).split(' '), passage_id=idx, tokenized=True))
            for idx, val in enumerate(text)]
    parser = get_parser(model_path)
    out_location = os.path.dirname(parse_location(output_dir, filename, 0))
    if not os.path.isdir(out_location):
        os.makedirs(out_location)
    for i, (passage, *_) in enumerate(parser.parse(text)):
        passage2file(passage, parse_location(output_dir, filename, i))
    # Create an empty marker file announcing that parsing finished successfully.
    parsed_file = os.path.join(out_location, PARSED_FILE)
    with open(parsed_file, "w") as _:
        pass
    if clean:
        for name in os.listdir(output_dir):
            if name.endswith(".txt"):
                os.remove(os.path.join(output_dir, name))
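# The passage2file call above serializes a passage to standard-format UCCA XML.
# A minimal round-trip sketch, assuming only the ucca package; the file name
# "io-demo.xml" is illustrative:
from ucca import convert
from ucca.ioutil import file2passage, passage2file

passage = next(convert.from_text("A short sentence .", passage_id="io-demo"))
passage2file(passage, "io-demo.xml")    # write standard-format XML
restored = file2passage("io-demo.xml")  # read it back as a Passage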
def parse_sentence(self, sentence):
    reg_stdout = sys.stdout
    sys.stdout = open(os.devnull, 'w', encoding='UTF-8')
    parsed_passage = None
    try:
        TupaParser.__passage_counter += 1
        passage_id = TupaParser.__passage_counter
        # from_text converts the sentence into a UCCA structure, and annotate_all
        # annotates it with information from the spaCy parse. annotate_all returns
        # a generator that yields only one object, hence the call to next().
        unparsed_passage = next(annotate_all(from_text(sentence, passage_id, one_per_line=True)))
        # The tupa.parse parser's parse method expects a list of unparsed passages. We also
        # need to set the 'evaluate' argument to True, otherwise we get incorrect results
        # (Ofir Arviv advised as such). The parse method also returns a generator, hence the
        # call to next(). The yielded object is a tuple of the parsed passage and an internal
        # score object; we are not interested in the score, so we extract only the passage.
        parsed_passage_and_score = next(self.__parser.parse([unparsed_passage], evaluate=True))
        internal_parsed_passage = parsed_passage_and_score[0]
        parsed_passage = TupaParser.__get_ucca_parsed_passage_from_passage(internal_parsed_passage)
    finally:
        sys.stdout = reg_stdout
    return parsed_passage
def ucca_parse_sentences(sentences, model_path='models/ucca-bilstm', model=None, lang='en', to_save=True):
    get_parsed_sent()
    sentences = [normalize_sentence(sentence, lang=lang) for sentence in sentences]
    to_parse = []
    # Check the preprocessed pickle cache to see whether any sentence still needs parsing.
    for i in range(len(sentences)):
        if sentences[i] in PARSED_SENT:
            sentences[i] = PARSED_SENT[sentences[i]]
        elif len(sentences[i].strip()) == 0:
            sentences[i] = NoSentence()
        else:
            to_parse.append((i, sentences[i]))
    if len(to_parse) > 0:
        print("Parsing", len(to_parse), "sentences.",
              len(sentences) - len(to_parse), "sentences already parsed.")
        parser = get_parser(model_path) if model is None else model
        ids, text = zip(*to_parse)
        text = list(from_text(text, split=True, one_per_line=True, lang=lang))
        for i, (passage, *_) in enumerate(parser.parse(text)):
            PARSED_SENT[sentences[ids[i]]] = passage
            sentences[ids[i]] = passage
    if to_save:
        save_parsed_sent()
    return sentences
def parse():
    text = request.values["input"]
    print("Parsing text: '%s'" % text)
    in_passage = next(from_text(text))
    out_passage = next(get_parser().parse(in_passage))[0]
    root = to_standard(out_passage)
    xml = tostring(root).decode()
    return Response(indent_xml(xml), headers={"Content-Type": "application/xml"})
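# A hedged client sketch for exercising the endpoint above; it assumes this
# view is served locally by a Flask app at route "/parse" (host, port, and
# route are illustrative, not taken from the source):
import requests

resp = requests.post("http://localhost:5000/parse",
                     data={"input": "John gave everything to Mary ."})
print(resp.text)  # indented standard-format UCCA XML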
def _init_passages(self, amrs):
    for lines, amr_id, tokens in amrs:
        assert tokens is not None, "Cannot convert AMR without input tokens: %s" % lines
        amr = parse(" ".join(lines), tokens=tokens)
        amr_id = amr_id or self.passage_id
        passage = next(convert.from_text(tokens, amr_id, tokenized=True))
        passage.extra["format"] = "amr"
        yield passage, amr, amr_id
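# The tokenized=True pattern used above, sketched standalone: when the input is
# already a list of tokens, from_text takes each element as one terminal instead
# of running its own tokenizer. The token list and passage ID are illustrative:
from ucca import convert

tokens = ["John", "gave", "everything", "to", "Mary", "."]
passage = next(convert.from_text(tokens, "demo-1", tokenized=True))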
def test_from_text(self):
    sample = ['Hello . again', 'nice', ' ? ! end', '']
    passage = next(convert.from_text(sample))
    terms = passage.layer(layer0.LAYER_ID).all
    pos = 0
    for i, par in enumerate(sample):
        for text in par.split():
            self.assertTrue(terms[pos].text == text and terms[pos].paragraph == i + 1)
            pos += 1
def test_from_text(self): sample = ["Hello . again", "nice", " ? ! end", ""] passage = next(convert.from_text(sample)) terms = passage.layer(layer0.LAYER_ID).all pos = 0 for i, par in enumerate(sample): for text in par.split(): self.assertEqual(terms[pos].text, text) self.assertEqual(terms[pos].paragraph, i + 1) pos += 1
def test_from_text(self):
    sample = ['Hello . again', 'nice', ' ? ! end', '']
    passage = next(convert.from_text(sample))
    terms = passage.layer(layer0.LAYER_ID).all
    pos = 0
    for i, par in enumerate(sample):
        for text in par.split():
            self.assertTrue(terms[pos].text == text and terms[pos].paragraph == i + 1)
            pos += 1
def test_from_text():
    sample = ["Hello . again", "nice", " ? ! end", ""]
    passage = next(convert.from_text(sample))
    terms = passage.layer(layer0.LAYER_ID).all
    pos = 0
    for i, par in enumerate(sample):
        for text in par.split():
            assert terms[pos].text == text
            assert terms[pos].paragraph == i + 1
            pos += 1
def test_from_text(self): sample = ["Hello . again", "nice", " ? ! end", ""] passage = next(convert.from_text(sample)) terms = passage.layer(layer0.LAYER_ID).all pos = 0 for i, par in enumerate(sample): for text in par.split(): self.assertEqual(terms[pos].text, text) self.assertEqual(terms[pos].paragraph, i + 1) pos += 1
def _init_passages(self, graphs, **kwargs):
    for graph in graphs:
        if not graph.id:
            graph.id = self.passage_id
        passage = next(convert.from_text(graph.tokens, graph.id, tokenized=True))
        graph.format = kwargs.get("format") or graph.format
        if graph.format is None or graph.format == self.format:
            passage.extra["format"] = self.format
        yield passage, graph
def upload_streussel_passage_file(self, filenames, log=None, **kwargs):
    del kwargs
    log_h = open(log, "w", encoding="utf-8") if log else None
    with open(filenames) as f_all:
        for filename in f_all:
            passage_text = ""
            external_id = "None given"
            filename = filename.strip()
            with open(filename, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    elif line.startswith("#"):
                        fields = line.split()
                        if len(fields) != 4 or fields[1] != "sent_id":
                            print("FORMAT ERROR in " + filename, file=sys.stderr)
                        else:
                            external_id = fields[3].split("-")[1]
                    else:
                        passage_text = passage_text + " " + line
            passage_out = self.create_passage(text=passage_text.strip(), external_id=external_id,
                                              type="PUBLIC", source=self.source)
            task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project,
                           user=self.user, passage=passage_out,
                           manager_comment="External ID: " + external_id,
                           user_comment="", parent=None, is_demo=False, is_active=True)
            tok_task_out = self.create_task(**task_in)
            tok_user_task_in = dict(tok_task_out)
            passage = list(from_text(passage_text.split(), tokenized=True))[0]
            tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
            self.submit_task(**tok_user_task_in)
            print("Uploaded passage " + filename + " successfully.", file=sys.stderr)
            if log:
                print(filename.split(".")[-2], passage_out["id"], tok_task_out["id"],
                      file=log_h, sep="\t")
    if log:
        log_h.close()
def test_from_text_long(self):
    sample = """
        After graduation, John moved to New York City.

        He liked it there. He played tennis.
        And basketball.

        And he lived happily ever after.
        """
    passages = list(convert.from_text(sample))
    self.assertEqual(len(passages), 3, list(map(convert.to_text, passages)))
def test_from_text_long():
    sample = """
        After graduation, John moved to New York City.

        He liked it there. He played tennis.
        And basketball.

        And he lived happily ever after.
        """
    passages = list(convert.from_text(sample))
    assert len(passages) == 3, list(map(convert.to_text, passages))
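# The splitting behavior the two tests above rely on, sketched on its own:
# blank lines separate passages, while consecutive non-blank lines become
# paragraphs within a single passage.
from ucca import convert

sample = "First paragraph .\n\nSecond paragraph ."
passages = list(convert.from_text(sample))
assert len(passages) == 2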
def test_parser(config, model_type, formats, default_setting, text=True):
    filename = "test_files/models/%s_%s%s" % ("_".join(formats), model_type, default_setting.suffix())
    remove_existing(filename)
    config.update(default_setting.dict())
    scores = []
    params = []
    passages = list(map(load_passage, passage_files(*formats)))
    evaluate = ("amr" not in formats)
    for mode in "train", "load":
        print("-- %sing %s" % (mode, model_type))
        config.update(dict(classifier=model_type, copy_shared=None))
        p = Parser(model_files=filename, config=config)
        p.save_init = True
        list(p.train(passages if mode == "train" else None, dev=passages, test=True, iterations=2))
        assert p.model.is_finalized, "Model should be finalized after %sing" % mode
        assert not getattr(p.model.feature_extractor, "node_dropout", 0), \
            p.model.feature_extractor.node_dropout
        all_params = p.model.all_params()
        params.append(all_params)
        param1, param2 = [d.get("W") for d in (all_params, p.model.feature_extractor.params)]
        if param1 is not None and param2 and param2.init is not None \
                and not config.args.update_word_vectors:
            assert_allclose(param1, weight_decay(p.model) * param2.init, rtol=1e-6)
        text_results = results = list(p.parse(passages, evaluate=evaluate))
        if text:
            print("Converting to text and parsing...")
            text_results = list(p.parse(
                [p3 for p1 in passages
                 for p2 in convert.to_text(p1, sentences=False)
                 for p3 in convert.from_text(p2, p1.ID, extra_format=p1.extra.get("format"))]))
            assert len(results) == len(text_results)
        if evaluate:
            scores.append(Scores(tuple(zip(*results))[1]).average_f1())
            if text:
                for t, (r, s) in zip(text_results, results):
                    print("  %s F1=%.3f" % (r.ID, s.average_f1()))
        assert not list(p.parse(()))  # parsing nothing returns nothing
        print()
    assert_all_params_equal(*params)
    if evaluate:
        print("-- average f1: %.3f, %.3f\n" % tuple(scores))
        assert scores[0] == pytest.approx(scores[1], 0.1)
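# The text round-trip step from the middle of the test above, isolated as a
# sketch: to_text with sentences=False yields the passage's text paragraph by
# paragraph, and from_text rebuilds a passage from each string. The passage
# text and ID below are illustrative.
from ucca import convert

original = next(convert.from_text("He played tennis .", passage_id="rt"))
rebuilt = [next(convert.from_text(text, original.ID))
           for text in convert.to_text(original, sentences=False)]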
def _build_passage(self):
    assert self.tokens is not None, "Cannot convert AMR without input tokens"
    # amr = penman.decode(re.sub("~e\.[\d,]+", "", " ".join(self.lines)))
    amr = parse(" ".join(self.lines), tokens=self.tokens)
    passage = next(convert.from_text(self.tokens, self.amr_id or self.passage_id, tokenized=True))
    passage.extra["format"] = "amr"
    self.lines = []
    self.amr_id = self.tokens = None
    textutil.annotate(passage)
    l0 = passage.layer(layer0.LAYER_ID)
    l1 = passage.layer(layer1.LAYER_ID)
    self._build_layer1(amr, l1)
    self._build_layer0(self.align_nodes(amr), l1, l0)
    self._update_implicit(l1)
    self._update_labels(l1)
    # return (passage, penman.encode(amr), self.amr_id) if self.return_amr else passage
    return (passage, amr(alignments=False), self.amr_id) if self.return_amr else passage
def upload_passage(self, external_id, tokens):
    assert external_id, "Missing external ID for passage %s" % tokens
    assert tokens, "Empty passage %s" % external_id
    passage_out = self.create_passage(text=" ".join(tokens), external_id=external_id,
                                      type="PUBLIC", source=self.source)
    task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project,
                   user=self.user, passage=passage_out,
                   manager_comment="External ID: " + external_id,
                   user_comment="", parent=None, is_demo=False, is_active=True)
    tok_task_out = self.create_task(**task_in)
    tok_user_task_in = dict(tok_task_out)
    passage = list(from_text(tokens, tokenized=True))[0]
    tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
    self.submit_task(**tok_user_task_in)
    task_in = dict(type="ANNOTATION", status="NOT_STARTED", project=self.project,
                   user=self.annotation_user, passage=tok_task_out["passage"],
                   manager_comment="External ID: " + external_id,
                   user_comment=external_id, parent=tok_task_out, is_demo=False, is_active=True)
    self.create_task(**task_in)
    print("Uploaded passage " + external_id + " successfully")
def tokenize_and_upload(self, filename, log=None, lang=None, **kwargs):
    del kwargs
    log_h = open(log, "w", encoding="utf-8") if log else None
    prefix = os.path.splitext(os.path.basename(filename))[0].replace(" ", "_")
    with open(filename, encoding="utf-8") as f:
        for passage, text in from_text(f, passage_id=prefix, lang=lang, return_text=True):
            passage_out = self.create_passage(text=text, type="PUBLIC", source=self.source)
            task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project,
                           user=self.user, passage=passage_out, manager_comment=passage.ID,
                           user_comment="", parent=None, is_demo=False, is_active=True)
            tok_task_out = self.create_task(**task_in)
            tok_user_task_in = dict(tok_task_out)
            tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True))
            self.submit_task(**tok_user_task_in)
            task_in.update(parent=tok_task_out, type="ANNOTATION")
            ann_user_task_out = self.create_task(**task_in)
            print("Uploaded passage " + filename + " successfully.", file=sys.stderr)
            if log:
                print(passage.ID, passage_out["id"], tok_task_out["id"], ann_user_task_out["id"],
                      file=log_h, sep="\t", flush=True)
    if log:
        log_h.close()
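# The return_text=True flag used above makes from_text yield (passage, text)
# pairs instead of bare passages; a sketch with an in-memory list standing in
# for the open file handle:
from ucca import convert

lines = ["First sentence .", "Second sentence ."]
for passage, text in convert.from_text(lines, passage_id="doc", return_text=True):
    print(passage.ID, "<-", text)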
def _ucca_parse_text(text, output_dir, filename, clean, normalize_sentence):
    text = [normalize_sentence(x) for x in text]
    text = list(from_text(text, split=True, one_per_line=True))
    parser = get_parser()
    for i, passage in enumerate(parser.parse(text)):
        passage2file(passage, parse_location(output_dir, filename, i))
    # Create an empty marker file announcing that parsing finished successfully.
    parsed_file = os.path.join(os.path.dirname(parse_location(output_dir, filename, 0)), PARSED_FILE)
    with open(parsed_file, "w") as _:
        pass
    if clean:
        for name in os.listdir(output_dir):
            if name.endswith(".txt"):
                os.remove(os.path.join(output_dir, name))