def main(args):
    """Annotate passages from each spec and write them to the spec's output directory.

    Defaults to storing annotations in the "extra" dict when neither
    as_array nor as_extra was requested.
    """
    if not (args.as_array or args.as_extra):
        args.as_extra = True
    for spec in read_specs(args, converters=FROM_FORMAT_NO_PLACEHOLDERS):
        shared = dict(as_array=args.as_array, as_extra=args.as_extra, verbose=args.verbose, lang=spec.lang)
        passages = spec.passages
        # Pick the annotation source: CoNLL-U copy, UDPipe, or StanfordNLP.
        if spec.conllu:
            passages = copy_annotation(passages, spec.conllu, by_id=args.by_id, **shared)
        elif spec.udpipe:
            passages = annotate_udpipe(passages, spec.udpipe, **shared)
        elif spec.stanfordnlp:
            passages = annotate_stanfordnlp(passages, spec.stanfordnlp, **shared)
        if args.verbose:
            stream = passages
        else:
            stream = tqdm(passages, unit=" passages", desc="Annotating " + spec.out_dir)
        # Replace any existing annotation unless an external parser supplied it.
        replace = spec.conllu or not (spec.udpipe or spec.stanfordnlp)
        for passage in annotate_all(stream, replace=replace, **shared):
            if passage.extra.get("format") == "amr" and args.as_array:
                from semstr.conversion.amr import AmrConverter
                AmrConverter.introduce_placeholders(passage)
            write_passage(passage, outdir=spec.out_dir, verbose=args.verbose, binary=args.binary)
def finish(self, status, display=True, write=False, accuracies=None):
    """Finalize parsing of one passage: notify classifiers, then optionally create,
    write, verify and evaluate the output passage.

    :param status: short status string to include in the printed summary
    :param display: whether to print a one-line summary for this passage
    :param write: whether to write the output passage in the configured format(s)
    :param accuracies: optional dict to record this passage's action accuracy in, keyed by passage ID
    :return: tuple of (output passage,), extended with evaluation scores if evaluation was requested
    """
    self.model.classifier.finished_item(self.training)
    for model in self.models[1:]:
        model.classifier.finished_item(renew=False)  # So that dynet.renew_cg happens only once
    # Build the output passage when not training, or when verification is requested anyway.
    if not self.training or self.config.args.verify:
        self.out = self.state.create_passage(verify=self.config.args.verify, format=self.out_format)
    if write:
        for out_format in self.config.args.formats or [self.out_format]:
            if self.config.args.normalize and out_format == "ucca":
                normalize(self.out)
            ioutil.write_passage(self.out, output_format=out_format, binary=out_format == "pickle",
                                 outdir=self.config.args.outdir, prefix=self.config.args.prefix,
                                 converter=get_output_converter(out_format), verbose=self.config.args.verbose,
                                 append=self.config.args.join, basename=self.config.args.join)
    if self.oracle and self.config.args.verify:
        self.verify(self.out, self.passage)
    ret = (self.out,)
    if self.evaluation:
        ret += (self.evaluate(self.evaluation),)
        status = "%-14s %s F1=%.3f" % (status, self.eval_type, self.f1)
    if display:
        self.config.print("%s%.3fs %s" % (self.accuracy_str, self.duration, status), level=1)
    if accuracies is not None:
        # Fraction of correct transitions; 0 when no actions were taken at all.
        accuracies[self.passage.ID] = self.correct_action_count / self.action_count if self.action_count else 0
    return ret
def download_task(self, task_id, normalize=False, write=True, validate=None, binary=None, log=None, out_dir=None,
                  prefix=None, by_external_id=False, verbose=False, write_valid_only=False, **kwargs):
    """Fetch one annotation task and convert it to a passage, optionally
    normalizing, validating, logging and writing it to disk.

    Returns a (passage, task_id, user_id) triple.
    """
    del kwargs  # tolerate (and discard) extra keyword arguments
    task = self.get_user_task(task_id)
    user_id = task["user"]["id"]
    try:
        passage = from_json(task, by_external_id=by_external_id)
    except ValueError as err:
        raise ValueError("Failed reading json for task %s:\n%s" % (task_id, json.dumps(task))) from err
    if normalize:
        try:
            normalization.normalize(passage)
        except AssertionError as err:
            raise ValueError("Failed normalizing task %s:\n%s" % (task_id, json.dumps(task))) from err
    if log:
        print(passage.ID, task_id, user_id, task["user_comment"], task["created_at"], task["updated_at"],
              file=log, sep="\t", flush=True)
    result = passage, task_id, user_id
    if validate or write_valid_only:
        for error in validation.validate(passage, linkage=False):
            if validate:
                print(passage.ID, task_id, user_id, error, file=validate, sep="\t", flush=True)
            if write_valid_only:
                # Passage is invalid: bail out without writing it.
                return result
    if write:
        write_passage(passage, binary=binary, outdir=out_dir, prefix=prefix, verbose=verbose)
    return result
def main(args):
    """Download each passage listed in the input file and write it out as XML."""
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.filename, encoding="utf-8") as f:
        entries = [line.split() for line in f]
    if not args.verbose:
        entries = tqdm(entries, desc="Downloading", unit=" passages")
    for passage_id, id_field in entries:
        if args.verbose:
            with external_write_mode():
                print("Getting passage " + passage_id + " with " + args.method + "=" + id_field, end="\t")
        else:
            entries.set_postfix({"passage_id": passage_id, args.method: id_field})
        xml_root = get_by_method(id_field=id_field.split(","), passage_id=passage_id, **vars(args))
        if xml_root is None:
            continue
        if args.write_site:
            site_filename = passage_id + "_site_download.xml"
            with open(site_filename, "w", encoding="utf-8") as site_file:
                print(tostring(xml_root).decode(), file=site_file)
            if args.verbose:
                with external_write_mode():
                    print("Wrote '%s'" % site_filename)
        if args.write:
            write_passage(convert.from_site(xml_root), outdir=args.outdir, verbose=args.verbose)
def main(filename, input_filenames, outdir):
    """Rename passages according to a mapping file of "new_id old_id" lines."""
    os.makedirs(outdir, exist_ok=True)
    with open(filename, encoding="utf-8") as mapping_file:
        rows = [line.strip().split() for line in mapping_file]
    old_to_new = {old: new for new, old in rows}
    for passage in get_passages_with_progress_bar(input_filenames, desc="Renaming"):
        passage._ID = old_to_new[passage.ID]  # no public setter exists for the passage ID
        write_passage(passage, outdir=outdir, verbose=False)
def main(args):
    """Annotate every input passage, asserting each one ends up annotated."""
    passages = get_passages_with_progress_bar(args.filenames, desc="Annotating")
    for passage in annotate_all(passages, replace=True, as_array=args.as_array, verbose=args.verbose):
        assert is_annotated(passage, args.as_array), "Passage %s is not annotated" % passage.ID
        write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
def main(args):
    """Convert passages, logging each applied rule to a CSV report file."""
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as report:
        report_writer = csv.writer(report)
        report_writer.writerow(("rule", "passage", "terminal", "before", "after"))
        for passage in get_passages_with_progress_bar(args.passages, desc="Converting"):
            passage_lang = passage.attrib.get("lang", args.lang)
            convert_passage(passage, lang=passage_lang, report_writer=report_writer)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
            report.flush()  # keep the report current even if a later passage fails
    print("Wrote '%s'" % args.outfile)
def main(args):
    """Convert passages, logging each applied edge rule to a CSV report file."""
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as report:
        report_writer = csv.writer(report)
        report_writer.writerow(("rule", "passage", "edge", "before", "after"))
        for passage in get_passages_with_progress_bar(args.passages, desc="Converting"):
            convert_passage(passage, report_writer=report_writer)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
            report.flush()  # keep the report current even if a later passage fails
    print("Wrote '%s'" % args.outfile)
def main(args):
    """Repair tokenization of passages against a word list, then normalize and write the fixed ones."""
    os.makedirs(args.outdir, exist_ok=True)
    known_words = read_dict(args.words_set)
    with open(args.logfile, "w", newline="", encoding="utf-8") as log_file:
        log_writer = csv.writer(log_file)
        for passage in get_passages_with_progress_bar(args.filenames, "Fixing tokenization"):
            fixed = fix_tokenization(passage, known_words, lang=args.lang, cw=log_writer)
            if fixed is None:
                continue  # nothing was changed in this passage
            log_file.flush()
            normalize(fixed)
            write_passage(fixed, outdir=args.outdir, binary=args.binary, prefix=args.prefix, verbose=args.verbose)
def main(args):
    """Stamp every passage with its spec's language and write it back out."""
    for passages, out_dir, lang in read_specs(args):
        progress = tqdm(passages, unit=" passages", desc="Setting language in " + out_dir,
                        postfix={"lang": lang})
        for passage in progress:
            passage.attrib["lang"] = lang
            write_passage(passage, outdir=out_dir, verbose=False, binary=args.binary)
def main(args):
    """Annotate passages per spec, sourcing annotation from UDPipe or a CoNLL-U file when given."""
    for spec in read_specs(args, converters=FROM_FORMAT):
        if spec.udpipe:
            spec.passages = annotate_udpipe(spec.passages, spec.udpipe,
                                            as_array=args.as_array, verbose=args.verbose)
        elif spec.conllu:
            spec.passages = copy_annotation(spec.passages, spec.conllu,
                                            as_array=args.as_array, verbose=args.verbose)
        if args.verbose:
            stream = spec.passages
        else:
            stream = tqdm(spec.passages, unit=" passages", desc="Annotating " + spec.out_dir)
        # Keep UDPipe's output when present; otherwise replace any existing annotation.
        for passage in annotate_all(stream, as_array=args.as_array, replace=not spec.udpipe,
                                    lang=spec.lang, verbose=args.verbose):
            write_passage(passage, outdir=spec.out_dir, verbose=args.verbose, binary=args.binary)
def main(args):
    """Annotate every passage in each spec and write it to the spec's output directory."""
    for passages, out_dir, lang in read_specs(args):
        if args.verbose:
            stream = passages
        else:
            stream = tqdm(passages, unit=" passages", desc="Annotating " + out_dir)
        for passage in annotate_all(stream, as_array=args.as_array, replace=True,
                                    lang=lang, verbose=args.verbose):
            write_passage(passage, outdir=out_dir, verbose=args.verbose, binary=args.binary)
def main(args):
    """Normalize each input passage in place and write it out."""
    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)
    for passage in get_passages_with_progress_bar(args.filenames, desc="Normalizing", converters={}):
        normalize(passage, extra=args.extra)
        write_passage(passage, outdir=args.outdir, prefix=args.prefix, binary=args.binary, verbose=False)
def train_test(train_passages, dev_passages, test_passages, args, model_suffix=""):
    """
    Train and test parser on given passages
    :param train_passages: passages to train on
    :param dev_passages: passages to evaluate on every iteration
    :param test_passages: passages to test on after training
    :param args: command-line arguments namespace
    :param model_suffix: string to append to model filename before file extension
    :return: generator of Scores objects: dev scores for each training iteration (if given dev),
             and finally test scores
    """
    model_base, model_ext = os.path.splitext(
        args.model or "%s_%s" % (args.format or "ucca", args.classifier))
    p = Parser(model_file=model_base + model_suffix + model_ext, model_type=args.classifier, beam=args.beam)
    print("%s %s" % (os.path.basename(__file__), Config()))
    # Training yields per-iteration dev scores; drop empty results.
    yield from filter(
        None, p.train(train_passages, dev=dev_passages, iterations=args.iterations))
    if test_passages:
        if args.train or args.folds:
            print("Evaluating on test passages")
        passage_scores = []
        # Evaluate when explicitly requested, or whenever we trained (gold is then available).
        evaluate = args.evaluate or train_passages
        for result in p.parse(test_passages, evaluate=evaluate):
            if evaluate:
                guessed_passage, score = result
                passage_scores.append(score)
            else:
                guessed_passage = result
                print()
            if guessed_passage is not None and args.write:
                ioutil.write_passage(guessed_passage, output_format=args.output_format,
                                     binary=args.output_format == "pickle",
                                     outdir=args.outdir, prefix=args.prefix,
                                     converter=TO_FORMAT.get(
                                         args.output_format, Config().output_converter or to_text))
        if passage_scores:
            scores = Scores(passage_scores)
            # Print aggregates unless verbose single-passage output already covered it.
            if args.verbose <= 1 or len(passage_scores) > 1:
                print("\nAverage labeled F1 score on test: %.3f" % scores.average_f1())
                print("Aggregated scores:")
                scores.print()
            print_scores(scores, args.testscores)
            yield scores
def main(args):
    """Annotate and convert passages, logging rule applications to a CSV report."""
    textutil.BATCH_SIZE = 1  # process one passage at a time when annotating
    os.makedirs(args.outdir, exist_ok=True)
    with open(args.outfile, "w", encoding="utf-8", newline="") as report:
        report_writer = csv.writer(report)
        report_writer.writerow(("rule", "passage", "terminal", "pos", "before", "after"))
        annotated = annotate_all(get_passages_with_progress_bar(args.passages, desc="Converting"),
                                 verbose=args.verbose)
        for passage in annotated:
            convert_passage(passage, report_writer=report_writer)
            write_passage(passage, outdir=args.outdir, prefix=args.prefix, verbose=args.verbose)
            report.flush()  # keep the report current even if a later passage fails
    print("Wrote '%s'" % args.outfile)
def main(args):
    """Convert pickled site-format files to passages, reporting all failures at the end."""
    os.makedirs(args.out_dir, exist_ok=True)
    failures = []
    for pattern in args.filenames:
        for filename in sorted(glob(pattern)) or [pattern]:
            print("Reading '%s'..." % filename)
            try:
                passage = pickle_site2passage(filename)
                write_passage(passage, outdir=args.out_dir, binary=args.binary,
                              basename=os.path.basename(filename))
            except ValueError as e:
                failures.append((filename, e))  # collect and keep going
    for filename, e in failures:
        print("'%s': %s" % (filename, e))
def main(args):
    """Build one passage per input line, with one terminal per whitespace token."""
    for index, line in enumerate(tqdm(gen_lines(args.filenames), unit=" lines", desc="Creating passages"),
                                 start=1):
        passage = core.Passage(args.format % index)
        terminals = layer0.Layer0(passage)
        layer1.Layer1(passage)  # the foundational layer is created empty
        for token in line.split():
            terminals.add_terminal(text=token, punct=PUNCTUATION.issuperset(token))
        write_passage(passage, outdir=args.out_dir, binary=args.binary, verbose=False)
def train_test(train_passages, dev_passages, test_passages, args, model_suffix=""):
    """
    Train and test parser on given passages
    :param train_passages: passages to train on
    :param dev_passages: passages to evaluate on every iteration
    :param test_passages: passages to test on after training
    :param args: command-line arguments namespace
    :param model_suffix: string to append to model filename before file extension
    :return: pair of (test scores, list of dev scores per iteration) where each one is a Scores object
    """
    test_scores = None
    model_base, model_ext = os.path.splitext(args.model or "ucca_" + args.classifier)
    p = Parser(model_file=model_base + model_suffix + model_ext, model_type=args.classifier, beam=args.beam)
    p.train(train_passages, dev=dev_passages, iterations=args.iterations)
    if test_passages:
        if args.train or args.folds:
            print("Evaluating on test passages")
        passage_scores = []
        # Evaluate when explicitly requested, or whenever we trained (gold is then available).
        evaluate = args.evaluate or train_passages
        for result in p.parse(test_passages, evaluate=evaluate):
            if evaluate:
                guessed_passage, score = result
                passage_scores.append(score)
            else:
                guessed_passage = result
                print()
            if guessed_passage is not None and not args.no_write:
                ioutil.write_passage(guessed_passage, output_format=args.format, binary=args.binary,
                                     outdir=args.outdir, prefix=args.prefix)
        # Aggregate and report test scores, unless verbose single-passage output already covered it.
        if passage_scores and (not args.verbose or len(passage_scores) > 1):
            test_scores = evaluation.Scores.aggregate(passage_scores)
            print("\nAverage labeled F1 score on test: %.3f" % test_scores.average_f1())
            print("Aggregated scores:")
            test_scores.print()
            if Config().args.testscores:
                # Append a CSV line of the aggregated scores to the running scores file.
                with open(Config().args.testscores, "a") as f:
                    print(",".join(test_scores.fields()), file=f)
    return test_scores, p.dev_scores
def download_task(self, task_id, write=True, out_format=None, binary=None, out_dir=None, prefix=None, **kwargs):
    """Fetch one task as a passage and optionally write it in the requested format."""
    del kwargs  # tolerate (and discard) extra keyword arguments
    task = self.get_user_task(task_id)
    passage = from_json(task, all_categories=self.layer["categories"])
    if write:
        converter = TO_FORMAT.get(out_format)
        write_passage(passage, out_format, binary, out_dir, prefix, converter)
    return passage
def main(args):
    """Convert pickled site-format files to passages and write them out.

    Conversion failures are collected and reported together at the end
    instead of aborting on the first bad file.
    """
    os.makedirs(args.out_dir, exist_ok=True)
    exceptions = []
    for pattern in args.filenames:
        # Sort for a deterministic processing order — glob() returns files in
        # arbitrary order, and the sibling conversion script already sorts.
        for filename in sorted(glob(pattern)) or [pattern]:
            print("Reading '%s'..." % filename)
            try:
                passage = pickle_site2passage(filename)
                write_passage(passage, outdir=args.out_dir, binary=args.binary,
                              basename=os.path.basename(filename))
            except ValueError as e:
                exceptions.append((filename, e))
    if exceptions:
        for filename, e in exceptions:
            print("'%s': %s" % (filename, e))
def main(args):
    """Convert site-format XML files (or DB-stored passages) to passages and write them."""
    os.makedirs(args.out_dir, exist_ok=True)
    if args.filenames:
        items = ((filename, site2passage(filename))
                 for pattern in args.filenames
                 for filename in sorted(glob(pattern)) or [pattern])
    else:
        # One fresh connection/cursor per passage ID, as before.
        items = ((pid, db2passage(sqlite3.connect(args.db).cursor(), pid, args.user))
                 for pid in args.pids)
    for filename, passage in items:
        write_passage(passage, outdir=args.out_dir, binary=args.binary)
def main(args):
    """Run annotation over all input passages, checking the result before writing."""
    stream = get_passages_with_progress_bar(args.filenames, desc="Annotating")
    for passage in annotate_all(stream, replace=True, as_array=args.as_array, verbose=args.verbose):
        assert is_annotated(passage, args.as_array), "Passage %s is not annotated" % passage.ID
        write_passage(passage, outdir=args.out_dir, verbose=args.verbose)
def get_validation_accuracy(val_text_tensor, model, a_model, label_model, s_model, rm_model, rm_lstm_model,
                            val_text, val_passages, val_pos, val_pos_tensor, labels, label2index, val_ent,
                            val_ent_tensor, val_case_tensor, unroll, eval_type="unlabeled", testing=False,
                            testing_phase=False):
    """Run the model over the validation set and compute F1 scores.

    Accumulates (matched, guessed, gold) counts over all sentences, for both
    labeled and unlabeled evaluation, with separate tallies for remote edges.
    In testing_phase mode, predictions are only written to "pred_test/" and
    dummy scores are returned.

    NOTE(review): the per-sentence inputs are parallel sequences zipped
    together — presumably tensors/lists aligned by sentence index; confirm
    shapes against the caller.

    :return: (labeled_f1, unlabeled_f1, labeled_f1_remote, unlabeled_f1_remote),
             or (100, 100, 100, 100) when testing_phase is set
    """
    # Running (matched, guessed, gold) count triples.
    total_labeled = (0, 0, 0)
    total_unlabeled = (0, 0, 0)
    total_labeled_remote = (0, 0, 0)
    total_unlabeled_remote = (0, 0, 0)
    # Counts down (via increment check below) how many predictions to dump for inspection.
    top_10_to_writeout = 10
    # Debug knobs for evaluating only a slice of the validation sentences.
    debugging_remote = 0
    debugging_remote_min = 0
    debugging_remote_max = -1
    if debugging_remote_max > -1 or debugging_remote_min > 0:
        print("WARNING: Only test on part of sents")
    for sent_tensor, ori_sent, tgt_passage, pos, pos_tensor, ent, ent_tensor, case_tensor in \
            zip(val_text_tensor, val_text, val_passages, val_pos, val_pos_tensor, val_ent, val_ent_tensor,
                val_case_tensor):
        debugging_remote += 1
        if debugging_remote < debugging_remote_min:
            continue
        if debugging_remote == debugging_remote_max:
            break
        # Inference only: no gradients needed.
        with torch.no_grad():
            pred_passage = evaluate_with_label(sent_tensor, model, a_model, label_model, s_model, rm_model,
                                               rm_lstm_model, ori_sent, tgt_passage, pos, pos_tensor, labels,
                                               label2index, ent, ent_tensor, case_tensor, unroll)
        if testing_phase:
            ioutil.write_passage(pred_passage, outdir="pred_test/")
        else:
            labeled, unlabeled, labeled_remote, unlabeled_remote = get_score(pred_passage, tgt_passage,
                                                                             testing, eval_type)
            # Element-wise add each (matched, guessed, gold) triple into the totals.
            total_labeled = tuple(map(operator.add, total_labeled, labeled))
            total_unlabeled = tuple(map(operator.add, total_unlabeled, unlabeled))
            total_labeled_remote = tuple(map(operator.add, total_labeled_remote, labeled_remote))
            total_unlabeled_remote = tuple(map(operator.add, total_unlabeled_remote, unlabeled_remote))
            if top_10_to_writeout < 10:
                ioutil.write_passage(pred_passage)
                top_10_to_writeout += 1
    if testing_phase:
        return 100, 100, 100, 100
    labeled_f1 = calculate_f1(total_labeled[0], total_labeled[1], total_labeled[2])
    unlabeled_f1 = calculate_f1(total_unlabeled[0], total_unlabeled[1], total_unlabeled[2])
    labeled_f1_remote = calculate_f1(total_labeled_remote[0], total_labeled_remote[1],
                                     total_labeled_remote[2])
    unlabeled_f1_remote = calculate_f1(total_unlabeled_remote[0], total_unlabeled_remote[1],
                                       total_unlabeled_remote[2])
    return labeled_f1, unlabeled_f1, labeled_f1_remote, unlabeled_f1_remote
def main(args):
    """Set each passage's language attribute from its spec, then write it out."""
    for spec in read_specs(args, converters=FROM_FORMAT):
        progress = tqdm(spec.passages, unit=" passages", desc="Setting language in " + spec.out_dir,
                        postfix={"lang": spec.lang})
        for passage in progress:
            passage.attrib["lang"] = spec.lang
            write_passage(passage, outdir=spec.out_dir, verbose=False, binary=args.binary)