def apply(args):
    """Run the selected extractor on every registered paper.

    A new layer tag is registered first; it is named ``args.name`` when that
    is set, otherwise ``"from <extractor name>"``. Each paper is then handed
    to ``process_paper`` either sequentially (``args.single_core``) or
    through a pool of 7 worker processes.

    Args:
        args: parsed command-line namespace; reads ``extractor``, ``name``
            and ``single_core`` and overwrites ``func`` with ``None``.
    """
    print("APPLY")
    tkb = TheoremKB()
    session = Session()

    paper_ids = [str(paper.id) for paper in tqdm(tkb.list_papers(session))]
    extractor = tkb.extractors[args.extractor]

    tag_id = shortuuid.uuid()
    tag_name = args.name if args.name else "from " + extractor.name
    extractor_meta = {"name": extractor.name, "desc": extractor.description}
    tkb.add_layer_tag(session, tag_id, tag_name, False,
                      {"extractor": extractor_meta})
    session.commit()
    session.flush()
    session.close()

    # NOTE(review): presumably cleared so that ``args`` can be shipped to the
    # worker processes -- confirm.
    args.func = None

    jobs = [(tag_id, paper_id, args, extractor) for paper_id in paper_ids]
    if args.single_core:
        for job in tqdm(jobs):
            process_paper(job)
    else:
        with Pool(7) as pool:
            pool.map(process_paper, jobs)
def summary(_):
    """Print an overview of the knowledge base to standard output.

    Four sections are printed in order: annotation classes with their
    labels, extractors (with a trained/untrained marker for trainable ones),
    the number of papers, and the layer tags with their per-entry counts.

    Args:
        _: ignored (kept so the function matches the command signature).
    """
    session = Session()
    tkb = TheoremKB()

    def heading(text):
        # Section titles are rendered in bold.
        return colored(text, attrs=["bold"])

    print(heading("# Layer:"))
    for class_ in tkb.classes.values():
        print("> ", class_.name, ": ", ",".join(class_.labels), sep="")
    print()

    print(heading("# Extractors:"))
    for name, ex in tkb.extractors.items():
        if not isinstance(ex, TrainableExtractor):
            print("> ", name, sep="")
            continue
        if ex.is_trained:
            # Leading space keeps both markers the same width.
            status = colored(" (trained)", "green")
        else:
            status = colored("(untrained)", "red")
        print("> {:28}".format(name), status, sep="")
    print()

    paper_count = len(tkb.list_papers(session))
    print(heading("# Papers:"), colored(paper_count, "green"))
    print()

    print(heading("# Tags:"))
    # Continuation rows are indented so they line up under the first count.
    continuation = " " * 31 + "| "
    for tag, counts in tkb.count_layer_tags(session).values():
        if tag.readonly:
            print(">", colored(f"{tag.name:28}", "yellow"), "| ", end="")
        else:
            print(f"> {tag.name:28}", "| ", end="")
        rows = [f"{n:12} -> {c:6}\n" for n, c in counts.items()]
        print(continuation.join(rows))
    print()
def process_paper(x: Tuple[str, str, str, Extractor]):
    """Apply an extractor to one paper and attach a tag to the new layer.

    The paper is skipped when it already owns a layer of the extractor's
    class carrying a tag named ``args.name``, or when it is the explicitly
    excluded paper ``"1709.05182"``.

    Failures of the extractor are reported on standard output (with their
    cause) and do not propagate; the pending transaction is rolled back.
    The session is closed on every exit path.

    Args:
        x: ``(tag_id, paper_id, args, extractor)`` packed in a single tuple
            (single-argument form, e.g. for a pool ``map``). ``args`` is used
            as an object with a ``name`` attribute, despite the ``str`` in
            the annotation.
    """
    (tag_id, paper_id, args, extractor) = x
    tkb = TheoremKB()
    session = Session()
    try:
        paper = tkb.get_paper(session, paper_id)
        class_name = extractor.class_.name

        for layer in paper.layers:
            already_tagged = any(tag.name == args.name for tag in layer.tags)
            if already_tagged and layer.class_ == class_name:
                print("skipped.", end="")
                return

        if paper.id in {"1709.05182"}:
            return

        print(">>", paper_id)
        tag = tkb.get_layer_tag(session, tag_id)
        try:
            new_layer = extractor.apply_and_save(paper, [], args)
            if class_name == "header":
                # Mark the title as stale after a new header layer.
                paper.title = "__undef__"
            new_layer.tags.append(tag)
            session.commit()
        except Exception as e:
            # Discard the half-finished transaction; report using the id we
            # were given so no ORM attribute is touched after the rollback.
            session.rollback()
            print(paper_id, "failed:", e)
    finally:
        session.close()
def process_paper(paper_id):
    """Refresh the title of one paper when it is flagged as undefined.

    A paper whose title equals ``"__undef__"`` has ``_refresh_title()``
    called on it; the session is then committed. The session is closed even
    if the lookup or the refresh raises.

    Args:
        paper_id: identifier passed to ``TheoremKB.get_paper``.
    """
    session = Session()
    try:
        tkb = TheoremKB()
        paper = tkb.get_paper(session, paper_id)
        if paper.title == "__undef__":
            paper._refresh_title()
        session.commit()
    finally:
        # Release the session on failure too, not only on success.
        session.close()
def test(args, test_layer=None):
    """Evaluate an extractor against annotated layers and print a report.

    For every paper, the first layer of the extractor's class that carries a
    tag named ``args.test_tag`` is used as ground truth (at most ``args.n``
    papers when ``args.n`` is set). The extractor is applied to each paper,
    labels are compared token by token, and a classification report over all
    tokens is printed.

    Args:
        args: parsed command-line namespace; reads ``extractor``,
            ``test_tag``, ``n``, ``single_core`` (and ``layer`` when
            ``test_layer`` is None) and overwrites ``func`` with ``None``.
        test_layer: optional layer selector; defaults to ``args.layer``.
    """
    print("TEST")
    tkb = TheoremKB()
    session = Session()

    # NOTE(review): ``test_layer`` is resolved here but never read afterwards;
    # the selection below relies on ``args.test_tag`` only -- confirm intent.
    if test_layer is None:
        test_layer = args.layer

    extractor = tkb.extractors[args.extractor]
    class_id = extractor.class_.name

    def is_reference(layer):
        return (any(tag.name == args.test_tag for tag in layer.tags)
                and layer.class_ == class_id)

    annotated_papers = []
    for paper in tkb.list_papers(session):
        # Keep only the first matching layer of each paper.
        reference = next((lyr for lyr in paper.layers if is_reference(lyr)),
                         None)
        if reference is not None:
            annotated_papers.append((paper, reference))

    if args.n is not None:
        annotated_papers = annotated_papers[:args.n]

    def compare_layers(paper, true, pred):
        # One (expected, predicted) label pair per ALTO ``String`` token.
        tokens = paper.get_xml().getroot().findall(f".//{ALTO}String")
        pairs = []
        for bbx in map(BBX.from_element, tokens):
            pairs.append((true.get_label(bbx), pred.get_label(bbx)))
        return [t for t, _ in pairs], [p for _, p in pairs]

    def test_paper(paper, layer, args):
        layer_pred = extractor.apply(paper, [], args)  # todo: parameters.
        layer_true = paper.get_annotation_layer(layer.id)
        return compare_layers(paper, layer_true, layer_pred)

    args.func = None
    if args.single_core:
        res = []
        for paper, layer in tqdm(annotated_papers):
            res.append(test_paper(paper, layer, args))
    else:
        run_parallel = Parallel(n_jobs=-1)
        res = run_parallel(
            delayed(test_paper)(paper, layer, args)
            for paper, layer in tqdm(annotated_papers))

    # Flatten the per-paper label lists into two corpus-wide lists.
    y = [label for y_paper, _ in res for label in y_paper]
    y_pred = [label for _, y_pred_paper in res for label in y_pred_paper]

    sorted_labels = sorted(extractor.class_.labels)
    report = metrics.classification_report(y, y_pred,
                                           labels=sorted_labels,
                                           digits=3)
    print(report)
def test_tkb_initialization(tmpdir):
    """Check that a fresh knowledge base creates its database and is empty.

    Args:
        tmpdir: directory assigned to ``config.DATA_PATH`` (presumably the
            pytest ``tmpdir`` fixture).
    """
    # Only attributes of ``config`` are set; the name itself is not rebound.
    config.DATA_PATH = tmpdir
    config.ENABLE_TENSORFLOW = True

    kb = TheoremKB()
    db_session = config.Session()

    database_file = f"{tmpdir}/tkb.sqlite"
    assert os.path.exists(database_file)

    layer_tags = kb.list_layer_tags(db_session)
    assert len(layer_tags) == 0

    papers = kb.list_papers(db_session)
    assert len(papers) == 0
def features(args):
    """Force a rebuild of the features of every registered paper.

    Papers are processed through joblib with ``n_jobs=-1``; a failure on one
    paper is printed together with its cause and does not stop the others.

    Args:
        args: parsed command-line namespace (not read by this command).
    """
    session = Session()
    tkb = TheoremKB()

    def process_paper(paper):
        try:
            paper._build_features(force=True)
        except Exception as e:
            print(paper.id, "failed:", e)

    run_parallel = Parallel(n_jobs=-1)
    tasks = (delayed(process_paper)(paper)
             for paper in tqdm(tkb.list_papers(session)))
    run_parallel(tasks)
def bench(args):
    """Time one run of an extractor on a single paper and print the result.

    The measured interval covers loading the paper and calling
    ``apply_and_save`` on it. The elapsed time is printed in seconds with
    four decimals.

    Args:
        args: parsed command-line namespace; reads ``extractor`` and
            ``paper``.
    """
    print("BENCH")
    session = Session()
    tkb = TheoremKB()
    extractor = tkb.extractors[args.extractor]

    # perf_counter is monotonic, so the measurement is not affected by
    # system clock adjustments.
    start = time.perf_counter()
    paper = tkb.get_paper(session, args.paper)
    # NOTE(review): called with a string as second argument; confirm this
    # matches the extractor's ``apply_and_save`` signature.
    extractor.apply_and_save(paper, "bench")
    elapsed = time.perf_counter() - start

    # ".4f" is a precision; the former "4f" only set a minimum field width.
    print("Result: {:.4f}".format(elapsed))
def cleanup(_):
    """Delete every layer tag that has no layers and is not read-only.

    The number of deleted tags is printed, then the session is committed.

    Args:
        _: ignored (kept so the function matches the command signature).
    """
    session = Session()
    tkb = TheoremKB()

    removed = 0
    for tag in tkb.list_layer_tags(session):
        # Tags still in use, and read-only tags, are left alone.
        if len(tag.layers) != 0 or tag.readonly:
            continue
        session.delete(tag)
        removed += 1

    print(f"Removed {removed} tags.")
    session.commit()
def remove_tag(args):
    """Delete every layer tag whose name equals ``args.tag``.

    The deletions are committed, then the number of removed tags is printed.

    Args:
        args: parsed command-line namespace; reads ``tag``.
    """
    print("REMOVE")
    tkb = TheoremKB()
    session = Session()

    matching = [tag for tag in tkb.list_layer_tags(session)
                if tag.name == args.tag]
    for tag in matching:
        session.delete(tag)
    session.commit()

    count = len(matching)
    print(f"Removed {count} tags.")
def title(args):
    """Refresh the title of every paper whose title is ``"__undef__"``.

    Paper ids are collected first and the listing session is closed; each
    paper is then handled in its own session through joblib (``n_jobs=1``).
    Sessions are closed even when an operation raises.

    Args:
        args: parsed command-line namespace (not read by this command).
    """
    session = Session()
    tkb = TheoremKB()

    def process_paper(paper_id):
        # Each task opens and releases its own session.
        session = Session()
        try:
            tkb = TheoremKB()
            paper = tkb.get_paper(session, paper_id)
            if paper.title == "__undef__":
                paper._refresh_title()
            session.commit()
        finally:
            session.close()

    try:
        paper_ids = [paper.id for paper in tkb.list_papers(session)]
    finally:
        session.close()

    Parallel(n_jobs=1)(delayed(process_paper)(paper_id)
                       for paper_id in tqdm(paper_ids))
def train(args):
    """Train an extractor on tagged layers, then report its scores.

    For every paper, the first layer of the extractor's class carrying a tag
    named ``args.train_tag`` is used for training. When ``args.val_layer`` is
    set, layers tagged ``args.val_tag`` are selected the same way (they are
    only counted in the summary line here). Nothing is done when no training
    layer is found or when the extractor is not trainable.

    Args:
        args: parsed command-line namespace; reads ``extractor``,
            ``train_tag``, ``val_tag``, ``val_layer`` and ``layer``.
    """
    print("TRAIN")
    tkb = TheoremKB()
    session = Session()
    extractor = tkb.extractors[args.extractor]
    class_id = extractor.class_.name
    papers = tkb.list_papers(session)

    def select_layers(tag_name):
        # Pair each paper with its first layer of the right class that
        # carries a tag named ``tag_name``.
        selected = []
        for paper in papers:
            for layer in paper.layers:
                # TODO: select the most recent layer.
                has_tag = any(tag.name == tag_name for tag in layer.tags)
                if has_tag and layer.class_ == class_id:
                    selected.append((paper, layer))
                    break
        return selected

    annotated_papers_train = select_layers(args.train_tag)

    # NOTE(review): gated on ``val_layer`` but filtered on ``val_tag`` --
    # confirm both attributes are meant to be involved.
    annotated_papers_test = []
    if args.val_layer is not None:
        annotated_papers_test = select_layers(args.val_tag)

    if len(annotated_papers_train) == 0:
        print("No training layer found using this tag.")
        return

    print(
        f"Training data: {len(annotated_papers_train)} (train)/ {len(annotated_papers_test)} (test)"
    )

    if not isinstance(extractor, TrainableExtractor):
        print("The chosen extractor is not trainable.")
        # Nothing was trained: do not announce success or run evaluations.
        return

    extractor.train(annotated_papers_train, args)
    print("Trained! Testing..")
    print("Train results:")
    test(args, args.layer)
    print("Test results:")
    test(args, args.val_layer)
def register(args):
    """Register every PDF found under ``args.path`` as a paper.

    The directory tree is walked recursively. Files whose name ends with
    ``.pdf`` (case-insensitive) are added with the file name minus its last
    four characters as identifier and the absolute file path as location.
    The session is committed once at the end and the total is printed.

    Args:
        args: parsed command-line namespace; reads ``path``.
    """
    print("REGISTER")
    tkb = TheoremKB()
    session = Session()

    added_papers = 0
    for dirpath, _, filenames in tqdm(os.walk(args.path)):
        pdf_names = [name for name in filenames
                     if name.lower().endswith(".pdf")]
        for paper_pdf in pdf_names:
            identifier = paper_pdf[:-4]  # strip the 4-character extension
            location = os.path.abspath(dirpath) + "/" + paper_pdf
            tkb.add_paper(session, identifier, location)
            added_papers += 1

    session.commit()
    print("Added", added_papers, "papers!")
def tkb(tmpdir):
    """Build a small knowledge base for tests and return it with a session.

    Two papers ("0" and "1") pointing at the same dummy PDF are registered,
    one layer tag is created, and a "segmentation" annotation layer carrying
    that tag is added to the first paper. Presumably used as a pytest
    fixture (no decorator is visible here).

    Args:
        tmpdir: directory assigned to ``config.DATA_PATH``.

    Returns:
        A ``(knowledge_base, session)`` tuple.
    """
    config.DATA_PATH = tmpdir
    config.ENABLE_TENSORFLOW = False

    kb = TheoremKB()
    db_session = config.Session()

    here = os.path.dirname(__file__)

    first = kb.add_paper(db_session, "0",
                         os.path.join(here, "../assets/dummy.pdf"))
    first.title = "Dummy"

    second = kb.add_paper(db_session, "1",
                          os.path.join(here, "../assets/dummy.pdf"))
    second.title = "another dummy paper"

    layer_tag = kb.add_layer_tag(db_session, "0", "tag", False, {})
    segmentation = first.add_annotation_layer("segmentation")
    segmentation.tags.append(layer_tag)

    db_session.commit()
    return kb, db_session
def split(args):
    """Split the layers of a tag into train / validation / test tags.

    For every tag named ``args.tag``, its layers are partitioned with
    ``train_test_split`` and each non-empty part is attached to a newly
    created tag named ``"<tag> (train)"``, ``"<tag> (val)"`` or
    ``"<tag> (test)"``. Nothing is done when both ``args.test`` and
    ``args.validation`` are zero.

    Args:
        args: parsed command-line namespace; reads ``tag``, ``test`` and
            ``validation`` (the latter two are passed to ``train_test_split``
            as sizes of the held-out parts).
    """
    print("SPLIT")
    tkb = TheoremKB()
    session = Session()

    holdout = args.test + args.validation
    if holdout == 0:
        print("No split to do (test == 0 && validation == 0)")
        return

    tags = tkb.list_layer_tags(session)
    for tag in filter(lambda x: x.name == args.tag, tags):
        if holdout > 0 and len(tag.layers) > 0:
            layers_train, layers_holdout = train_test_split(
                tag.layers, test_size=holdout)
            if args.validation > 0 and args.test > 0:
                layers_val, layers_test = train_test_split(
                    layers_holdout, test_size=args.test / holdout)
            elif args.validation > 0:
                # Validation only: a second split would ask for an empty
                # test part, which train_test_split rejects.
                layers_val, layers_test = layers_holdout, []
            else:
                layers_val, layers_test = [], layers_holdout
        else:
            layers_train = tag.layers
            layers_test, layers_val = [], []

        parts = [("train", layers_train), ("val", layers_val),
                 ("test", layers_test)]
        for name, layers in parts:
            if len(layers) == 0:
                # skip if no layers are needed.
                continue
            new_tag_id = shortuuid.uuid()
            tag_db = tkb.add_layer_tag(session, new_tag_id,
                                       f"{tag.name} ({name})", False, {})
            for layer in layers:
                layer.tags.append(tag_db)

    session.commit()
# Standalone script: register every PDF found under a directory as a paper.
#
# Usage: pass the directory to scan as first command-line argument; when no
# argument is given, the historical default directory is scanned.
import sys
import os

# Make the project's ``lib`` package importable before importing from it.
sys.path.append(os.path.dirname(__file__) + "/../src/")

from sqlalchemy.orm import Session
from sqlalchemy.orm import scoped_session
from sqlalchemy.orm import sessionmaker
from tqdm import tqdm

from lib.tkb import TheoremKB
from lib.config import config

session_factory = sessionmaker(bind=config.SQL_ENGINE)
# Rebinds ``Session`` (imported above) to the scoped session registry.
Session = scoped_session(session_factory)

tkb = TheoremKB()
session = Session()

added_papers = 0
path = sys.argv[1] if len(sys.argv) > 1 else "/home/lucas/Downloads/pdf"

for dirpath, _, filenames in tqdm(os.walk(path)):
    for paper_pdf in filenames:
        # Only files ending in ".pdf" (case-insensitive) are registered.
        if not paper_pdf.lower().endswith(".pdf"):
            continue
        base_name = paper_pdf[:-4]
        pdf_dir = os.path.abspath(dirpath) + "/" + paper_pdf
        tkb.add_paper(session, base_name, pdf_dir)
        added_papers += 1

session.commit()
print("Added", added_papers, "papers!")
[f"{n:12} -> {c:6}\n" for n, c in counts.items()])) print() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.set_defaults(func=summary) subparsers = parser.add_subparsers() # train parser_train = subparsers.add_parser("train") subparsers_train = parser_train.add_subparsers(dest="extractor") subparsers_train.required = True for extractor_name, extractor in TheoremKB().extractors.items(): if isinstance(extractor, TrainableExtractor): parser_extractor = subparsers_train.add_parser(extractor_name) extractor.add_args(parser_extractor) extractor.add_train_args(parser_extractor) parser_train.add_argument("train-tag", type=str, help="Take all layers that have given tag.") parser_train.add_argument("-v", "--val-tag", type=str, default=None, help="Use this tag for validation.") parser_train.add_argument("-n", type=int, default=None)
def info(args):
    """Print the name and description of an extractor, then its own info.

    Args:
        args: parsed command-line namespace; reads ``extractor``.
    """
    extractor_name = args.extractor
    extractor = TheoremKB().extractors[extractor_name]

    print(extractor_name, ":")
    print(extractor.description)
    # The extractor prints any further details itself.
    extractor.info()
# Standalone script: show the feature columns and the first feature row
# computed for the ALTO text blocks of the first registered paper.
import sys
import os

# Make the project's ``lib`` package importable before importing from it.
sys.path.append(os.path.dirname(__file__) + "/../src/")

from sqlalchemy.orm import Session
from sqlalchemy.orm import scoped_session
from sqlalchemy.orm import sessionmaker
from tqdm import tqdm

from lib.tkb import TheoremKB
from lib.config import config
from lib.misc.namespaces import ALTO

session_factory = sessionmaker(bind=config.SQL_ENGINE)
# Rebinds ``Session`` (imported above) to the scoped session registry.
Session = scoped_session(session_factory)

tkb = TheoremKB()
session = Session()

papers = tkb.list_papers(session)
if len(papers) == 0:
    # Fail with a readable message instead of an IndexError traceback.
    sys.exit("No paper registered: nothing to display.")

paper = papers[0]
features = paper.get_features(f"{ALTO}TextBlock",
                              standardize=False,
                              add_context=False)
print("All columns:", features.columns)
print("Sample line:", features.iloc[0])
annot.move_box(bbx_id, box) annot.save() resp.media = box.to_web(bbx_id, paper_id, layer_id) # refresh title. info = paper.get_annotation_info(layer_id) if info.class_ == "header": paper.title = "__undef__" session.commit() session.close() api = falcon.API() api.req_options.auto_parse_form_urlencoded = True tkb = TheoremKB() api.add_route("/classes/{class_id}", AnnotationClassResource(tkb)) api.add_route("/classes/{class_id}/extractors/{extractor_id}", AnnotationClassExtractorResource(tkb)) api.add_route("/tags/{tag_id}", LayerTagResource(tkb)) api.add_route("/papers/{paper_id}/layers/{layer_id}/tags/{tag_id}", LayerTagResource(tkb)) api.add_route("/papers/{paper_id}", PaperResource(tkb)) api.add_route("/papers/{paper_id}/pdf", PaperPDFResource(tkb)) api.add_route("/papers/{paper_id}/layers/{layer_id}", PaperAnnotationLayerResource(tkb)) api.add_route("/papers/{paper_id}/layers/{layer_id}/bbx/{bbx_id}", BoundingBoxResource(tkb))