def test_run_pipeline_locally_3_langs_with_comments():
    """Smoke-test the local preprocessing pipeline with three languages.

    Calls ``copy_and_clean_folder()`` first, then runs ``preprocess`` in
    local mode with the module-level ``root``, ``lang1``, ``lang2``,
    ``lang3`` and ``keep_comments`` values, a test size of 10 and
    ``size_gb=0``.
    """
    copy_and_clean_folder()
    preprocess(
        root,
        lang1,
        lang2,
        keep_comments,
        local=True,
        lang3=lang3,
        test_size=10,
        size_gb=0,
    )
def generate_batches(image_list, out_dir, ext='.bmp', batch_size=1000):
    """Preprocess images batch by batch and yield each batch as an array.

    Parameters
    ----------
    image_list: sequence of str
        Paths of the images to preprocess; sliced into consecutive chunks
        of at most `batch_size` items.
    out_dir: str
        Parent directory; outputs go to the sub-directory returned by
        ``make_sub_dir(out_dir, 'preprocessed')``.
    ext: str, optional
        Extension appended to each image's base name to build the output
        path handed to ``preprocess``. (default: '.bmp')
    batch_size: int, optional
        Maximum number of images per yielded batch. (default: 1000)

    Yields
    ------
    tuple
        ``(preprocessed_arr, img_names)``: the stacked results of
        ``preprocess`` with axes reordered by ``(0, 3, 1, 2)``, and the
        matching image base names (without extension).
    """
    config_path = abspath(
        join(dirname(__file__), 'config', 'preprocessing.yaml'))
    # safe_load instead of a bare yaml.load: the latter needs an explicit
    # Loader on current PyYAML and can construct arbitrary objects. The
    # context manager also closes the config file deterministically.
    with open(config_path) as config_file:
        prep_conf = yaml.safe_load(config_file)['pipeline']

    prep_dir = make_sub_dir(out_dir, 'preprocessed')

    for start in range(0, len(image_list), batch_size):
        batch = image_list[start:start + batch_size]
        img_names, preprocessed_arr = [], []
        for seg_image in batch:
            img_name = splitext(basename(seg_image))[0]
            img_names.append(img_name)
            prep_out = join(prep_dir, img_name + ext)
            preprocessed_arr.append(preprocess(seg_image, prep_out, prep_conf))

        # Reorder axes to samples, channels, height, width.
        # NOTE(review): assumes preprocess returns (height, width, channels)
        # arrays of identical shape - confirm against preprocess.
        preprocessed_arr = np.asarray(preprocessed_arr).transpose((0, 3, 1, 2))
        yield preprocessed_arr, img_names
def process_document(im, config, model, result_mapper):
    """Preprocess a form image, parse its structure and recognize its content.

    The image is passed through ``preprocess(im, config)``, then through
    ``FormStructureParser(config).process_form``; the parsed form data is
    finally handed to ``recognize`` together with `model` and
    `result_mapper`, and that result is returned.
    """
    parser = FormStructureParser(config)
    prepared_image = preprocess(im, config)
    parsed_form = parser.process_form(prepared_image)
    return recognize(parsed_form, model, result_mapper)
def train(
    model: nn.Module,
    path: str = "data/ptb",
    batch_size: int = 32,
    epochs: int = 60,
    loggr: bool = False,
    name: str = "imle_net",
) -> None:
    """Data preprocessing and training of the model.

    Weights are checkpointed to ``checkpoints/{name}_weights.pt`` under the
    current working directory whenever the validation score improves after
    the first six epochs. Logs are dumped via ``dump_logs`` after every
    epoch.

    Parameters
    ----------
    model: nn.Module
        Model to be trained.
    path: str, optional
        Path to the directory containing the data. (default: 'data/ptb')
    batch_size: int, optional
        Batch size. (default: 32)
    epochs: int, optional
        Number of epochs. (default: 60)
    loggr: bool, optional
        To log wandb metrics. (default: False)
    name: str, optional
        Name of the model. (default: 'imle_net')

    """
    X_train_scale, y_train, _, _, X_val_scale, y_val = preprocess(path=path)
    train_gen = DataGen(X_train_scale, y_train, batch_size=batch_size)
    val_gen = DataGen(X_val_scale, y_val, batch_size=batch_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_func = torch.nn.BCEWithLogitsLoss()

    # torch.save does not create missing parent directories.
    checkpoint_dir = os.path.join(os.getcwd(), "checkpoints/")
    os.makedirs(checkpoint_dir, exist_ok=True)
    save_path = os.path.join(checkpoint_dir, f"{name}_weights.pt")

    best_score = 0.0
    for epoch in range(epochs):
        train_results = train_epoch(model, optimizer, loss_func, train_gen, epoch, device, loggr=loggr)
        test_results = test_epoch(model, loss_func, val_gen, epoch, device, loggr=loggr)

        # NOTE(review): test_results[2] is presumably a higher-is-better
        # validation score - confirm against test_epoch.
        current_score = test_results[2]
        if epoch > 5 and best_score < current_score:
            # Remember the new best; without this every later epoch with a
            # positive score would overwrite the checkpoint.
            best_score = current_score
            torch.save(model.state_dict(), save_path)

        dump_logs(train_results, test_results, name)
def train(
    model,
    path: str = "data/ptb",
    batch_size: int = 32,
    epochs: int = 60,
    loggr: bool = False,
    name: str = "imle_net",
) -> None:
    """Data preprocessing and training of the model.

    Fits `model` with a checkpoint callback and early stopping, then writes
    the Keras training history to ``logs/{name}_logs.json`` under the
    current working directory.

    Parameters
    ----------
    model: tf.keras.Model
        Model to be trained.
    path: str, optional
        Path to the directory containing the data. (default: 'data/ptb')
    batch_size: int, optional
        Batch size. (default: 32)
    epochs: int, optional
        Number of epochs. (default: 60)
    loggr: bool, optional
        To log wandb metrics. (default: False)
    name: str, optional
        Name of the model. (default: 'imle_net')

    """
    X_train_scale, y_train, _, _, X_val_scale, y_val = preprocess(path=path)
    train_gen = DataGen(X_train_scale, y_train, batch_size=batch_size)
    val_gen = DataGen(X_val_scale, y_val, batch_size=batch_size)

    metric = "auc"
    checkpoint_filepath = os.path.join(os.getcwd(), "checkpoints/")
    # os.makedirs (not the non-existent os.makedir) accepts exist_ok.
    os.makedirs(checkpoint_filepath, exist_ok=True)

    checkpoint = model_checkpoint(checkpoint_filepath, val_gen, loggr=loggr, monitor=metric, name=name)
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor=metric,
        min_delta=0.001,
        patience=10,
        mode="auto",
        restore_best_weights=True,
    )
    callbacks = [checkpoint, stop_early]

    history = model.fit(
        train_gen,
        epochs=epochs,
        validation_data=val_gen,
        callbacks=callbacks,
        workers=5,
    )

    json_logs = os.path.join(os.getcwd(), f"logs/{name}_logs.json")
    # Make sure the target directory exists and close the file once written.
    os.makedirs(os.path.dirname(json_logs), exist_ok=True)
    with open(json_logs, "w") as log_file:
        json.dump(history.history, log_file)
def test(
    model: nn.Module,
    path: str = "data/ptb",
    batch_size: int = 32,
    name: str = "imle_net",
) -> None:
    """Data preprocessing and testing of the model.

    Prints the computed metrics and writes them to
    ``logs/{name}_logs.json`` under the current working directory.

    Parameters
    ----------
    model: nn.Module
        Model to be evaluated.
    path: str, optional
        Path to the directory containing the data. (default: 'data/ptb')
    batch_size: int, optional
        Batch size. (default: 32)
    name: str, optional
        Name of the model. (default: 'imle_net')

    """
    _, _, X_test_scale, y_test, _, _ = preprocess(path=path)
    test_gen = DataGen(X_test_scale, y_test, batch_size=batch_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    pred = epoch_run(model, test_gen, device, name)

    roc_score = roc_auc_score(y_test, pred, average="macro")
    acc, mean_acc = Metrics(y_test, pred)
    class_auc = AUC(y_test, pred)
    summary = metric_summary(y_test, pred)

    print(f"class wise accuracy: {acc}")
    print(f"accuracy: {mean_acc}")
    print(f"roc_score : {roc_score}")
    print(f"class wise AUC : {class_auc}")
    print(f"class wise precision, recall, f1 score : {summary}")

    logs = {
        "roc_score": roc_score,
        "mean_acc": mean_acc,
        "accuracy": acc,
        "class_auc": class_auc,
        "class_precision_recall_f1": summary,
    }

    logs_dir = os.path.join(os.getcwd(), "logs")
    # Create the directory if needed and close the file once written.
    os.makedirs(logs_dir, exist_ok=True)
    logs_path = os.path.join(logs_dir, f"{name}_logs.json")
    with open(logs_path, "w") as log_file:
        json.dump(logs, log_file)
def read(path: str):
    """
    Load the image file at `path`, OCR it (language "pol"), print the
    recognized text and return the receipt parsed from that text.

    Errors
    ------
    IOError is raised when no image can be found under `path`.
    """
    loaded = cv2.imread(path)
    # imread reports a missing/unreadable file by returning None
    if loaded is None:
        raise IOError("Could not open image file under: {}".format(path))

    recognized_text = ocr(preprocess(loaded), language="pol")
    print(recognized_text)
    return parse(recognized_text)
def test(model, path: str = "data/ptb", batch_size: int = 32, name: str = "imle_net") -> None:
    """Testing the model and logging metrics.

    Prints the computed metrics and writes them to
    ``logs/{name}_logs.json`` under the current working directory.

    Parameters
    ----------
    model: tf.keras.Model
        Model to be evaluated.
    path: str, optional
        Path to the directory containing the data. (default: 'data/ptb')
    batch_size: int, optional
        Batch size. (default: 32)
    name: str, optional
        Name of the model. (default: 'imle_net')

    """
    _, _, X_test_scale, y_test, _, _ = preprocess(path=path)
    test_gen = DataGen(X_test_scale, y_test, batch_size=batch_size)

    # NOTE(review): only the inputs of the first generator item are
    # predicted, while the metrics below compare against the full y_test.
    # Unless DataGen returns the whole set in item 0, the lengths will not
    # match - confirm against DataGen.
    pred = model.predict(test_gen[0][0])

    roc_score = roc_auc_score(y_test, pred, average="macro")
    acc, mean_acc = Metrics(y_test, pred)
    class_auc = AUC(y_test, pred)
    summary = metric_summary(y_test, pred)

    print(f"class wise accuracy: {acc}")
    print(f"accuracy: {mean_acc}")
    print(f"roc_score : {roc_score}")
    print(f"class wise AUC : {class_auc}")
    print(f"class wise precision, recall, f1 score : {summary}")

    logs = {
        "roc_score": roc_score,
        "mean_acc": mean_acc,
        "accuracy": acc,
        "class_auc": class_auc,
        "class_precision_recall_f1": summary,
    }

    logs_dir = os.path.join(os.getcwd(), "logs")
    # Create the directory if needed and close the file once written.
    os.makedirs(logs_dir, exist_ok=True)
    logs_path = os.path.join(logs_dir, f"{name}_logs.json")
    with open(logs_path, "w") as log_file:
        json.dump(logs, log_file)
def read(path: str):
    """
    Load the image file at `path`, preprocess it, run OCR on it and
    return the receipt parsed from the recognized text.

    Errors
    ------
    IOError is raised when no image can be found under `path`.
    """
    image = cv2.imread(path)
    # imread returns None instead of raising when the file cannot be
    # opened; fail here with a clear message rather than inside preprocess.
    if image is None:
        raise IOError("Could not open image file under: {}".format(path))

    image = preprocess(image)
    text = ocr(image)
    receipt = parse(text)
    return receipt
prev = 0 seen = dict() while True: time_elapsed = t.time() - prev success, img = cap.read() if time_elapsed > 1. / frame_rate: prev = t.time() img_result = img.copy() img_corners = img.copy() processed_img = preprocess.preprocess(img) corners = process.find_contours(processed_img, img_corners) if corners: warped, matrix = process.warp_image(corners, img) warped_processed = preprocess.preprocess(warped) vertical_lines, horizontal_lines = process.get_grid_lines( warped_processed) mask = process.create_grid_mask(vertical_lines, horizontal_lines) numbers = cv2.bitwise_and(warped_processed, mask) squares = process.split_into_squares(numbers) squares_processed = process.clean_squares(squares) squares_guesses = process.recognize_digits(squares_processed,
def query_model(query, model, indices, language, topk=100):
    """Embed a free-text query and look up its nearest neighbours.

    The query string is tokenized with ``tokenize_docstring_from_string``
    and passed through ``preprocess.preprocess``; the tokens are embedded
    via ``model.get_query_representations`` together with `language`, and
    the first returned representation is searched in `indices`.

    Returns the ``(idxs, distances)`` pair produced by
    ``indices.get_nns_by_vector`` for the `topk` nearest entries.
    """
    tokens = preprocess.preprocess(tokenize_docstring_from_string(query))
    sample = {'docstring_tokens': tokens, 'language': language}
    embedding = model.get_query_representations([sample])[0]
    neighbour_ids, neighbour_distances = indices.get_nns_by_vector(
        embedding, topk, include_distances=True)
    return neighbour_ids, neighbour_distances
def preprocess(cls, data):
    """Forward `data` to ``preprocess.preprocess`` and return its result.

    NOTE(review): inside the method body ``preprocess`` resolves to the
    enclosing/global name (presumably the imported ``preprocess`` module),
    not to this method - confirm against the file's imports.
    """
    processed = preprocess.preprocess(data)
    return processed