Example 1
def test_run_pipeline_locally_3_langs_with_comments():
    # Smoke test: run the full preprocessing pipeline locally over three
    # languages, keeping comments (fixtures come from the test module).
    copy_and_clean_folder()
    preprocess(root,
               lang1,
               lang2,
               keep_comments,
               local=True,
               lang3=lang3,
               test_size=10,
               size_gb=0)
Example 2
def generate_batches(image_list, out_dir, ext='.bmp', batch_size=1000):

    # Load the preprocessing pipeline settings; safe_load avoids yaml.load's
    # unsafe default loader, and the file handle is closed properly.
    config_path = abspath(join(dirname(__file__), 'config',
                               'preprocessing.yaml'))
    with open(config_path) as f:
        prep_conf = yaml.safe_load(f)['pipeline']
    prep_dir = make_sub_dir(out_dir, 'preprocessed')
    batches = [
        image_list[i:i + batch_size]
        for i in range(0, len(image_list), batch_size)
    ]

    for batch in batches:

        img_names, preprocessed_arr = [], []

        for seg_image in batch:

            img_name = splitext(basename(seg_image))[0]
            img_names.append(img_name)
            prep_out = join(prep_dir, img_name + ext)
            prep_img = preprocess(seg_image, prep_out, prep_conf)
            preprocessed_arr.append(prep_img)

        # Reshape array - samples, channels, height, width
        preprocessed_arr = np.asarray(preprocessed_arr).transpose((0, 3, 1, 2))
        yield preprocessed_arr, img_names
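
A quick usage sketch for the generator above; the input folder, file pattern, and batch size are assumptions for illustration, not part of the original:

from glob import glob

image_list = sorted(glob('data/segmented/*.bmp'))  # hypothetical input folder
for batch_arr, names in generate_batches(image_list, 'out', batch_size=500):
    # Each yield pairs a (samples, channels, height, width) array with the
    # corresponding image names.
    print(batch_arr.shape, names[:3])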
Example 3
def process_document(im, config, model, result_mapper):
    # Parse the form structure, then run recognition over the extracted fields.
    fsp = FormStructureParser(config)

    im = preprocess(im, config)
    form_data = fsp.process_form(im)
    form_data = recognize(form_data, model, result_mapper)

    return form_data
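
A hedged driver for process_document; the image path is a placeholder, and config, model, and result_mapper are assumed to come from the surrounding project:

import cv2

im = cv2.imread('scans/form_001.png')  # hypothetical scanned form
if im is None:
    raise IOError('could not read the form scan')
form_data = process_document(im, config, model, result_mapper)
print(form_data)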
Example 4
def train(
    model: nn.Module,
    path: str = "data/ptb",
    batch_size: int = 32,
    epochs: int = 60,
    loggr: bool = False,
    name: str = "imle_net",
) -> None:
    """Data preprocessing and training of the model.

    Parameters
    ----------
    model: nn.Module
        Model to be trained.
    path: str, optional
        Path to the directory containing the data. (default: 'data/ptb')
    batch_size: int, optional
        Batch size. (default: 32)
    epochs: int, optional
        Number of epochs. (default: 60)
    loggr: bool, optional
        To log wandb metrics. (default: False)
    name: str, optional
        Name of the model. (default: 'imle_net')

    """

    X_train_scale, y_train, _, _, X_val_scale, y_val = preprocess(path=path)
    train_gen = DataGen(X_train_scale, y_train, batch_size=batch_size)
    val_gen = DataGen(X_val_scale, y_val, batch_size=batch_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_func = torch.nn.BCEWithLogitsLoss()

    best_score = 0.0
    for epoch in range(epochs):
        train_results = train_epoch(model,
                                    optimizer,
                                    loss_func,
                                    train_gen,
                                    epoch,
                                    device,
                                    loggr=loggr)
        test_results = test_epoch(model,
                                  loss_func,
                                  val_gen,
                                  epoch,
                                  device,
                                  loggr=loggr)

        # Track the best validation score and checkpoint only on improvement.
        if epoch > 5 and best_score < test_results[2]:
            best_score = test_results[2]
            save_dir = os.path.join(os.getcwd(), "checkpoints")
            os.makedirs(save_dir, exist_ok=True)
            save_path = os.path.join(save_dir, f"{name}_weights.pt")
            torch.save(model.state_dict(), save_path)
            dump_logs(train_results, test_results, name)
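
A minimal sketch of driving this train function; TinyNet is a stand-in for any nn.Module, and the (12, 1000) ECG input shape and 5 output classes are assumptions about the PTB data:

import torch.nn as nn

class TinyNet(nn.Module):
    """Stand-in model; any nn.Module with a matching input shape works."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(12 * 1000, 5)

    def forward(self, x):
        return self.fc(x.flatten(1))

train(TinyNet(), path="data/ptb", epochs=2, name="tiny_net")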
Example 5
def train(
    model,
    path: str = "data/ptb",
    batch_size: int = 32,
    epochs: int = 60,
    loggr: bool = False,
    name: str = "imle_net",
) -> None:
    """Data preprocessing and training of the model.

    Parameters
    ----------
    model: tf.keras.Model
        Model to be trained.
    path: str, optional
        Path to the directory containing the data. (default: 'data/ptb')
    batch_size: int, optional
        Batch size. (default: 32)
    epochs: int, optional
        Number of epochs. (default: 60)
    loggr: bool, optional
        To log wandb metrics. (default: False)
    name: str, optional
        Name of the model. (default: 'imle_net')

    """

    X_train_scale, y_train, _, _, X_val_scale, y_val = preprocess(path=path)
    train_gen = DataGen(X_train_scale, y_train, batch_size=batch_size)
    val_gen = DataGen(X_val_scale, y_val, batch_size=batch_size)

    metric = "auc"
    checkpoint_filepath = os.path.join(os.getcwd(), "checkpoints/")
    os.makedirs(checkpoint_filepath, exist_ok=True)

    checkpoint = model_checkpoint(checkpoint_filepath,
                                  val_gen,
                                  loggr=loggr,
                                  monitor=metric,
                                  name=name)
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor=metric,
        min_delta=0.001,
        patience=10,
        mode="auto",
        restore_best_weights=True,
    )

    callbacks = [checkpoint, stop_early]
    history = model.fit(
        train_gen,
        epochs=epochs,
        validation_data=val_gen,
        callbacks=callbacks,
        workers=5,
    )
    # Persist the Keras training history; create the logs directory first
    # and close the file handle properly.
    logs_dir = os.path.join(os.getcwd(), "logs")
    os.makedirs(logs_dir, exist_ok=True)
    with open(os.path.join(logs_dir, f"{name}_logs.json"), "w") as f:
        json.dump(history.history, f)
Example 6
def test(
    model: nn.Module,
    path: str = "data/ptb",
    batch_size: int = 32,
    name: str = "imle_net",
) -> None:
    """Data preprocessing and testing of the model.

    Parameters
    ----------
    model: nn.Module
        Model to be tested.
    path: str, optional
        Path to the directory containing the data. (default: 'data/ptb')
    batch_size: int, optional
        Batch size. (default: 32)
    name: str, optional
        Name of the model. (default: 'imle_net')

    """

    _, _, X_test_scale, y_test, _, _ = preprocess(path=path)
    test_gen = DataGen(X_test_scale, y_test, batch_size=batch_size)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    pred = epoch_run(model, test_gen, device, name)

    roc_score = roc_auc_score(y_test, pred, average="macro")
    acc, mean_acc = Metrics(y_test, pred)
    class_auc = AUC(y_test, pred)
    summary = metric_summary(y_test, pred)

    print(f"class wise accuracy: {acc}")
    print(f"accuracy: {mean_acc}")
    print(f"roc_score : {roc_score}")
    print(f"class wise AUC : {class_auc}")
    print(f"class wise precision, recall, f1 score : {summary}")

    logs = dict()
    logs["roc_score"] = roc_score
    logs["mean_acc"] = mean_acc
    logs["accuracy"] = acc
    logs["class_auc"] = class_auc
    logs["class_precision_recall_f1"] = summary
    logs_dir = os.path.join(os.getcwd(), "logs")
    os.makedirs(logs_dir, exist_ok=True)
    with open(os.path.join(logs_dir, f"{name}_logs.json"), "w") as f:
        json.dump(logs, f)
Example 7
def read(path: str):
    """
    `path` to image file

    Errors
    ------
    
    IOError is raised when no image can be found under `path`.
    """
    image = cv2.imread(path)  # image is None if no image is found/ opened
    if image is None:
        raise IOError("Could not open image file under: {}".format(path))

    image = preprocess(image)
    text = ocr(image, language="pol")
    print(text)
    receipt = parse(text)
    return receipt
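
Because read raises IOError for unreadable files, callers can guard it; the path here is a placeholder:

try:
    receipt = read('receipts/2021-03-01.jpg')  # hypothetical scan
except IOError as err:
    print(err)  # skip the file or queue it for a retry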
Example 8
def test(model,
         path: str = "data/ptb",
         batch_size: int = 32,
         name: str = "imle_net") -> None:
    """Testing the model and logging metrics.

    Parameters
    ----------
    model: tf.keras.Model
        Model to be tested.
    path: str, optional
        Path to the directory containing the data. (default: 'data/ptb')
    batch_size: int, optional
        Batch size. (default: 32)
    name: str, optional
        Name of the model. (default: 'imle_net')

    """

    _, _, X_test_scale, y_test, _, _ = preprocess(path=path)
    test_gen = DataGen(X_test_scale, y_test, batch_size=batch_size)

    # Predict over the whole generator so predictions align with y_test,
    # rather than scoring only a single batch.
    pred = model.predict(test_gen)
    roc_score = roc_auc_score(y_test, pred, average="macro")
    acc, mean_acc = Metrics(y_test, pred)
    class_auc = AUC(y_test, pred)
    summary = metric_summary(y_test, pred)

    print(f"class wise accuracy: {acc}")
    print(f"accuracy: {mean_acc}")
    print(f"roc_score : {roc_score}")
    print(f"class wise AUC : {class_auc}")
    print(f"class wise precision, recall, f1 score : {summary}")

    logs = dict()
    logs["roc_score"] = roc_score
    logs["mean_acc"] = mean_acc
    logs["accuracy"] = acc
    logs["class_auc"] = class_auc
    logs["class_precision_recall_f1"] = summary
    logs_dir = os.path.join(os.getcwd(), "logs")
    os.makedirs(logs_dir, exist_ok=True)
    with open(os.path.join(logs_dir, f"{name}_logs.json"), "w") as f:
        json.dump(logs, f)
Example 9
def read(path: str):
    """
    `path` to image file
    """
    image = cv2.imread(path)
    
    # cv2.imshow("original", image)
    # cv2.waitKey(0)

    image = preprocess(image)

    # cv2.imshow("processed", image)
    # cv2.waitKey(0)

    text = ocr(image)

    # print(text)

    receipt = parse(text)

    return receipt
Example 10
prev = 0

seen = dict()

while True:
    time_elapsed = t.time() - prev

    success, img = cap.read()

    # Throttle: process at most `frame_rate` frames per second.
    if time_elapsed > 1. / frame_rate:
        prev = t.time()

        img_result = img.copy()
        img_corners = img.copy()

        processed_img = preprocess.preprocess(img)
        corners = process.find_contours(processed_img, img_corners)

        if corners:
            warped, matrix = process.warp_image(corners, img)
            warped_processed = preprocess.preprocess(warped)

            vertical_lines, horizontal_lines = process.get_grid_lines(
                warped_processed)
            mask = process.create_grid_mask(vertical_lines, horizontal_lines)
            numbers = cv2.bitwise_and(warped_processed, mask)

            squares = process.split_into_squares(numbers)
            squares_processed = process.clean_squares(squares)

            squares_guesses = process.recognize_digits(squares_processed,
Example 11
def query_model(query, model, indices, language, topk=100):
    # Embed the natural-language query, then fetch its nearest code
    # neighbours from the Annoy index.
    tokens = preprocess.preprocess(tokenize_docstring_from_string(query))
    query_embedding = model.get_query_representations(
        [{'docstring_tokens': tokens, 'language': language}])[0]
    idxs, distances = indices.get_nns_by_vector(query_embedding, topk,
                                                include_distances=True)
    return idxs, distances
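
A sketch of calling query_model against an Annoy index; the 128-dimension embedding size and index filename are assumptions, and model is whatever encoder exposes get_query_representations:

from annoy import AnnoyIndex

indices = AnnoyIndex(128, 'angular')  # dimensionality must match the model
indices.load('code_vectors.ann')      # hypothetical prebuilt index
idxs, distances = query_model('read a csv file into a dataframe', model,
                              indices, language='python', topk=10)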
Example 12
@classmethod
def preprocess(cls, data):
    # Delegate to the module-level preprocess helper.
    return preprocess.preprocess(data)