Example #1
0
def extract_number(digit_string):
    """
    Convert the raw text returned by tesseract into a float.
    Inputs:
    digit_string (string): Text produced by tesseract.
    Outputs:
    The float value of the text once its spaces have been removed, or None
    (after logging an error) when that text is not a valid float.
    """
    # Spaces are dropped before parsing so a string such as "1 2.5" becomes "12.5".
    compact = digit_string.replace(" ", "")

    try:
        return float(compact)
    except ValueError:
        message = "Output from tesseract is not a float " + digit_string
        scatteract_logger.get_logger().error(message)
        return None
Example #2
0
def extract_number(digit_string):
    """
    Turn the text recognised by tesseract into a number.
    Inputs:
    digit_string (string): String output from tesseract.
    Outputs:
    pred (float) : The input parsed as a float, with every space character
    ignored. None when the input cannot be parsed; an error is logged
    in that case.
    """

    # Start from the failure value; it is only overwritten on a successful parse.
    pred = None
    without_spaces = "".join(digit_string.split(" "))

    try:
        pred = float(without_spaces)
    except ValueError:
        logger = scatteract_logger.get_logger()
        logger.error("Output from tesseract is not a float " + digit_string)

    return pred
Example #3
0
    def get_metrics(self, df_dict_pred, coord_idl, csv_output_dir = None, max_dist_perc = 2.0, quick = False):
        """
        Method which computes the precision and recall for all the plots.
        Inputs:
        df_dict_pred (dictionary): Dictionary of dataframes which contains the predicted points in label coordinates.
        coord_idl (string): Path to the idl file which contains the ground truth coordinates.
        csv_output_dir (string): Optional directory. When given, the per-plot results are written to
        <csv_output_dir>/precision_recall_list.csv.
        max_dist_perc (float): Maximum tolerated percentage difference between prediction and ground truth.
        quick (boolean): Boolean to decide whether to approximate the precision recall quantities.

        Outputs:
        df_prec_recall : (pandas dataframe) Pandas dataframe of precisions and recalls for each plot.
        The dataframe is empty when none of the predicted plots has a ground truth entry.

        Note: the summary values logged as "Percentage of ..." are fractions in [0, 1].
        """

        df_dict_true = parse_coords(coord_idl)
        precision_list, recall_list, image_name_list = [], [], []
        count_good = 0
        count_perfect = 0
        count_bad = 0
        metrics_logger = scatteract_logger.get_logger()

        for file_name, df_pred in df_dict_pred.items():

            df_true = df_dict_true.get(file_name)
            if df_true is None:
                metrics_logger.warn("No ground truth for :" + file_name)
                continue

            # Distances are normalised by the per-column range of the ground truth.
            prec, rec = self.get_precision_recall(df_pred, df_true,
                                                  max_dist_perc = max_dist_perc,
                                                  norm = (df_true.max(axis=0)-df_true.min(axis=0)), quick = quick)
            precision_list.append(prec)
            recall_list.append(rec)
            image_name_list.append(file_name)
            if rec>=0.8 and prec>=0.8:
                count_good+=1
            if rec==1.0 and prec==1.0:
                count_perfect+=1
            if rec<=0.1 and prec<=0.1:
                count_bad+=1

        n_scored = len(precision_list)
        if n_scored == 0:
            # Without this guard the ratios below raise ZeroDivisionError when no plot has ground truth.
            metrics_logger.warn("No plot with ground truth was found, summary metrics are skipped")
        else:
            mean_precision = np.mean(precision_list)
            mean_recall = np.mean(recall_list)
            # F1 is taken as 0 when both means are 0, instead of evaluating 0/0.
            pr_sum = mean_recall + mean_precision
            if pr_sum == 0:
                f1_score = 0.0
            else:
                f1_score = 2*mean_recall*mean_precision/pr_sum

            metrics_logger.info("Percentage of good extraction (recall and precision above 80%): {}".format(float(count_good)/n_scored))
            metrics_logger.info("Percentage of perfect extraction (recall and precision at 100%): {}".format(float(count_perfect)/n_scored))
            metrics_logger.info("Percentage of bad extraction (recall and precision below 10%): {}".format(float(count_bad)/n_scored))
            metrics_logger.info("Precision: {}".format(mean_precision))
            metrics_logger.info("Recall: {}".format(mean_recall))
            metrics_logger.info("F1 score: {}".format(f1_score))

        df_prec_recall = pd.DataFrame({"image_name":image_name_list,"recall":recall_list,"precision":precision_list})
        if csv_output_dir is not None:
            csv_path = csv_output_dir + "/" + "precision_recall_list.csv"
            df_prec_recall.to_csv(csv_path)
            metrics_logger.debug("Saving a csv of precisions and recalls : {}".format(csv_path))

        return df_prec_recall
Example #4
0
    Example of command-line usage:

    TEST: (requires a dict of ground truth for ticks, labels and points bounding boxes, and an idl file for the ground truth coordinate)

    python scatter_extract.py --model_dict '{"ticks":"./output/lstm_rezoom_plot_ticks_2017_04_11_19.52", "labels":"./output/lstm_rezoom_plot_labels_2017_04_11_01.14","points":"./output/lstm_rezoom_plot_points_2017_04_14_15.58"}' \
    --iteration 125000 --image_dir data/plot_test/ --true_idl_dict '{"ticks":"./data/plot_test/ticks.idl","labels":"./data/plot_test/labels.idl", "points":"./data/plot_test/points.idl"}' \
    --image_output_dir image_output --csv_output_dir csv_output --true_coord_idl ./data/plot_test/coords.idl


    PREDICT: (requires an idl file which has a list of images to test on)

    python scatter_extract.py --model_dict '{"ticks":"./output/lstm_rezoom_plot_ticks_2017_04_11_19.52", "labels":"./output/lstm_rezoom_plot_labels_2017_04_11_01.14","points":"./output/lstm_rezoom_plot_points_2017_04_14_15.58"}' \
    --iteration 125000 --image_dir data/plots_real/ --predict_idl ./data/plots_real/test_real.idl --image_output_dir image_output --csv_output_dir csv_output
    """

    mylogger = scatteract_logger.get_logger()
    parser = argparse.ArgumentParser()

    parser.add_argument('--model_dict', help='Directory for the object detection models', required=True, type=json.loads)
    parser.add_argument('--iteration', help='Iteration number for the trained models', required=True)
    parser.add_argument('--image_dir', help='Directory of the images', required=True)
    parser.add_argument('--true_idl_dict', help='Path of the ground truth idls', required=False, type=json.loads, default=None)
    parser.add_argument('--predict_idl', help='Path of an idl file which list the images to predict on', required=False, default=None)
    parser.add_argument('--image_output_dir', help='Directory to output images with bounding boxes', required=True)
    parser.add_argument('--csv_output_dir', help='Directory to output csv of results', required=True)
    parser.add_argument('--true_coord_idl', help='Idl of the ground truth coordinates', required=False, default=None)
    parser.add_argument('--conf_threshold', help='Confidence threshold', required=False, default=0.3)
    parser.add_argument('--max_dist_perc', help='Maximum percent distance to be considered true positive', required=False, default=2.0)
    args = vars(parser.parse_args())

Example #5
0
    def get_metrics(self,
                    df_dict_pred,
                    coord_idl,
                    csv_output_dir=None,
                    max_dist_perc=2.0,
                    quick=False):
        """
        Compute per-plot precision and recall, log summary statistics and
        optionally save the per-plot results as a csv file.
        Inputs:
        df_dict_pred: Mapping from file name to the predicted dataframe.
        coord_idl: Passed to read_coordinates to load the ground truth,
        which is looked up by the same file names.
        csv_output_dir: Optional directory. When given, the results are
        written to <csv_output_dir>/precision_recall_list.csv.
        max_dist_perc: Forwarded unchanged to get_precision_recall.
        quick: Forwarded unchanged to get_precision_recall.

        Outputs:
        df_prec_recall : (pandas dataframe) One row per scored plot with the
        columns image_name, recall and precision. Plots without ground
        truth are skipped with a warning; the dataframe is empty when no
        plot could be scored.

        Note: the summary values logged as "Percentage of ..." are
        fractions in [0, 1].
        """

        df_dict_true = read_coordinates(coord_idl)
        precision_list, recall_list, image_name_list = [], [], []
        count_good = 0
        count_perfect = 0
        count_bad = 0
        metrics_logger = scatteract_logger.get_logger()

        for name_f in df_dict_pred:

            df_pred = df_dict_pred[name_f]
            df_true = df_dict_true.get(name_f, None)

            if df_true is not None:
                # Distances are normalised by the per-column range of the
                # ground truth.
                prec, rec = self.get_precision_recall(
                    df_pred,
                    df_true,
                    max_dist_perc=max_dist_perc,
                    norm=(df_true.max(axis=0) - df_true.min(axis=0)),
                    quick=quick)
                precision_list.append(prec)
                recall_list.append(rec)
                image_name_list.append(name_f)
                if rec >= 0.8 and prec >= 0.8:
                    count_good += 1
                if rec == 1.0 and prec == 1.0:
                    count_perfect += 1
                if rec <= 0.1 and prec <= 0.1:
                    count_bad += 1
            else:
                metrics_logger.warn("No ground truth for :" + name_f)

        scored_count = len(precision_list)
        if scored_count:
            avg_precision = np.mean(precision_list)
            avg_recall = np.mean(recall_list)
            # F1 is taken as 0 when both averages are 0, instead of
            # evaluating 0/0.
            denominator = avg_recall + avg_precision
            if denominator == 0:
                f1_score = 0.0
            else:
                f1_score = 2 * avg_recall * avg_precision / denominator

            metrics_logger.info(
                "Percentage of good extraction (recall and precision above 80%): {}"
                .format(float(count_good) / scored_count))
            metrics_logger.info(
                "Percentage of perfect extraction (recall and precision at 100%): {}"
                .format(float(count_perfect) / scored_count))
            metrics_logger.info(
                "Percentage of bad extraction (recall and precision below 10%): {}"
                .format(float(count_bad) / scored_count))
            metrics_logger.info("Precision: {}".format(avg_precision))
            metrics_logger.info("Recall: {}".format(avg_recall))
            metrics_logger.info("F1 score: {}".format(f1_score))
        else:
            # Dividing by the number of scored plots would raise
            # ZeroDivisionError here, so the summary is skipped.
            metrics_logger.warn(
                "No plot with ground truth was found, summary metrics are skipped"
            )

        df_prec_recall = pd.DataFrame({
            "image_name": image_name_list,
            "recall": recall_list,
            "precision": precision_list
        })
        if csv_output_dir is not None:
            csv_path = csv_output_dir + "/" + "precision_recall_list.csv"
            df_prec_recall.to_csv(csv_path)
            metrics_logger.debug(
                "Saving a csv of precisions and recalls : {}".format(csv_path))

        return df_prec_recall
Example #6
0
import argparse
import sys
import time

import cv2
import numpy as np
import pandas as pd
import pyocr
import pyocr.builders
from scipy import ndimage

import scatteract_logger

# Select the OCR backend once at import time: the first tool reported by pyocr
# is used, and the program exits with status 1 when none is available.
tools = pyocr.get_available_tools()
if not tools:
    print("No OCR tool found")
    # Relies on the module-level `import sys`.
    sys.exit(1)
tool = tools[0]
scatteract_logger.get_logger().info("Will use tool '%s'" % (tool.get_name()))


def compute_skew(image):
    """
    Method which computes the skew of a label.
    Inputs:
    image (numpy array): Numpy array of the image label.
    Output:
    angle (float) Angle needed to unskew the label.
    contours_len: Number of independent object detected
    (if the image is clean, this is the number of digitsminus signs, and points)
    """

    image = ImageOps.invert(image)