Exemple #1
0
    def _decode_table_lines_ocr(ocr_prose):
        """Extract row titles and numeric triples from OCR'd table lines.

        Each line is matched against ``TABLE_LINE_PARSE_RE``. Lines that do
        not match, or whose captured fields cannot be converted, are skipped
        so that one bad line does not abort the whole table.

        Args:
            ocr_prose: Iterable of text lines (each must support
                ``str.replace``).

        Returns:
            A ``(titles, values)`` pair of equally long lists. ``titles[i]``
            is the first captured group of the i-th accepted line with
            ``"Cl"`` rewritten to ``"CI"``; ``values[i]`` is the result of
            ``sanity_check_values`` applied to the 3-tuple built from the
            last three captured groups.
        """
        titles = []
        values = []

        for line in ocr_prose:
            # Fix some common number replacements in OCR
            line = line.replace('§', '5').replace('£', '[-')

            matches = TABLE_LINE_PARSE_RE.match(line)
            if not matches:
                continue
            groups = matches.groups()

            try:
                value = sanity_check_values((
                    forgiving_float(groups[-3]),
                    forgiving_float(groups[-2]),
                    forgiving_float(groups[-1]),
                ))
                # Presumably a lower-case "l" misread for a capital "I"
                # (as in "CI") -- TODO confirm against the regex/title format.
                title = groups[0].replace("Cl", "CI")
            except (AttributeError, ValueError):
                # AttributeError: presumably a captured group was None.
                # ValueError: a field could not be converted or validated;
                # treated as a skippable line, matching how the other OCR
                # decoders handle the same helper calls.
                continue

            # Append only after both pieces were produced, so the two lists
            # always stay aligned.
            titles.append(title)
            values.append(value)

        return titles, values
Exemple #2
0
    def _decode_table_values_ocr(ocr_prose):
        """Parse a block of OCR text into a list of value triples.

        The text is first cleaned of common OCR misreads, then cut into
        pieces with ``TABLE_VALUE_SPLIT_RE``. Every piece is matched against
        ``TABLE_VALUE_GROK_RE``; its first three captured groups are converted
        with ``forgiving_float`` and passed through ``sanity_check_values``.

        A piece that fails to match or convert is dropped, unless it is
        exactly ``"(Excluded)"``, in which case a placeholder triple of
        ``"Excluded"`` strings is recorded instead.

        Args:
            ocr_prose: The OCR output as a single string.

        Returns:
            List of triples, in the order the pieces appear in the text.
        """
        # Undo common OCR misreads; the substitutions are applied in order.
        for misread, intended in (('§', '5'), ('$', '5'), ('£', '[-')):
            ocr_prose = ocr_prose.replace(misread, intended)

        decoded = []
        for chunk in TABLE_VALUE_SPLIT_RE.split(ocr_prose):
            try:
                # A failed match returns None, so .groups() raises
                # AttributeError and lands in the handler below.
                fields = TABLE_VALUE_GROK_RE.match(chunk).groups()
                triple = sanity_check_values(
                    tuple(forgiving_float(fields[i]) for i in range(3))
                )
            except (AttributeError, ValueError):
                if chunk == "(Excluded)":
                    decoded.append(("Excluded", "Excluded", "Excluded"))
            else:
                decoded.append(triple)
        return decoded
Exemple #3
0
    def _decode_values_ocr(ocr_prose):
        """Decode OCR text into ``(v0, v1, v2, weight)`` rows.

        The text is cleaned of common OCR misreads and split into non-empty,
        stripped lines. A line that ``TABLE_VALUE_GROK_RE.split`` cuts into
        exactly five parts is treated as a value row: parts 1-3 are converted
        to a value triple and part 4 (the trailing text) is tried as a weight.
        Any other line is tried as a bare weight. Fields that fail with
        ``ValueError`` are silently skipped.

        Args:
            ocr_prose: The OCR output as a single newline-separated string.

        Returns:
            List of 4-tuples pairing the i-th decoded value triple with the
            i-th decoded weight.

        Raises:
            ValueError: If the number of decoded value triples differs from
                the number of decoded weights.
        """
        # Fix some common number replacements in OCR
        ocr_prose = ocr_prose.replace('§', '5').replace('$', '5').replace('£', '[-')
        lines = [x.strip() for x in ocr_prose.split('\n') if x.strip()]

        values = []
        weights = []

        for line in lines:
            parts = TABLE_VALUE_GROK_RE.split(line)
            if len(parts) == 5:
                try:
                    value = (forgiving_float(parts[1]),
                             forgiving_float(parts[2]),
                             forgiving_float(parts[3]))
                    values.append(sanity_check_values(value))
                except ValueError:
                    # NOTE(review): the previous code tested
                    # `parts == "(Excluded)"` here, which compares a list with
                    # a string and could never be true, so no "Excluded"
                    # placeholder was ever emitted. The dead test is removed;
                    # an unparsable triple is simply skipped, as before.
                    pass
                # The last element of a split result is always a string.
                weight_text = parts[4].strip()
            else:
                weight_text = line

            try:
                weights.append(forgiving_float(weight_text))
            except ValueError:
                # Not a weight; ignore the line/fragment.
                pass

        if len(values) != len(weights):
            raise ValueError(
                "decoded %d value triples but %d weights"
                % (len(values), len(weights))
            )

        return [
            (value[0], value[1], value[2], weight)
            for value, weight in zip(values, weights)
        ]