def _decode_table_lines_ocr(ocr_prose):
    """Extract row titles and numeric triples from OCR'd table lines.

    Every item of ``ocr_prose`` has two known OCR misreads rewritten and is
    then matched against ``TABLE_LINE_PARSE_RE``.  The first capture group
    is used as the row title; the last three capture groups are converted
    with ``forgiving_float`` and passed, as a 3-tuple, through
    ``sanity_check_values``.

    Args:
        ocr_prose: Iterable of strings, one per table line.

    Returns:
        A ``(titles, values)`` pair of equally long lists.  ``values[i]``
        is the result of ``sanity_check_values`` for the line that
        produced ``titles[i]``.

    Lines that do not match the pattern are skipped, as are lines whose
    conversion raises ``AttributeError``.  Any other exception propagates
    to the caller.
    """
    titles = []
    values = []
    for raw_line in ocr_prose:
        # Fix some common number replacements in OCR.  Only these two
        # characters are rewritten here; '$' is left untouched.
        cleaned = raw_line.replace('§', '5').replace('£', '[-')

        found = TABLE_LINE_PARSE_RE.match(cleaned)
        if not found:
            continue

        captured = found.groups()
        row_title = captured[0]
        first_raw, second_raw, third_raw = (
            captured[-3], captured[-2], captured[-1])

        try:
            triple = (
                forgiving_float(first_raw),
                forgiving_float(second_raw),
                forgiving_float(third_raw),
            )
            triple = sanity_check_values(triple)
            # "Cl" -> "CI": presumably undoes an OCR confusion between a
            # lower-case L and a capital I -- TODO confirm.
            row_title = row_title.replace("Cl", "CI")
        except AttributeError:
            # Drop the whole row; nothing has been appended yet.
            continue

        titles.append(row_title)
        values.append(triple)
    return titles, values
def _decode_table_values_ocr(ocr_prose):
    """Decode the numeric triples found in a block of OCR'd table text.

    The text is cleaned of known OCR misreads, cut into pieces with
    ``TABLE_VALUE_SPLIT_RE`` and each piece is matched against
    ``TABLE_VALUE_GROK_RE``.  The first three capture groups are converted
    with ``forgiving_float`` and passed, as a 3-tuple, through
    ``sanity_check_values``.

    Args:
        ocr_prose: The OCR'd text as a single string.

    Returns:
        A list with one entry per decodable piece, in input order.  A
        piece that cannot be decoded but is exactly ``"(Excluded)"``
        contributes ``("Excluded", "Excluded", "Excluded")``; any other
        undecodable piece is silently skipped.
    """
    # Fix some common number replacements in OCR
    cleaned = ocr_prose
    for misread, intended in (('§', '5'), ('$', '5'), ('£', '[-')):
        cleaned = cleaned.replace(misread, intended)

    excluded_row = ("Excluded", "Excluded", "Excluded")
    values = []
    for chunk in TABLE_VALUE_SPLIT_RE.split(cleaned):
        try:
            # A failed match returns None, so ``.groups()`` raises
            # AttributeError and the piece falls through to the handler.
            captured = TABLE_VALUE_GROK_RE.match(chunk).groups()
            triple = sanity_check_values((
                forgiving_float(captured[0]),
                forgiving_float(captured[1]),
                forgiving_float(captured[2]),
            ))
        except (AttributeError, ValueError):
            if chunk == "(Excluded)":
                values.append(excluded_row)
        else:
            values.append(triple)
    return values
def _decode_values_ocr(ocr_prose):
    """Decode OCR'd value/weight text into ``(v0, v1, v2, weight)`` tuples.

    The text is cleaned of known OCR misreads and processed line by line
    (blank lines are ignored, surrounding whitespace is stripped):

    * A line that is exactly ``"(Excluded)"`` contributes the marker
      triple ``("Excluded", "Excluded", "Excluded")`` and no weight.
    * A line that ``TABLE_VALUE_GROK_RE.split`` cuts into five parts
      contributes a value triple from parts 1-3 (converted with
      ``forgiving_float`` and passed through ``sanity_check_values``) and
      a weight parsed from the trailing part.
    * Any other line is tried as a weight on its own.

    Pieces that fail to parse with ``ValueError`` are skipped.  Values and
    weights are then paired up in order of appearance.

    Args:
        ocr_prose: The OCR'd text as a single string, lines separated by
            ``"\\n"``.

    Returns:
        A list of 4-tuples ``(value[0], value[1], value[2], weight)``.

    Raises:
        ValueError: If the number of decoded value triples differs from
            the number of decoded weights.
    """
    # Fix some common number replacements in OCR
    ocr_prose = ocr_prose.replace('§', '5').replace('$', '5').replace('£', '[-')
    lines = [x.strip() for x in ocr_prose.split('\n') if x.strip()]

    excluded_row = ("Excluded", "Excluded", "Excluded")
    values = []
    weights = []
    for line in lines:
        # Recognise the marker on the line itself.  Comparing the list
        # returned by split() with a string can never be true, so a check
        # written that way would never emit the marker row.
        if line == "(Excluded)":
            values.append(excluded_row)
            continue

        parts = TABLE_VALUE_GROK_RE.split(line)
        if len(parts) == 5:
            # Five parts: text before the match, three captured groups,
            # text after the match.
            try:
                values.append(sanity_check_values((
                    forgiving_float(parts[1]),
                    forgiving_float(parts[2]),
                    forgiving_float(parts[3]),
                )))
            except ValueError:
                # Unparseable triple: skip it.  If its weight still
                # parses, the length check below reports the mismatch.
                pass
            weight_text = parts[4].strip()
        else:
            # Not a value line; it may be a weight on a line of its own.
            weight_text = line

        try:
            weights.append(forgiving_float(weight_text))
        except ValueError:
            pass

    if len(values) != len(weights):
        raise ValueError(
            "decoded %d value rows but %d weights"
            % (len(values), len(weights)))

    return [
        (value[0], value[1], value[2], weight)
        for value, weight in zip(values, weights)
    ]