Ejemplo n.º 1
0
    def _decode_table_lines_ocr(ocr_prose):
        """Parse OCR'd table lines into parallel title/value lists.

        Args:
            ocr_prose: iterable of OCR text lines, one table row each.

        Returns:
            Tuple ``(titles, values)`` where each value is the
            sanity-checked triple produced from the last three regex
            groups of a matching line.
        """
        titles = []
        values = []

        for line in ocr_prose:

            # Fix some common number replacements in OCR
            line = line.replace('§', '5').replace('£', '[-')

            matches = TABLE_LINE_PARSE_RE.match(line)
            if not matches:
                continue
            groups = matches.groups()
            title = groups[0]
            try:
                value = (forgiving_float(groups[-3]),
                         forgiving_float(groups[-2]),
                         forgiving_float(groups[-1]))
                value = sanity_check_values(value)

                # "Cl" is a common OCR misread of "CI" (confidence interval).
                titles.append(title.replace("Cl", "CI"))
                values.append(value)
            except (AttributeError, ValueError):
                # BUG FIX: forgiving_float raises ValueError on unparseable
                # text; previously only AttributeError was caught, so one
                # garbled number aborted the whole table parse. Skip the
                # line instead, matching _decode_table_values_ocr.
                continue

        return titles, values
Ejemplo n.º 2
0
    def _decode_footer_summary_ocr(ocr_prose):
        """Extract heterogeneity and overall-effect statistics from footer text.

        Args:
            ocr_prose: raw OCR text of the plot footer.

        Returns:
            Tuple of two OrderedDicts: ``(hetrogeneity, overall_effect)``,
            keyed by the closest match from the corresponding key list.
        """
        hetrogeneity = collections.OrderedDict()
        overall_effect = collections.OrderedDict()

        # Checked in this order: heterogeneity takes precedence on a line
        # that somehow contains both markers.
        markers = ("Heterogeneity:", "Test for overall effect:")

        for raw in ocr_prose.split('\n'):
            line = raw.strip()
            if not line:
                continue

            prefix = None
            payload = ""
            for candidate in markers:
                pos = line.find(candidate)
                if pos != -1:
                    prefix = candidate
                    payload = line[pos + len(candidate):].strip()
                    break
            if prefix is None:
                continue

            for fragment in PARTS_SPLIT_RE.split(payload):
                grokked = PARTS_GROK_RE.match(fragment)
                if grokked is None:
                    continue
                key, value = grokked.groups()
                key = key.strip()

                if prefix == "Heterogeneity:":
                    # Common OCR misreads of the "I" in I-squared.
                    if key in ("7", "?", "F"):
                        key = "I"
                    target = hetrogeneity
                    known_keys = HETROGENEITY_KEYS
                else:
                    target = overall_effect
                    known_keys = OVERALL_EFFECT_KEYS

                close = difflib.get_close_matches(key, known_keys)
                try:
                    target[close[0]] = forgiving_float(value)
                except (IndexError, ValueError):
                    # No close key match, or the value was unparseable.
                    pass

        return hetrogeneity, overall_effect
Ejemplo n.º 3
0
    def _decode_table_values_ocr(ocr_prose):
        """Parse OCR'd table text into a list of sanity-checked value triples.

        Args:
            ocr_prose: raw OCR text of the values column.

        Returns:
            List of 3-tuples; excluded studies appear as
            ``("Excluded", "Excluded", "Excluded")``.
        """
        # Fix some common number replacements in OCR
        cleaned = ocr_prose.replace('§', '5').replace('$', '5').replace('£', '[-')

        values = []
        for chunk in TABLE_VALUE_SPLIT_RE.split(cleaned):
            try:
                g = TABLE_VALUE_GROK_RE.match(chunk).groups()
                triple = sanity_check_values(
                    (forgiving_float(g[0]),
                     forgiving_float(g[1]),
                     forgiving_float(g[2])))
                values.append(triple)
            except (AttributeError, ValueError):
                # Non-numeric chunk: keep explicitly excluded studies,
                # silently drop anything else.
                if chunk == "(Excluded)":
                    values.append(("Excluded", "Excluded", "Excluded"))
        return values
Ejemplo n.º 4
0
    def _decode_footer_scale_ocr(ocr_prose):
        """Return the scale value from the first footer line matching SCALE_RE.

        Args:
            ocr_prose: raw OCR text of the plot footer.

        Raises:
            ValueError: if no line yields a parseable scale value.
        """
        cleaned = ocr_prose.replace('§', '5').replace('$', '5').replace('£', '[-')
        for raw in cleaned.split('\n'):
            line = raw.strip()
            if not line:
                continue
            match = SCALE_RE.match(line)
            try:
                # AttributeError covers a failed match (match is None).
                return forgiving_float(match.groups()[1])
            except (ValueError, AttributeError):
                continue
        raise ValueError
Ejemplo n.º 5
0
    def _decode_values_ocr(ocr_prose):
        """Parse OCR'd text into (value, ci_low, ci_high, weight) tuples.

        Args:
            ocr_prose: raw OCR text, one record per line.

        Returns:
            List of 4-tuples pairing each value triple with its weight.

        Raises:
            ValueError: if the number of parsed values and weights differ.
        """
        # Fix some common number replacements in OCR
        ocr_prose = ocr_prose.replace('§', '5').replace('$', '5').replace('£', '[-')
        lines = [x.strip() for x in ocr_prose.split('\n') if x.strip()]

        values = []
        weights = []

        # first find the values
        for line in lines:
            parts = TABLE_VALUE_GROK_RE.split(line)
            if len(parts) == 5:
                try:
                    value = (forgiving_float(parts[1]),
                             forgiving_float(parts[2]),
                             forgiving_float(parts[3]))
                    value = sanity_check_values(value)
                    values.append(value)
                except ValueError:
                    # BUG FIX: the original compared the *list* ``parts`` to
                    # the string "(Excluded)" — always False, so excluded
                    # rows were dropped and the length check below raised.
                    # Check the split fragments instead (cf. the sibling
                    # _decode_table_values_ocr).
                    if "(Excluded)" in parts:
                        values.append(("Excluded", "Excluded", "Excluded"))
                try:
                    weight = forgiving_float(parts[4].strip())
                    weights.append(weight)
                except ValueError:
                    pass
            else:
                # Lines that are not value rows may still be bare weights.
                try:
                    weight = forgiving_float(line)
                    weights.append(weight)
                except ValueError:
                    pass

        if len(values) != len(weights):
            raise ValueError

        return [(v[0], v[1], v[2], w) for v, w in zip(values, weights)]
Ejemplo n.º 6
0
    def _decode_footer_scale_ocr(ocr_prose):
        """Extract the "favours" labels and mid-scale value from footer OCR text.

        Args:
            ocr_prose: raw OCR text of the plot footer.

        Returns:
            Tuple ``(groups, mid_scale)``: ``groups`` is the FAVOURS_RE
            match groups (or None if never matched), ``mid_scale`` the
            parsed middle scale value (or None). Later matching lines
            overwrite earlier ones.
        """
        groups = None
        mid_scale = None

        # Fix some common number replacements in OCR
        ocr_prose = ocr_prose.replace('§',
                                      '5').replace('$',
                                                   '5').replace('£', '[-')
        lines = ocr_prose.split('\n')
        for line in lines:
            match = FAVOURS_RE.match(line)
            if match:
                groups = match.groups()
                continue
            match = SCALE_RE.match(line)
            if match:
                try:
                    mid_scale = forgiving_float(match.groups()[2])
                    continue
                except (ValueError, IndexError, AttributeError):
                    # BUG FIX: forgiving_float raises ValueError on garbage
                    # (and groups()[2] can raise IndexError); only
                    # AttributeError was caught before, so a garbled scale
                    # line crashed the decode. Skip and keep scanning.
                    pass

        return groups, mid_scale
Ejemplo n.º 7
0
 def test_forgiving_float_garbage(self):
     """Non-numeric input must raise ValueError."""
     self.assertRaises(ValueError, forgiving_float, "hello")
Ejemplo n.º 8
0
    def _decode_table_columnwise_ocr(ocr_prose):
        """Decode a column-wise OCR'd results table into StataTableResults.

        NOTE(review): the unconditional ``return []`` below disables this
        entire function — everything after it is dead code. This looks
        deliberate (feature switched off or mid-refactor); confirm the
        intent before removing the early return.
        """
        return []

        # --- dead code from here down (see NOTE above) ---
        metadata = []
        titles = []
        lines = [x.strip() for x in ocr_prose.split('\n') if x.strip()]

        for line in lines:

            # Fix some common number replacements in OCR
            line = line.replace('§', '5').replace('$', '5').replace('£', '[-')

            overall_match = OVERALL_LINE_RE.match(line)
            if not overall_match:
                titles.append(line)
            else:
                # Summary rows carry (title, I², p); record the stats
                # separately and stop once the final "Overall" row is seen.
                title, i_squared_str, probability_str = overall_match.groups()
                titles.append(title)
                metadata.append((forgiving_float(i_squared_str),
                                 forgiving_float(probability_str)))

                if title == "Overall":
                    break

        # having got the titles, now try to find the values
        values = ForestPlot._decode_table_values_ocr(ocr_prose)

        # now see if we can extract the weights from the last n lines
        weights = [forgiving_float(x) for x in lines[-1 * len(values):]]

        # A single metadata entry means one plot with only an "Overall" row.
        if len(metadata) == 1:
            if len(titles) != len(values):
                raise ValueError
            return [
                StataTableResults(titles, values, weights, metadata[0][0],
                                  metadata[0][1], 'Overall')
            ]

        # else we assume multichart

        plots = []

        # Repeatedly peel off a leading sub-title plus everything up to the
        # next 'Subtotal'/'Overall' marker as one sub-plot.
        while titles:
            sub_title = titles[0]
            if len(titles) > 1:
                titles = titles[1:]
            index = None
            try:
                index = titles.index('Subtotal')
            except ValueError:
                try:
                    index = titles.index('Overall')
                except ValueError:
                    pass
            if index is None:
                break

            sub_titles = titles[:index + 1]
            sub_values = values[:index + 1]
            sub_weights = weights[:index + 1]

            titles = titles[index + 1:]
            values = values[index + 1:]
            weights = weights[index + 1:]

            # Metadata entries are consumed in order, one per sub-plot.
            plots.append(
                StataTableResults(sub_titles, sub_values, sub_weights,
                                  metadata[0][0], metadata[0][1], sub_title))
            metadata = metadata[1:]

        return plots
Ejemplo n.º 9
0
    def _process_body(self):

        values_collection = self._process_values()
        titles_collection = self._process_titles()

        if not values_collection or not titles_collection:
            raise InvalidForestPlot

        values_count = len(values_collection[next(iter(values_collection))])

        # match the titles and value thresholds. Not sure this is necessary, but for now it simplifies things a little
        for threshold in range(50, 80, 2):
            if threshold in values_collection.keys() and threshold not in titles_collection.keys():
                del(values_collection[threshold])
            if threshold not in values_collection.keys() and threshold in titles_collection.keys():
                del(titles_collection[threshold])

        # work out how many groups we think there are, sanity checking against how many values we have
        group_counts = {threshold: len([x for x in titles_collection[threshold] if isinstance(x, tuple)]) for threshold in titles_collection}

        # further sanity check vs values
        clean_group_counts = {k: group_counts[k] for k in group_counts if len(titles_collection[k]) == values_count + (group_counts[k] - 1)}
        if not clean_group_counts:
            raise InvalidForestPlot
        raw_group_counts = list(clean_group_counts.values())
        most_common_groups = max(set(raw_group_counts), key=raw_group_counts.count)

        for threshold in clean_group_counts:
            if clean_group_counts[threshold] != most_common_groups:
                continue

            values = values_collection[threshold]
            titles = titles_collection[threshold]

            count = 0
            while count < most_common_groups:
                table = self.get_table(count)
                count += 1

                if count != most_common_groups:
                    table.add_title(titles[0])
                    titles = titles[1:]

                sub_titles = []
                sub_values = []

                while titles:
                    title = titles[0]
                    sub_titles.append(title)
                    if len(titles) > 1:
                        titles = titles[1:]
                    sub_values.append(values[0])
                    if len(values) > 1:
                        values = values[1:]

                    if isinstance(title, tuple):
                        break

                overall_title, i_squared_str, probability_str = sub_titles[-1]
                sub_titles[-1] = overall_title

                data = collections.OrderedDict(zip(sub_titles, sub_values))
                flattened_data = [(title, values[0], values[1], values[2], values[3]) for title, values in data.items()]
                table.add_data(flattened_data)
                table.metadata["i^2"] = i_squared_str
                try:
                    table.metadata["i^2"] = forgiving_float(i_squared_str)
                except ValueError:
                    pass
                table.metadata["p"] = probability_str
                try:
                    table.metadata["p"] = forgiving_float(probability_str)
                except ValueError:
                    pass