def test_prune(self):
        """Check IngestInfo.prune() against a hand-built expected tree.

        The input holds one empty person and one person with two bookings:
        an empty booking and a booking carrying an arrest, three charges and
        two holds. The expected tree keeps a single person with a single
        booking, one charge (the one whose sentence sets is_life) without its
        bond, and both holds with jurisdiction_name set to 'UNSPECIFIED'.
        """
        # Input: charges range from completely empty to one with real data.
        input_charges = [
            ingest_info.Charge(),
            ingest_info.Charge(bond=ingest_info.Bond(),
                               sentence=ingest_info.Sentence()),
            ingest_info.Charge(bond=ingest_info.Bond(),
                               sentence=ingest_info.Sentence(
                                   is_life='False')),
        ]
        input_holds = [ingest_info.Hold(), ingest_info.Hold(hold_id=1)]
        populated_booking = ingest_info.Booking(
            arrest=ingest_info.Arrest(),
            charges=input_charges,
            holds=input_holds)
        ii = IngestInfo(people=[
            ingest_info.Person(),
            ingest_info.Person(bookings=[
                ingest_info.Booking(),
                populated_booking,
            ]),
        ])

        # Expected: only the pieces listed in the docstring remain.
        expected_charge = ingest_info.Charge(
            sentence=ingest_info.Sentence(is_life='False'))
        expected_holds = [
            ingest_info.Hold(jurisdiction_name='UNSPECIFIED'),
            ingest_info.Hold(hold_id=1, jurisdiction_name='UNSPECIFIED'),
        ]
        expected_booking = ingest_info.Booking(
            charges=[expected_charge],
            holds=expected_holds)
        expected = IngestInfo(people=[
            ingest_info.Person(bookings=[expected_booking]),
        ])

        self.assertEqual(ii.prune(), expected)
Ejemplo n.º 2
0
    def extract_and_populate_data(
            self,
            content: Union[Dict, List],
            ingest_info: Union[IngestInfo, None] = None) -> IngestInfo:
        """Populate an ingest data model from already parsed JSON content.

        This function does all the work of taking the user's yaml file and
        content and returning a populated data class. It iterates through
        every field in the object and builds a model based on the keys that
        it sees; the traversal itself is delegated to ``self._extract``.

        Args:
            content: An already parsed JSON object or array.
            ingest_info: An IngestInfo object to fill in. If None, a new
                IngestInfo is created.

        Returns:
            The result of calling ``prune()`` on the populated ingest data
            model for a scrape.
        """
        if ingest_info is None:
            ingest_info = IngestInfo()
        # A fresh defaultdict(set) is handed to _extract on every call so no
        # bookkeeping leaks between calls. NOTE(review): its exact key/value
        # meaning is defined by _extract, which is not visible here.
        self._extract(content, ingest_info, defaultdict(set))
        return ingest_info.prune()
    def extract_and_populate_data(self,
                                  content: Union[str, Iterable[str]],
                                  ingest_info: IngestInfo = None) -> IngestInfo:
        """Build a populated ingest data model from CSV-formatted text.

        Combines the user's yaml file with the given content: every field is
        visited by ``self._extract`` and the model is built from the keys it
        sees. The file post hooks are then run on the model before it is
        pruned and returned.

        Args:
            content: CSV-formatted text. Either a single string holding the
                full file contents, or an iterable in which each element is
                one line of the contents. Not a file object.
            ingest_info: The IngestInfo object to populate. When None, a new
                IngestInfo is created.

        Returns:
            A populated ingest data model for a scrape.
        """
        target = IngestInfo() if ingest_info is None else ingest_info
        self._extract(content, target)
        self._run_file_post_hooks(target)
        return target.prune()
Ejemplo n.º 4
0
    def extract_and_populate_data(
        self,
        content: HtmlElement,
        ingest_info: Optional[IngestInfo] = None,
        search_for_keys: bool = True,
    ) -> IngestInfo:
        """Populate an ingest data model from an already parsed HTML page.

        This function does all the work of taking the user's yaml file and
        content and returning a populated data class. It iterates through
        every cell on the page and builds a model based on the keys that it
        sees. All processing happens on a deep copy of ``content``.

        Args:
            content: An already parsed html data structure.
            ingest_info: An IngestInfo object to use. If None, a new one is
                created.
            search_for_keys: Flag to allow searching for keys outside of
                table cells (<td> and <tr> elements).

        Returns:
            The result of calling ``prune()`` on the populated ingest data
            model for a scrape.
        """
        content_copy = copy.deepcopy(content)
        HtmlDataExtractor._process_html(content_copy)
        self._set_all_cells(content_copy, search_for_keys)
        if ingest_info is None:
            ingest_info = IngestInfo()
        seen_map: Dict[int, Set[str]] = defaultdict(set)

        # Keys that have not been matched on the page yet. By the end of this
        # function the set should be empty; anything left over is reported in
        # a debug log message (no exception is raised).
        needed_keys = set(self.keys) | set(self.multi_keys)

        for cell in self.cells:
            # This is a tiny hack to avoid an O(n) search over the keys list
            # for every cell: the normalized cell text is used directly as
            # the lookup key.
            # An alternative approach is to force the user to give the exact
            # key, including a trailing colon, in the yaml file, but that
            # might be confusing. We could also preprocess the keys mapping
            # so that several spellings ('hi' and 'hi:') map to the same
            # value, but that is a more expensive preprocessing calculation.
            cell_val = self._normalize_cell(cell)
            lookup_keys = (self.keys.get(cell_val)
                           or self.multi_keys.get(cell_val))
            if not lookup_keys:
                # Users can specify a key with no value associated and then
                # use |get_value()| later. We shouldn't warn, even though we
                # won't find values for these keys. discard() is a no-op for
                # cells that are not configured keys at all.
                needed_keys.discard(cell_val)
                continue

            values: List[Optional[str]] = []
            if cell_val in self.keys:
                # Single-value key: exactly one value is read for the cell.
                values = [self._get_value_cell(cell)]
            elif cell_val in self.multi_keys:
                # Multi-value key: the values are read from below the cell.
                values = self._get_values_below_cell(cell)
            if values:
                self._set_or_create_object(ingest_info, lookup_keys, values,
                                           seen_map)
                needed_keys.discard(cell_val)

        # If at the end of everything there are some keys we haven't found on
        # the page we should complain.
        if needed_keys:
            logging.debug("The following keys could not be found: %s",
                          needed_keys)
        return ingest_info.prune()