Example #1
0
 def assert_sorted(self, entity_types: List[Type[CoreEntity]]):
     """Validates the ordering of the |entity_types| list.

     Every adjacent pair of types is checked with is_higher_ranked; the
     first pair for which that check fails causes a ValueError naming both
     types. Returns nothing when all pairs pass.
     """
     for earlier, later in pairwise(entity_types):
         if self.is_higher_ranked(earlier, later):
             continue
         # The earlier type is not ranked above the one that follows it.
         raise ValueError(
             f'Unexpected ordering, found {earlier.__name__} before '
             f'{later.__name__}')
def _split_df(df: pd.DataFrame, indices: List[int]) -> List[pd.DataFrame]:
    """
    Split the fully parsed DataFrame into new DataFrames based on |indices|.

    Each consecutive (start, end) pair produced from |indices| contributes the
    slice df[start:end]. The slice from the last pair's end boundary to the
    end of |df| is appended as the final DataFrame. Every returned DataFrame
    has its index reset.

    If no pairs are produced, an empty list is returned (presumably this is
    the case when |indices| has fewer than two entries; it depends on
    aggregate_ingest_utils.pairwise).
    """
    last_boundary = None

    dfs = []
    for start, end in aggregate_ingest_utils.pairwise(indices):
        dfs.append(df[start:end].reset_index(drop=True))
        last_boundary = end

    # Compare against None explicitly: a boundary of 0 is a valid index but
    # is falsy, so a plain truthiness check would drop the trailing frame.
    if last_boundary is not None:
        dfs.append(df[last_boundary:].reset_index(drop=True))

    return dfs
Example #3
0
def _split_page(df: pd.DataFrame) -> Generator[pd.DataFrame, None, None]:
    """Create a new DataFrame for each facility listed on a page.

    Rows that are entirely empty are dropped. Rows whose last column is null
    are shifted one column to the right, then stacked below the rows whose
    last column is populated. The combined frame is cut at every row with a
    non-null 'FACILITY' value, and each resulting slice is yielded in order.
    """
    df = df.dropna(how='all')

    # bottom_df is parsed offset by one column and needs to be shifted
    last_column = df[df.columns[-1]]
    top_df = df[last_column.notnull()]
    bottom_df = df[last_column.isnull()].shift(1, axis='columns')

    # Recombine top_df and bottom_df since it's not the correct table division
    aligned_df = pd.concat([top_df, bottom_df], ignore_index=True)

    # New table starts when a new facility is listed
    table_starts = np.where(aligned_df['FACILITY'].notnull())[0]
    # Add the total row count as a closing boundary so the last facility's
    # table is emitted too. Uses the same `np` alias as the line above.
    table_starts_and_end = np.append(table_starts, len(aligned_df))

    for start, end in aggregate_ingest_utils.pairwise(table_starts_and_end):
        yield aligned_df[start:end]