コード例 #1
0
ファイル: __init__.py プロジェクト: gianlucapuleri/nest
    def _tables_to_pkl(self):
        """
        Build one ``Table`` per table id found in the CEA ground truth and pickle it.

        The CEA ground-truth CSV is always read. The CTA and CPA ground-truth
        CSVs are read only when their files exist; their annotations are
        attached to the tables whose id appears in them. Annotation columns
        ('entities', 'perfect', 'okay', 'properties') hold whitespace-separated
        values and are turned into lists with ``str.split``.

        Each table is written to ``<pickle table folder>/<tab_id>.pkl``.
        """
        cea = pd.read_csv(self._gt_path('CEA'),
                          names=['tab_id', 'col_id', 'row_id', 'entities'],
                          dtype={
                              'tab_id': str,
                              'col_id': int,
                              'row_id': int,
                              'entities': str
                          })
        cea['entities'] = cea['entities'].apply(str.split)

        # Column-type annotations (optional ground-truth file).
        cta_groups = None
        if os.path.exists(self._gt_path('CTA')):
            cta = pd.read_csv(
                self._gt_path('CTA'),
                names=['tab_id', 'col_id', 'perfect', 'okay'],
                dtype={
                    'tab_id': str,
                    'col_id': int,
                    'perfect': str,
                    'okay': str
                },
                keep_default_na=False)  # the "okay" value might be empty
            cta['perfect'] = cta['perfect'].apply(str.split)
            cta['okay'] = cta['okay'].apply(str.split)
            cta_groups = cta.groupby('tab_id')

        # Column-pair property annotations (optional ground-truth file).
        cpa_groups = None
        if os.path.exists(self._gt_path('CPA')):
            cpa = pd.read_csv(
                self._gt_path('CPA'),
                names=['tab_id', 'source_id', 'target_id', 'properties'],
                dtype={
                    'tab_id': str,
                    'source_id': int,
                    'target_id': int,
                    'properties': str
                })
            cpa['properties'] = cpa['properties'].apply(str.split)
            cpa_groups = cpa.groupby('tab_id')

        cea_groups = cea.groupby('tab_id')
        for tab_id, cea_group in cea_groups:
            table = Table(tab_id, self.value, self._table_path(tab_id))
            table.set_gt_cell_annotations(
                zip(cea_group['row_id'], cea_group['col_id'],
                    cea_group['entities']))
            # Explicit None checks: the groupers are optional, and relying on
            # the truthiness of a GroupBy object hides that intent.
            if cta_groups is not None and tab_id in cta_groups.groups:
                cta_group = cta_groups.get_group(tab_id)
                table.set_gt_column_annotations(
                    zip(cta_group['col_id'], cta_group['perfect'],
                        cta_group['okay']))
            if cpa_groups is not None and tab_id in cpa_groups.groups:
                cpa_group = cpa_groups.get_group(tab_id)
                table.set_gt_property_annotations(
                    zip(cpa_group['source_id'], cpa_group['target_id'],
                        cpa_group['properties']))

            # Use a context manager so every pickle file is flushed and closed
            # as soon as it is written, instead of leaking one handle per table.
            pkl_path = f"{self._pickle_table_folder_path()}/{table.tab_id}.pkl"
            with open(pkl_path, 'wb') as pkl_file:
                pickle.dump(table, pkl_file)
コード例 #2
0
ファイル: __init__.py プロジェクト: gianlucapuleri/nest
    def get_test_dataset(cls, size, from_dataset=None, rand=False):
        """
        Assemble a small test dataset on-the-fly from an existing ground truth.

        :param size: number of CEA rows (cells) to keep
        :param from_dataset: dataset whose ground truth is used; ``cls.ST19_Round1`` when omitted
        :param rand: True to draw ``size`` rows at random; otherwise the first ``size`` rows are kept
        :return: the only member of a temporary enum; its value is the list of
                 generated tables, and the enum exposes ``get_tables``,
                 ``get_table_categories`` and ``total_tables``
        """
        source = cls.ST19_Round1 if from_dataset is None else from_dataset

        # NOTE(review): unlike the CTA/CPA columns below, the CEA 'entities'
        # strings are not split into lists here -- confirm this is intended.
        cell_gt = pd.read_csv(
            source._gt_path('CEA'),
            names=['tab_id', 'col_id', 'row_id', 'entities'],
            dtype={'tab_id': str, 'col_id': int, 'row_id': int, 'entities': str})
        cell_gt = cell_gt.sample(size).reset_index() if rand else cell_gt[:size]

        # Column-type ground truth, grouped per table (only if the file exists).
        column_gt_by_table = None
        cta_path = source._gt_path('CTA')
        if os.path.exists(cta_path):
            column_gt = pd.read_csv(
                cta_path,
                names=['tab_id', 'col_id', 'perfect', 'okay'],
                dtype={'tab_id': str, 'col_id': int, 'perfect': str, 'okay': str},
                keep_default_na=False)  # "okay" may be empty; keep it as ''
            for label_col in ('perfect', 'okay'):
                column_gt[label_col] = column_gt[label_col].apply(str.split)
            column_gt_by_table = column_gt.groupby('tab_id')

        # Property ground truth, grouped per table (only if the file exists).
        property_gt_by_table = None
        cpa_path = source._gt_path('CPA')
        if os.path.exists(cpa_path):
            property_gt = pd.read_csv(
                cpa_path,
                names=['tab_id', 'source_id', 'target_id', 'properties'],
                dtype={
                    'tab_id': str,
                    'source_id': int,
                    'target_id': int,
                    'properties': str
                })
            property_gt['properties'] = property_gt['properties'].apply(str.split)
            property_gt_by_table = property_gt.groupby('tab_id')

        tables = []
        for tab_id, cells in cell_gt.groupby('tab_id'):
            table = Table(tab_id, f'{source.value}_test',
                          source._table_path(tab_id))
            table.set_gt_cell_annotations(
                zip(cells['row_id'], cells['col_id'], cells['entities']))

            # Only columns that survived the row selection are annotated.
            kept_cols = cells['col_id'].unique()

            if (column_gt_by_table is not None
                    and tab_id in column_gt_by_table.groups):
                col_rows = column_gt_by_table.get_group(tab_id)
                col_rows = col_rows[col_rows['col_id'].isin(kept_cols)]
                table.set_gt_column_annotations(
                    zip(col_rows['col_id'], col_rows['perfect'],
                        col_rows['okay']))

            if (property_gt_by_table is not None
                    and tab_id in property_gt_by_table.groups):
                prop_rows = property_gt_by_table.get_group(tab_id)
                # Keep a property only when both of its columns were kept.
                source_kept = prop_rows['source_id'].isin(kept_cols)
                target_kept = prop_rows['target_id'].isin(kept_cols)
                prop_rows = prop_rows[source_kept & target_kept]
                table.set_gt_property_annotations(
                    zip(prop_rows['source_id'], prop_rows['target_id'],
                        prop_rows['properties']))

            tables.append(table)

        # Wrap the tables in a throw-away, single-member enum and attach the
        # accessor functions to it.
        member_name = '%s_TEST_%d' % (source.name, size)
        test_enum = Enum('GTTestEnum', {member_name: tables})
        test_enum.get_tables = lambda member: member.value
        test_enum.get_table_categories = (
            lambda member: source.get_table_categories())
        test_enum.total_tables = lambda member: len(tables)
        return next(iter(test_enum))