Esempio n. 1
0
def _ses2pandas(ses, dtypes=None):
    """
    Flatten the datasets of one session record into a table, one row per dataset.

    Datasets without a truthy 'url' are dropped, the remainder is optionally
    restricted to the requested dataset types, and a set of session-level
    fields is handed to ``parquet.rec2col`` through its ``join`` argument.

    :param ses: session dictionary from rest endpoint; the keys read here are
     'data_dataset_session_related', 'subject', 'lab', 'url', 'start_time',
     'number' and 'task_protocol'
    :param dtypes: list of dataset types to keep, or a single dataset type
     given as a string. None, '__all__' and ['__all__'] all disable the filter
    :return: the output of ``parquet.rec2col(...).to_df()`` (presumably a
     pandas DataFrame - confirm against parquet.rec2col)
    """
    # selection: get relevant dtypes only if there is an url associated
    rec = [dset for dset in ses['data_dataset_session_related'] if dset['url']]
    if dtypes == ['__all__'] or dtypes == '__all__':
        dtypes = None
    elif isinstance(dtypes, str):
        # a bare string would turn the `in` test below into a substring match
        # (e.g. 'spikes' in 'spikes.times'), so treat it as a one-element list
        dtypes = [dtypes]
    if dtypes is not None:
        rec = [dset for dset in rec if dset['dataset_type'] in dtypes]
    include = ['id', 'hash', 'dataset_type', 'name', 'file_size', 'collection']
    uuid_fields = ['id', 'eid']
    join = {
        'subject': ses['subject'],
        'lab': ses['lab'],
        # last 36 characters of the session url: the length of a canonical
        # uuid string, presumably the session id - verify against the endpoint
        'eid': ses['url'][-36:],
        'start_time': np.datetime64(ses['start_time']),
        'number': ses['number'],
        'task_protocol': ses['task_protocol'],
    }
    col = parquet.rec2col(rec,
                          include=include,
                          uuid_fields=uuid_fields,
                          join=join,
                          types={
                              'file_size': np.double
                          }).to_df()
    return col
Esempio n. 2
0
    def test_rec2col(self):
        """
        Exercise rec2col on the 'parquet_records.json' fixture.

        Covers a list of records with include / join / uuid_fields, a single
        record passed as a dictionary, an empty list, and the coercion of
        None values to NaN when a field is typed as np.double.
        """
        json_fixture = Path(__file__).parent.joinpath('fixtures',
                                                      'parquet_records.json')
        with open(json_fixture, 'r') as fid:
            datasets = json.load(fid)
        # test with includes / joins and uuid fields in both join and includes
        include = [
            'id', 'hash', 'dataset_type', 'name', 'file_size', 'collection'
        ]
        uuid_fields = ['id', 'eid']
        join = {
            'subject': 'Bernard',
            'lab': 'thelab',
            'eid': '150f92bc-e755-4f54-96c1-84e1eaf832b4'
        }
        arr = rec2col(datasets,
                      include=include,
                      uuid_fields=uuid_fields,
                      join=join)
        # every column is expected to hold 5 values
        # (presumably one per record in the fixture)
        self.assertTrue(np.all(np.array([arr[k].size for k in arr]) == 5))
        # NOTE(review): the expected count adds len(uuid_fields) on top of the
        # include and join keys - presumably each uuid field yields one extra
        # column; confirm against rec2col
        self.assertEqual(
            len(arr.keys()),
            len(include) + len(uuid_fields) + len(join))
        # test single dictionary
        arr_single = rec2col(datasets[0],
                             include=include,
                             uuid_fields=uuid_fields,
                             join=join)
        self.assertTrue(np.all(arr.to_df().iloc[0] == arr_single.to_df()))
        # test empty
        arr_empty = rec2col([],
                            include=include,
                            uuid_fields=uuid_fields,
                            join=join)
        self.assertEqual(arr_empty.to_df().size, 0)

        # the empty float fields should be serialized as NaNs when coerced into double
        for ds in datasets:
            ds['float_field'] = None
        arr = rec2col(datasets,
                      uuid_fields=uuid_fields,
                      join=join,
                      types={'float_field': np.double})
        self.assertTrue(np.all(np.isnan(arr['float_field'])))