def get_objects(self, uri, _oid=None, _start=None, _end=None,
                load_kwargs=None, **kwargs):
    '''
    Load and transform csv data into a list of dictionaries.
    Each row in the csv will result in one dictionary in the list.

    :param uri: uri (file://, http(s)://) of csv file to load
    :param _oid: column or func to apply to map _oid in all resulting objects
    :param _start: column or func to apply to map _start in all resulting objects
    :param _end: column or func to apply to map _end in all resulting objects
    :param load_kwargs: kwargs to pass to the load() helper
    :param kwargs: kwargs passed through to the parent class get_objects()

    _start and _oid arguments can be a column name or a function
    which accepts a single argument -- the row being extracted. If
    either is a column name (string) then that column will be applied
    as _oid for each object generated. If either is a function, the
    function will be applied per each row and the result of the
    function will be assigned to the _start or _oid, respectively.
    '''
    load_kwargs = load_kwargs or {}
    objects = load(path=uri, filetype='csv', **load_kwargs)
    k = itertools.count(1)
    now = utcnow()
    # FIX: use the builtin next() rather than the Python2-only .next()
    # iterator method; next() works on Python 2.6+ and Python 3.
    __oid = lambda o: next(k)
    _oid = _oid or __oid
    _start = _start or now
    _end = _end or None

    def is_callable(v):
        # FIX: builtin callable() covers both classes and objects
        # defining __call__, which is what the original hand-rolled
        # `type(v) is type or hasattr(v, '__call__')` check intended.
        return callable(v)

    for obj in objects:
        obj['_oid'] = _oid(obj) if is_callable(_oid) else _oid
        obj['_start'] = _start(obj) if is_callable(_start) else _start
        obj['_end'] = _end(obj) if is_callable(_end) else _end
        self.container.add(obj)
    return super(Rows, self).get_objects(**kwargs)
def test_load_json(): ''' ''' from metrique import pyclient from metrique.utils import load name = 'meps' db_file = os.path.join(cache_dir, '%s.sqlite' % name) remove_file(db_file) def _oid_func(o): o['_oid'] = o['id'] return o m = pyclient(name=name) m.objects.drop() path = os.path.join(fixtures, 'meps.json') objects = load(path, _oid=_oid_func, orient='index') assert len(objects) == 736 m.objects.extend(objects) assert len(m.objects) # {u'phone_stb': u'+33 (0)3 88 1 75224', u'comms': None, u'country': # u'Latvia', u'_start': ... # u'_oid': 28615, u'name': u'Roberts Z\u012aLE', u'url': # u'http://www.europarl.euro...rs/expert/committees/view.do?id=28615', # u'_v': 0, u'phone_bxl': u'+32 (0)2 28 45224', u'_end': None, u'_hash': # u'e8d2a6943734a80f268d112514040b4707915181', u'__v__': u'0.3.1-1a', # u'party': u'European Conservatives and Reformists', u'_e': {}, u'_id': # u'28615', u'email': None} _hash = 'e8d2a6943734a80f268d112514040b4707915181' _filtered = m.objects.filter(where={'_oid': 28615}) assert len(_filtered) == 1 print 'Object: %s' % _filtered assert _filtered[0]['_hash'] == _hash _ids = m.objects.flush() assert sorted(_ids) == sorted(map(unicode, [o['_oid'] for o in objects])) assert m.objects == {} remove_file(db_file)
def test_load_json(): """ """ from metrique import pyclient from metrique.utils import load name = "meps" db_file = os.path.join(cache_dir, "%s.sqlite" % name) remove_file(db_file) def _oid_func(o): o["_oid"] = o["id"] return o m = pyclient(name=name) m.objects.drop() path = os.path.join(fixtures, "meps.json") objects = load(path, _oid=_oid_func, orient="index") assert len(objects) == 736 m.objects.extend(objects) assert len(m.objects) # {u'phone_stb': u'+33 (0)3 88 1 75224', u'comms': None, u'country': # u'Latvia', u'_start': ... # u'_oid': 28615, u'name': u'Roberts Z\u012aLE', u'url': # u'http://www.europarl.euro...rs/expert/committees/view.do?id=28615', # u'_v': 0, u'phone_bxl': u'+32 (0)2 28 45224', u'_end': None, u'_hash': # u'e8d2a6943734a80f268d112514040b4707915181', u'__v__': u'0.3.1-1a', # u'party': u'European Conservatives and Reformists', u'_e': {}, u'_id': # u'28615', u'email': None} _hash = "e8d2a6943734a80f268d112514040b4707915181" _filtered = m.objects.filter(where={"_oid": 28615}) assert len(_filtered) == 1 print "Object: %s" % _filtered assert _filtered[0]["_hash"] == _hash _ids = m.objects.flush() assert sorted(_ids) == sorted(map(unicode, [o["_oid"] for o in objects])) assert m.objects == {} remove_file(db_file)
def load(*args, **kwargs):
    '''
    Wrapper for metrique.utils.load, the automated data loader.

    All positional and keyword arguments are passed through unchanged.

    FIX: the original body called load() recursively -- this wrapper
    shadows the utility function it intended to delegate to, so any
    call recursed until RuntimeError (maximum recursion depth).
    Delegate explicitly to metrique.utils.load under an alias instead.
    '''
    from metrique.utils import load as _load
    return _load(*args, **kwargs)
def test_load():
    '''Exercise metrique.utils.load across csv globs, oid mapping and errors.'''
    from metrique.utils import load
    path_glob = os.path.join(fixtures, 'test*.csv')

    # basic pandas-backed load: two rows, known columns/values
    x = load(path_glob, use_pandas=True)
    assert len(x) == 2
    assert 'col_1' in x[0].keys()
    assert 1 in x[0].values()
    assert 100 in x[1].values()

    # _oid=True auto-assigns sequential oids
    x = load(path_glob, _oid=True)
    assert '_oid' in x[0].keys()
    assert x[0]['_oid'] == 1
    assert x[1]['_oid'] == 2

    # a non-callable, non-True _oid must be rejected
    try:
        set_oid_func = 'i am a string, not a func'
        x = load(path_glob, _oid=set_oid_func)
    except TypeError:
        pass
    else:
        assert False

    # a callable _oid is applied per row
    set_oid_func = lambda o: dict(_oid=42, **o)
    x = load(path_glob, _oid=set_oid_func)
    assert x[0]['_oid'] == 42
    assert x[1]['_oid'] == 42

    # check that we can get a dataframe
    x = load(path_glob, as_df=True)
    assert hasattr(x, 'ix')

    # passing in a dataframe should return back the same dataframe...
    _x = load(x)
    assert _x is x

    try:
        # can load only files or dataframes
        load(1)
    except ValueError:
        pass
    else:
        assert False, "Loaded 1"

    empty = os.path.join(fixtures, 'empty.csv')
    try:
        load(empty, header=None, use_pandas=True)
    except ValueError:
        pass
    else:
        assert False

    try:
        load(empty, header=None, use_pandas=False)
    except RuntimeError:
        pass
    else:
        assert False

    header = os.path.join(fixtures, 'header_only.csv')
    try:
        load(header)
    except RuntimeError:
        pass
    else:
        # FIX: this negative test used to pass silently when no exception
        # was raised; fail explicitly, consistent with the other try blocks.
        assert False, "Loaded header-only csv without error"

    try:
        load('DOES_NOT_EXIST')
    except IOError:
        pass
    else:
        assert False, "Loaded DOES_NOT_EXIST"

    # check that we can grab data from the web
    # NOTE(review): network-dependent assertion; this will fail offline or
    # if the external service changes -- consider mocking or marking it.
    uri = 'https://mysafeinfo.com/api/data?list=days&format=csv'
    x = list(load(uri, filetype='csv'))
    assert len(x) == 7

    # smoke check: a default (non-pandas) load of the glob succeeds
    x = load(path_glob)