def update(repos, verbose=True):
    """
    Fill in empty country and ecoregion ids of the 'distribution' CSV data.

    Every ``*.json`` file found below ``external/gbif`` is read as the
    occurrence data for the item whose id equals the file's stem. Only
    ``countries__ids`` / ``ecoregions__ids`` that are empty get computed;
    non-empty values are left untouched. Finally the items, sorted by id,
    are assigned back to ``data.items``.

    :param repos: repository location handed on to `data_file` and `CsvData`.
    :param verbose: if true, the occurrence files are iterated through `tqdm`.
    """
    # Collect (eco_code, shape) pairs, skipping features without geometry and
    # those whose eco_code is listed in INVALID_ECO_CODES.
    features = jsonlib.load(data_file('ecoregions.json', repos=repos))['features']
    ecoregions = []
    for feature in features:
        geometry = feature['geometry']
        if not geometry:
            continue
        eco_code = feature['properties']['eco_code']
        if eco_code in INVALID_ECO_CODES:
            continue
        ecoregions.append((eco_code, shape(geometry)))

    with CsvData('distribution', repos=repos) as data:
        by_id = {entry.id: entry for entry in data.items}

        gbif_dir = data_file('external', 'gbif', repos=repos)
        occurrence_files = list(gbif_dir.glob('*.json'))
        if verbose:  # pragma: no cover
            occurrence_files = tqdm(occurrence_files)

        for path in occurrence_files:
            sid = path.stem
            fallback = Distribution(sid, '', '')
            dist = by_id.get(sid, fallback)

            # Only load the occurrence file when at least one id list is empty.
            if not (dist.countries__ids and dist.ecoregions__ids):
                occurrences = jsonlib.load(path).get('results', [])
                if not dist.ecoregions__ids:
                    dist.ecoregions__ids = list(match(occurrences, ecoregions))
                if not dist.countries__ids:
                    dist.countries__ids = [
                        rec.get('countryCode') for rec in occurrences]
            by_id[sid] = dist

        data.items = [by_id[key] for key in sorted(by_id)]
def upload_images(args):
    """
    tsammalex upload_images path/to/cdstar/catalog

    For each row of the 'staged_images' CSV, the first provider from PROVIDERS
    that contains the item is asked to retrieve it. If an image comes back,
    its CSV row is appended to images.csv and the item is removed from
    staged_images.csv.

    The catalog is opened with credentials from the environment variables
    CDSTAR_URL, CDSTAR_USER and CDSTAR_PWD.
    """
    repos = args.tsammalex_data
    images_path = data_file('images.csv', repos=repos)
    staged_images_path = data_file('staged_images.csv', repos=repos)
    checksums = {row.id for row in models.CsvData('images', repos=repos)}
    providers = [provider_cls(repos) for provider_cls in PROVIDERS]

    with MediaCatalog('cdstar.json', repos=repos, json_opts={'indent': 4}) as mcat, \
            Catalog(
                args.args[0],
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
        for item in models.CsvData('staged_images', repos=repos):
            # Only the first provider that claims the item is consulted.
            provider = next((p for p in providers if item in p), None)
            if provider is None:
                continue

            img = provider.retrieve(item, cat, checksums, mcat)
            if not img:
                continue

            try:
                add_rows(images_path, img.csv_row())
            except:
                # Show the offending image for debugging, then propagate.
                print(img)
                raise
            filter_rows(staged_images_path, lambda d: d['id'] != item.id)
def upload_images(args):
    """
    tsammalex upload_images path/to/cdstar/catalog

    Walks the 'staged_images' CSV; the first provider in PROVIDERS containing
    an item retrieves it. A retrieved image is added as a row to images.csv
    and the corresponding row is filtered out of staged_images.csv.

    Requires the environment variables CDSTAR_URL, CDSTAR_USER and CDSTAR_PWD.
    """
    data_repos = args.tsammalex_data
    images_path = data_file('images.csv', repos=data_repos)
    staged_images_path = data_file('staged_images.csv', repos=data_repos)

    checksums = set()
    for known in models.CsvData('images', repos=data_repos):
        checksums.add(known.id)

    providers = [make_provider(data_repos) for make_provider in PROVIDERS]

    media_catalog = MediaCatalog(
        'cdstar.json', repos=data_repos, json_opts=dict(indent=4))
    with media_catalog as mcat:
        object_catalog = Catalog(
            args.args[0],
            cdstar_url=os.environ['CDSTAR_URL'],
            cdstar_user=os.environ['CDSTAR_USER'],
            cdstar_pwd=os.environ['CDSTAR_PWD'])
        with object_catalog as cat:
            for item in models.CsvData('staged_images', repos=data_repos):
                for provider in providers:
                    if item not in provider:
                        continue

                    img = provider.retrieve(item, cat, checksums, mcat)
                    if img:
                        try:
                            add_rows(images_path, img.csv_row())
                        except:
                            # Print the image that failed before re-raising.
                            print(img)
                            raise
                        filter_rows(
                            staged_images_path, lambda d: d['id'] != item.id)
                    # Stop at the first provider that claims the item,
                    # whether or not an image was retrieved.
                    break
def test():
    """
    Run referential integrity checks on the repository data.

    Does nothing if REPOS does not exist. Otherwise the ids of all CSV
    tables, the ecoregion codes from ecoregions.json, the ids matched by
    BIB_ID_PATTERN in sources.bib and the alpha2 codes of `countries` are
    collected, and every ``<table>__id`` / ``<table>__ids`` attribute is
    checked against them. Problems are reported through `error`.

    :raises ValueError: if a column refers to an unknown table, or if SUCCESS \
    is false once all checks have run.
    """
    if not REPOS.exists():
        return

    # Lookup tables: table name -> mapping keyed by valid id.
    data = {}
    for table in CSV:
        data[table] = OrderedDict(
            (item.id, item) for item in models.CsvData(table, on_error=error))

    features = jsonlib.load(data_file('ecoregions.json'))['features']
    data['ecoregions'] = {
        feature['properties']['eco_code']: feature for feature in features}

    data['refs'] = {}
    with data_file('sources.bib').open(encoding='utf8') as fp:
        for bib_line in fp:
            found = BIB_ID_PATTERN.match(bib_line.strip())
            if found:
                data['refs'][found.group('id')] = 1

    data['countries'] = {c.alpha2: c for c in countries}

    # References may carry a page spec in brackets: "<source_id>[<pages>]".
    # Line numbers start at 2 (presumably accounting for a CSV header row).
    for name in ('names', 'taxa'):
        for lineno, item in enumerate(data[name].values(), 2):
            for ref in item.refs__ids:
                source_id, bracket, pages = ref.partition('[')
                if bracket and not pages.endswith(']'):  # pragma: no cover
                    error('invalid reference %s' % (ref, ), name, lineno)
                if source_id not in data['refs']:  # pragma: no cover
                    error('invalid id referenced: %s' % (source_id, ), name, lineno)

    tables = [(n, getattr(models, n.capitalize())) for n in CSV]
    for name, model in tables:
        for lineno, item in enumerate(data[name].values(), 2):
            for field in attr.fields(model):
                col = field.name
                if '__' not in col:
                    continue

                ref, cardinality = col.split('__', 1)
                ids = getattr(item, col)
                if cardinality == 'id':
                    # A single-valued reference: normalise to a list.
                    assert not isinstance(ids, list)
                    ids = [ids]

                for v in ids:
                    if ref not in data:
                        raise ValueError(ref)  # pragma: no cover
                    if ref == 'refs' and '[' in v:
                        v = v.split('[')[0]
                    if v not in data[ref]:  # pragma: no cover
                        error('invalid %s id referenced: %s' % (ref, v), name, lineno)

    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')
def test():
    """
    Check that all cross references in the repository data can be resolved.

    Returns immediately if REPOS does not exist. Valid ids are gathered from
    the CSV tables listed in CSV, from ecoregions.json, from sources.bib
    (lines matching BIB_ID_PATTERN) and from `countries`; each referencing
    attribute (named ``<table>__id`` or ``<table>__ids``) is then validated.
    Violations are passed to `error`.

    :raises ValueError: for a reference to an unknown table, or when SUCCESS \
    is false after the checks.
    """
    if not REPOS.exists():
        return

    data = {
        csv_name: OrderedDict(
            [(row.id, row) for row in models.CsvData(csv_name, on_error=error)])
        for csv_name in CSV}

    eco_lookup = data['ecoregions'] = {}
    for feature in jsonlib.load(data_file('ecoregions.json'))['features']:
        eco_lookup[feature['properties']['eco_code']] = feature

    bib_ids = data['refs'] = {}
    with data_file('sources.bib').open(encoding='utf8') as bibfile:
        for raw in bibfile:
            hit = BIB_ID_PATTERN.match(raw.strip())
            if hit:
                bib_ids[hit.group('id')] = 1

    data['countries'] = {entry.alpha2: entry for entry in countries}

    # Pass 1: well-formedness and existence of source references.
    for name in ['names', 'taxa']:
        for index, item in enumerate(data[name].values()):
            lineno = index + 2
            for ref in item.refs__ids:
                source_id = ref
                if '[' in ref:
                    source_id, pages = ref.split('[', 1)
                    if not pages.endswith(']'):  # pragma: no cover
                        error('invalid reference %s' % (ref,), name, lineno)
                if source_id not in bib_ids:  # pragma: no cover
                    error('invalid id referenced: %s' % (source_id,), name, lineno)

    # Pass 2: every "<table>__id(s)" attribute must point at known ids.
    model_for = [(n, getattr(models, n.capitalize())) for n in CSV]
    for name, model in model_for:
        for index, item in enumerate(data[name].values()):
            lineno = index + 2
            column_names = [f.name for f in attr.fields(model)]
            for col in column_names:
                if '__' not in col:
                    continue
                ref, cardinality = col.split('__', 1)
                ids = getattr(item, col)
                if cardinality == 'id':
                    assert not isinstance(ids, list)
                    ids = [ids]
                for v in ids:
                    if ref not in data:
                        raise ValueError(ref)  # pragma: no cover
                    if ref == 'refs' and '[' in v:
                        # Strip a trailing "[pages]" spec before the lookup.
                        v = v.partition('[')[0]
                    if v not in data[ref]:  # pragma: no cover
                        error(
                            'invalid %s id referenced: %s' % (ref, v),
                            name,
                            lineno)

    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')
def test_JsonData(self):
    # A value stored inside a JsonData `with` block must end up in a file in
    # the repository and be available again when the same file is reopened.
    from pytsammalex.util import JsonData, data_file

    repos_dir = create_repos(self.tmp_path())
    json_name = 'test.json'

    with JsonData(json_name, repos=repos_dir) as written:
        written['a'] = 1

    json_path = data_file(json_name, repos=repos_dir)
    self.assertTrue(json_path.exists())

    with JsonData(json_name, repos=repos_dir) as reloaded:
        self.assertEqual(len(reloaded), 1)
        self.assertEqual(reloaded['a'], 1)
def test_json_data(tmpdir):
    """A value set via JsonData is stored in a file and read back on reopen."""
    repos = Path(create_repos(tmpdir))
    json_name = 'test.json'

    with JsonData(json_name, repos=repos) as written:
        written['a'] = 1

    assert data_file(json_name, repos=repos).exists() is True

    with JsonData(json_name, repos=repos) as reloaded:
        assert len(reloaded) == 1
        assert reloaded['a'] == 1
def cached_metadata(self, sid, id=None, name=None, refresh=False):
    """
    Return the metadata for `sid`, fetching it only when it is not cached.

    If the directory ``external/<self.name>`` exists in the repository, the
    cache is the file ``<sid>.json`` inside it; otherwise the in-memory
    mapping ``self.items`` serves as cache.

    :param sid: key under which the metadata is cached.
    :param id: identifier passed to `self.metadata`; when falsy, the result \
    of `self.identify(name)` is used instead.
    :param name: name resolved through `self.identify` when no `id` is given.
    :param refresh: if true, fetch again even if a cached value exists.
    :return: the metadata, or `None` if fetching raised an exception (for \
    the file cache also if the fetch returned nothing).
    """
    if data_file('external', self.name, repos=self.repos).is_dir():
        fname = data_file('external', self.name, sid + '.json', repos=self.repos)
        if not fname.exists() or refresh:
            try:
                data = self.metadata(id or self.identify(name))
            # Best-effort fetch: ordinary errors mean "no data". Catching
            # Exception (not everything) lets KeyboardInterrupt/SystemExit
            # propagate, so a long-running fetch loop can be aborted.
            except Exception:  # pragma: no cover
                data = None
            if not data:
                return  # pragma: no cover
            jsonlib.dump(data, fname)
            return data
        return jsonlib.load(fname)

    if sid not in self.items or refresh:
        try:
            self.items[sid] = self.metadata(id or self.identify(name))
        except Exception:
            # Same best-effort policy as for the file cache above.
            return
    return self.items[sid]
def update(repos, log):
    """
    Fill in empty country and ecoregion ids of the 'distribution' CSV data.

    Each ``*.json`` file below ``external/gbif`` is read as the occurrence
    data of the item whose id is the file's stem. Only empty
    ``countries__ids`` / ``ecoregions__ids`` are computed. Afterwards the
    items, sorted by id, are assigned back to ``data.items``.

    :param repos: repository location handed on to `data_file` and `CsvData`.
    :param log: passed through to `match`.
    """
    # (eco_code, shape) pairs for all features that have a geometry and
    # whose eco_code is not listed in INVALID_ECO_CODES.
    features = jsonlib.load(data_file('ecoregions.json', repos=repos))['features']
    ecoregions = []
    for feature in features:
        geometry = feature['geometry']
        if not geometry:
            continue
        eco_code = feature['properties']['eco_code']
        if eco_code in INVALID_ECO_CODES:
            continue
        ecoregions.append((eco_code, shape(geometry)))

    with CsvData('distribution', repos=repos) as data:
        by_id = {entry.id: entry for entry in data.items}

        gbif_dir = data_file('external', 'gbif', repos=repos)
        occurrence_files = list(gbif_dir.glob('*.json'))

        for path in tqdm(occurrence_files):
            sid = path.stem
            fallback = Distribution(sid, '', '')
            dist = by_id.get(sid, fallback)

            # Skip reading the file when both id lists are already filled.
            if not (dist.countries__ids and dist.ecoregions__ids):
                occurrences = jsonlib.load(path).get('results', [])
                if not dist.ecoregions__ids:
                    dist.ecoregions__ids = list(
                        match(occurrences, ecoregions, log))
                if not dist.countries__ids:
                    dist.countries__ids = [
                        rec.get('countryCode') for rec in occurrences]
            by_id[sid] = dist

        data.items = [by_id[key] for key in sorted(by_id)]
def identify(self, item):
    """
    Return the ``staged_images/<item.id>`` location in the repository if it
    exists, else `None`.
    """
    staged = data_file('staged_images', item.id, repos=self.repos)
    return staged if staged.exists() else None