def upload_images(args):
    """
    tsammalex upload_images path/to/cdstar/catalog

    Upload all staged images (staged_images.csv) to the CDSTAR media
    catalog, append metadata rows for successfully uploaded images to
    images.csv, and remove the corresponding rows from staged_images.csv.

    CDSTAR credentials are read from the environment variables
    CDSTAR_URL, CDSTAR_USER and CDSTAR_PWD; a KeyError is raised if any
    of them is missing.
    """
    images_path = data_file('images.csv', repos=args.tsammalex_data)
    staged_images_path = data_file('staged_images.csv', repos=args.tsammalex_data)
    # Checksums of images already uploaded, so providers can skip duplicates.
    checksums = set(
        d.id for d in models.CsvData('images', repos=args.tsammalex_data))
    providers = [prov(args.tsammalex_data) for prov in PROVIDERS]
    with MediaCatalog(
            'cdstar.json', repos=args.tsammalex_data, json_opts=dict(indent=4)) as mcat:
        with Catalog(
                args.args[0],
                cdstar_url=os.environ['CDSTAR_URL'],
                cdstar_user=os.environ['CDSTAR_USER'],
                cdstar_pwd=os.environ['CDSTAR_PWD']) as cat:
            for item in models.CsvData('staged_images', repos=args.tsammalex_data):
                # The first provider claiming the item handles it.
                for provider in providers:
                    if item in provider:
                        img = provider.retrieve(item, cat, checksums, mcat)
                        if img:
                            try:
                                add_rows(images_path, img.csv_row())
                            # Was a bare `except:`, which also swallows
                            # SystemExit/KeyboardInterrupt before printing.
                            except Exception:
                                # Log the offending image before re-raising,
                                # to aid debugging of malformed rows.
                                print(img)
                                raise
                            # Only de-stage the item once its row was
                            # successfully added to images.csv.
                            filter_rows(staged_images_path, lambda d: d['id'] != item.id)
                        break
def test():
    """
    Run referential-integrity checks over the tsammalex data repository.

    Loads all CSV collections plus ecoregions, bibliography ids and
    countries into one lookup dict, then verifies that every id
    referenced from another table (columns named ``<table>__id`` /
    ``<table>__ids``) actually exists. Problems are reported via
    ``error(...)``, which presumably flips the module-level ``SUCCESS``
    flag checked at the end — TODO confirm against ``error``'s definition.
    """
    # Nothing to check when the data repository is not checked out.
    if not REPOS.exists():
        return
    # One OrderedDict per CSV collection, keyed by item id.
    data = {
        n: OrderedDict([(item.id, item)
                        for item in models.CsvData(n, on_error=error)])
        for n in CSV
    }
    # Ecoregions come from GeoJSON, keyed by WWF eco_code.
    data['ecoregions'] = {}
    for ecoregion in jsonlib.load(data_file('ecoregions.json'))['features']:
        data['ecoregions'][ecoregion['properties']['eco_code']] = ecoregion
    # Valid bibliography keys, scraped from sources.bib entry headers.
    data['refs'] = {}
    with data_file('sources.bib').open(encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                data['refs'][match.group('id')] = 1
    data['countries'] = {country.alpha2: country for country in countries}
    # Pass 1: check literal source references (optionally with "[pages]"
    # suffix) in the refs__ids columns of names and taxa.
    for name in ['names', 'taxa']:
        for line, item in enumerate(data[name].values()):
            for ref in item.refs__ids:
                if '[' in ref:
                    source_id, pages = ref.split('[', 1)
                    if not pages.endswith(']'):  # pragma: no cover
                        # line + 2: 1-based line numbers plus header row.
                        error('invalid reference %s' % (ref, ), name, line + 2)
                else:
                    source_id = ref
                if source_id not in data['refs']:  # pragma: no cover
                    error('invalid id referenced: %s' % (source_id, ), name, line + 2)
    # Pass 2: for every model, check all foreign-key-like columns
    # ("<ref>__id" holds one id, "<ref>__ids" holds a list).
    for name, model in [(n, getattr(models, n.capitalize())) for n in CSV]:
        for line, item in enumerate(data[name].values()):
            for col in [f.name for f in attr.fields(model)]:
                if '__' in col:
                    ref, cardinality = col.split('__', 1)
                    ids = getattr(item, col)
                    if cardinality == 'id':
                        # Single-valued column: normalize to a list.
                        assert not isinstance(ids, list)
                        ids = [ids]
                    for v in ids:
                        if ref not in data:
                            raise ValueError(ref)  # pragma: no cover
                        # Strip an optional "[pages]" suffix from refs ids.
                        if ref == 'refs' and '[' in v:
                            v = v.split('[')[0]
                        if v not in data[ref]:  # pragma: no cover
                            error('invalid %s id referenced: %s' % (ref, v), name, line + 2)
    if not SUCCESS:  # pragma: no cover
        raise ValueError('integrity checks failed!')
def update_taxa(args):
    """
    Update the supplemental data for taxa from external sources.

    We go through the taxa listed in taxa.csv and look for additional
    information at GBIF, EOL and Catalogue Of Life.
    """
    with TaxaData(repos=args.tsammalex_data) as taxa:
        # Ensure every row of taxa.csv has a corresponding stub entry.
        rows = models.CsvData('taxa', repos=args.tsammalex_data)
        for index, row in enumerate(rows):
            taxa.add(index, row)

        # Query each external catalog in turn for every known taxon.
        for provider_cls in (CatalogueOfLife, GBIF, EOL):
            print(provider_cls.__name__)
            with provider_cls(args.tsammalex_data) as provider:
                for taxon in tqdm(taxa, leave=False):
                    provider.update_taxon(taxon)