def dedup():
    """Deduplicate cn/images.csv against images already hosted on edmond.

    Collects the ids of images whose source_url points at edmond (from the
    top-level images.csv), runs a Deduplicator visitor over cn/images.csv,
    and prints how many rows it flagged.
    """
    uploaded_ids = []
    for row in csv_items('images.csv'):
        if 'edmond' in row['source_url']:
            uploaded_ids.append(row['id'])
    deduper = Deduplicator(uploaded_ids)
    visit('cn/images.csv', deduper)
    print(deduper.count)
def update_taxa():
    """Refresh the supplemental taxon data (taxa.json) from external sources.

    Unless --distribution-only is passed, this merges stub records for any
    rows newly added to taxa.csv into taxa.json, then asks each external
    provider (Catalogue of Life, GBIF, EOL) to enrich every taxon record,
    and finally writes taxa.json back.
    """
    parser = argparse.ArgumentParser(
        description="""\
Update the supplemental data for taxa from external sources.

We go through the taxa listed in taxa.csv and look for additional
information at GBIF, EOL and Catalogue Of Life.""")
    parser.add_argument("--distribution-only", action="store_true")
    args = parser.parse_args()
    if not args.distribution_only:
        fname = data_file('taxa.json')
        # OrderedDict keeps key order stable so taxa.json diffs stay minimal
        taxa = jsonload(fname, default=[], object_pairs_hook=OrderedDict)
        ids = set(spec['id'] for spec in taxa)
        # add stubs for new entries in taxa.csv:
        for i, item in enumerate(csv_items('taxa.csv')):
            if item['id'] not in ids:
                taxa.insert(i, item2spec(item))
        for cls in [CatalogueOfLife, GBIF, EOL]:
            # providers are context managers (presumably holding an HTTP
            # session/cache) — TODO confirm against their definitions
            with cls() as provider:
                for i, spec in enumerate(taxa):
                    if i % 500 == 0:
                        print(i)  # crude progress indicator
                    provider.update_taxon(spec)
        jsondump(taxa, fname, indent=4)


# NOTE(review): bare module-level call; `main` is not defined in the visible
# portion of this file — confirm it exists elsewhere (or should this call
# update_taxa() instead?).
main()
def update_taxa():
    """Update supplemental taxon data in taxa.json from external providers.

    Skipped entirely when --distribution-only is given; otherwise new rows
    from taxa.csv are inserted as stubs, every record is enriched via
    Catalogue of Life, GBIF and EOL, and the result is written back.
    """
    parser = argparse.ArgumentParser(description="""\
Update the supplemental data for taxa from external sources.

We go through the taxa listed in taxa.csv and look for additional
information at GBIF, EOL and Catalogue Of Life.""")
    parser.add_argument("--distribution-only", action="store_true")
    args = parser.parse_args()
    if not args.distribution_only:
        fname = data_file('taxa.json')
        # preserve insertion order of keys so the JSON file diffs cleanly
        taxa = jsonload(fname, default=[], object_pairs_hook=OrderedDict)
        ids = set(spec['id'] for spec in taxa)
        # add stubs for new entries in taxa.csv:
        for i, item in enumerate(csv_items('taxa.csv')):
            if item['id'] not in ids:
                taxa.insert(i, item2spec(item))
        for cls in [CatalogueOfLife, GBIF, EOL]:
            with cls() as provider:
                for i, spec in enumerate(taxa):
                    if i % 500 == 0:
                        print(i)  # progress indicator for long runs
                    provider.update_taxon(spec)
        jsondump(taxa, fname, indent=4)


# NOTE(review): module-level call to `main`, which is not defined in the
# visible portion of this file — verify it exists elsewhere.
main()
def check(p):
    """Delete local files in cn/images whose id is already uploaded to edmond.

    `p` names a CSV file (relative to cn/) listing images. Every file in
    cn/images whose basename (text before the first dot) matches an
    already-uploaded id is removed; the number of deletions is printed.
    """
    uploaded = [row['id']
                for row in csv_items('cn/' + p)
                if 'edmond' in row['source_url']]
    removed = 0
    for filename in os.listdir(data_file('cn/images')):
        image_id = filename.split('.')[0]
        if image_id in uploaded:
            removed += 1
            os.remove(data_file('cn', 'images', filename))
    print(removed)
def check(p):
    """Remove files from cn/images that correspond to already-uploaded ids.

    Uploaded ids are those rows of cn/<p> whose source_url contains
    'edmond'. Prints how many local files were deleted.
    """
    count = 0
    existing = []
    for item in csv_items('cn/' + p):
        if 'edmond' in item['source_url']:
            existing.append(item['id'])
    for name in os.listdir(data_file('cn/images')):
        # basename before the first '.' is the image id
        if name.split('.')[0] in existing:
            count += 1
            os.remove(data_file('cn', 'images', name))
    print(count)
def read_csv(name, unique='id'):
    """Read CSV rows, reporting violations of the `unique` key constraint.

    Returns a list of (line_number, row) pairs; numbering starts at 2 to
    account for the header line. When `unique` is falsy no check is done.
    Rows missing the unique key are reported and skipped; rows with a
    duplicate value are reported but still included in the result.
    """
    seen = set()
    result = []
    for lineno, record in enumerate(csv_items(name), start=2):
        if unique:
            if unique not in record:
                error('unique key missing: %s' % unique, name, lineno)
                continue
            value = record[unique]
            if value in seen:
                error('non-unique id: %s' % value, name, lineno)
            seen.add(value)
        result.append((lineno, record))
    return result
def update(p):
    """Download images listed in cn/<p> and record them in cn/images.json.

    Skips entries whose (taxa__id, tags) key is already present in the JSON
    cache. On any failure the partial cache is saved before the exception is
    re-raised, so completed downloads are never lost.
    """
    data = jsonload(data_file('cn', 'images.json'), default={})
    try:
        # `info` must exist before the loop so the except block can print it
        # even if the failure happens before the first get_image_info call.
        info = None
        for img in csv_items('cn/' + p):
            key = '%s-%s' % (img['taxa__id'], img['tags'])
            if key in data:
                # already fetched on a previous run
                print('+++', img['id'] or img['source'], data[key]['source'])
                continue
            info = get_image_info(img)
            if info:
                data[key] = get_image(info, data_file('cn', 'images'))
    except:
        # Intentionally broad: dump diagnostics and persist whatever has been
        # fetched so far, then re-raise the original exception unchanged.
        print('----->')
        print(img)
        if info:
            print(info)
        jsondump(data, data_file('cn', 'images.json'), indent=4)
        raise
    jsondump(data, data_file('cn', 'images.json'), indent=4)
# NOTE(review): this chunk begins mid-definition — the loop below uses
# `result` and `taxon` from an enclosing function whose header is not
# visible here; kept verbatim.
for key in 'kingdom phylum class order genus family'.split():
    if result.get(key):
        taxon[key] = result[key]
if 'taxonRank' in result:
    taxon['taxonRank'] = result['taxonRank'].lower()


def save_occurrences(sid, sname):
    """Fetch (or load cached) GBIF occurrence info for one taxon.

    Results are cached as external/gbif/<sid>.json. A corrupt cache file is
    deleted so it is re-fetched on the next run. Returns the occurrence
    dict, or None when the lookup failed or the cache was unreadable.
    """
    api = GBIF()
    out = data_file('external', 'gbif', '%s.json' % sid)
    if not os.path.exists(out):
        try:
            res = api.get_info(api.get_id(sname))
            jsondump(res, out)
            print('%s: %s occurrences'
                  % (sname, min(res['count'], res['limit'])))
        except Exception:
            # best effort — we'll have to try again next time!
            # (narrowed from bare `except:` so Ctrl-C/SystemExit still
            # abort the run instead of being swallowed per taxon)
            res = None
    else:
        try:
            res = jsonload(out)
        except Exception:
            # cache file is corrupt: drop it so the next run re-fetches
            os.remove(out)
            res = None
    return res


if __name__ == '__main__':
    for item in csv_items('taxa.csv'):
        save_occurrences(item['id'], item['scientific_name'])
def do_check(fname):
    """Strip rows from `fname` whose image was already uploaded to edmond.

    Builds a mapping of uploaded images keyed by (taxa__id, tags) from the
    top-level images.csv and hands it to a RemoveUploaded visitor over the
    given CSV file.
    """
    uploaded = {}
    for row in csv_items('images.csv'):
        if 'edmond' in row['source_url']:
            uploaded[(row['taxa__id'], row['tags'])] = row
    visit(fname, RemoveUploaded(uploaded))
# NOTE(review): this chunk begins mid-definition — `key`, `result` and
# `taxon` belong to an enclosing function/loop whose start is not visible
# here; kept verbatim.
if result.get(key):
    taxon[key] = result[key]
if 'taxonRank' in result:
    taxon['taxonRank'] = result['taxonRank'].lower()


def save_occurrences(sid, sname):
    """Fetch (or load cached) GBIF occurrence info for one taxon.

    Results are cached as external/gbif/<sid>.json. A corrupt cache file is
    deleted so it is re-fetched on the next run. Returns the occurrence
    dict, or None when the lookup failed or the cache was unreadable.
    """
    api = GBIF()
    out = data_file('external', 'gbif', '%s.json' % sid)
    if not os.path.exists(out):
        try:
            res = api.get_info(api.get_id(sname))
            jsondump(res, out)
            print('%s: %s occurrences'
                  % (sname, min(res['count'], res['limit'])))
        except Exception:
            # best effort — we'll have to try again next time!
            # (narrowed from bare `except:` so Ctrl-C/SystemExit still
            # abort the run instead of being swallowed per taxon)
            res = None
    else:
        try:
            res = jsonload(out)
        except Exception:
            # cache file is corrupt: drop it so the next run re-fetches
            os.remove(out)
            res = None
    return res


if __name__ == '__main__':
    for item in csv_items('taxa.csv'):
        save_occurrences(item['id'], item['scientific_name'])
def dedup():
    """Run the Deduplicator over cn/images.csv and report the hit count.

    Seeds the deduplicator with ids of images whose source_url already
    points at edmond (from the top-level images.csv).
    """
    known = [row['id']
             for row in csv_items('images.csv')
             if 'edmond' in row['source_url']]
    visitor = Deduplicator(known)
    visit('cn/images.csv', visitor)
    print(visitor.count)
# NOTE(review): this chunk begins mid-definition — the (key, value) pairs
# below are the tail of a list iterated inside item2spec (presumably as
# `for k, v in [...]`); `spec` and `item` come from that enclosing function,
# whose header is not visible here.
        # empty genus capitalizes to '' which is falsy, hence the `or None`
        ('genus', item['genus'].capitalize() or None),
        ('ecoregions', split_ids(item.get('ecoregions__ids', ''))),
        ('countries', split_ids(item.get('countries__ids', ''))),
        ('wikipedia_url', wikipedia_url(item.get('wikipedia_url', ''))),
        # external-provider ids are stubs, filled in later by the providers
        ('eol_id', None),
        ('gbif_id', None),
        ('catalogueoflife_id', None),
    ]:
        spec[k] = v
    return spec


if __name__ == '__main__':
    fname = data_file('taxa.json')
    # OrderedDict keeps key order stable so taxa.json diffs stay minimal
    taxa = jsonload(fname, default=[], object_pairs_hook=OrderedDict)
    ids = set(spec['id'] for spec in taxa)
    # add stubs for new entries in taxa.csv:
    for i, item in enumerate(csv_items('taxa.csv')):
        if item['id'] not in ids:
            taxa.insert(i, item2spec(item))
    for cls in [CatalogueOfLife, GBIF, EOL]:
        with cls() as provider:
            for i, spec in enumerate(taxa):
                if i % 500 == 0:
                    print(i)  # crude progress indicator
                provider.update_taxon(spec)
    jsondump(taxa, fname, indent=4)