Beispiel #1
0
def dedup():
    """Run a Deduplicator over cn/images.csv, seeded with already-uploaded ids.

    An image counts as uploaded when its images.csv row has 'edmond' in the
    source_url.  Prints the deduplicator's final count.
    """
    uploaded_ids = [
        row['id']
        for row in csv_items('images.csv')
        if 'edmond' in row['source_url']
    ]
    deduper = Deduplicator(uploaded_ids)
    visit('cn/images.csv', deduper)
    print(deduper.count)
Beispiel #2
0
def update_taxa():
    """CLI entry point: refresh supplemental taxon data from external sources.

    Unless --distribution-only is given, merge stub specs for new taxa.csv
    rows into taxa.json, run every provider over every spec, and rewrite
    the json file.  Always finishes by calling main().
    """
    arg_parser = argparse.ArgumentParser(
        description="""\
Update the supplemental data for taxa from external sources.

We go through the taxa listed in taxa.csv and look for additional information at
GBIF, EOL and Catalogue Of Life.""")
    arg_parser.add_argument("--distribution-only", action="store_true")
    opts = arg_parser.parse_args()

    if not opts.distribution_only:
        json_path = data_file('taxa.json')
        specs = jsonload(json_path, default=[], object_pairs_hook=OrderedDict)
        known_ids = {spec['id'] for spec in specs}

        # insert a stub for every taxa.csv row not yet in taxa.json,
        # keeping the csv ordering
        for pos, row in enumerate(csv_items('taxa.csv')):
            if row['id'] not in known_ids:
                specs.insert(pos, item2spec(row))

        for provider_cls in [CatalogueOfLife, GBIF, EOL]:
            with provider_cls() as provider:
                for pos, spec in enumerate(specs):
                    if pos % 500 == 0:
                        print(pos)  # coarse progress indicator
                    provider.update_taxon(spec)

        jsondump(specs, json_path, indent=4)

    main()
Beispiel #3
0
def update_taxa():
    """Refresh supplemental taxon data from external sources (CLI entry point).

    Without --distribution-only: add stubs for new taxa.csv rows to
    taxa.json, let each provider update every spec, and rewrite the file.
    Finishes by calling main() in all cases.
    """
    parser = argparse.ArgumentParser(description="""\
Update the supplemental data for taxa from external sources.

We go through the taxa listed in taxa.csv and look for additional information at
GBIF, EOL and Catalogue Of Life.""")
    parser.add_argument("--distribution-only", action="store_true")
    cli_args = parser.parse_args()

    if not cli_args.distribution_only:
        taxa_path = data_file('taxa.json')
        taxa_specs = jsonload(
            taxa_path, default=[], object_pairs_hook=OrderedDict)
        seen_ids = {entry['id'] for entry in taxa_specs}

        # add stubs for new entries in taxa.csv, preserving csv order:
        for index, csv_row in enumerate(csv_items('taxa.csv')):
            if csv_row['id'] not in seen_ids:
                taxa_specs.insert(index, item2spec(csv_row))

        for source in [CatalogueOfLife, GBIF, EOL]:
            with source() as provider:
                for index, entry in enumerate(taxa_specs):
                    if index % 500 == 0:
                        print(index)  # progress marker
                    provider.update_taxon(entry)

        jsondump(taxa_specs, taxa_path, indent=4)

    main()
Beispiel #4
0
def check(p):
    """Remove local cn image files whose id is already edmond-hosted per cn/<p>.

    Scans the cn/images directory; any file whose name stem matches an id
    from an 'edmond' row of the csv is deleted.  Prints the removal count.
    """
    count = 0
    # Use a set: membership is tested once per directory entry below,
    # and a list would make that O(n) per test.
    existing = {
        i['id'] for i in csv_items('cn/' + p) if 'edmond' in i['source_url']
    }
    for fname in os.listdir(data_file('cn/images')):
        file_id = fname.split('.')[0]  # filename stem is the image id
        if file_id in existing:
            count += 1
            os.remove(data_file('cn', 'images', fname))
    print(count)
Beispiel #5
0
def check(p):
    """Remove local cn image files whose id is already edmond-hosted per cn/<p>.

    Scans the cn/images directory; any file whose name stem matches an id
    from an 'edmond' row of the csv is deleted.  Prints the removal count.
    """
    count = 0
    # A set makes the per-file membership test below O(1) instead of O(n).
    existing = {
        i['id'] for i in csv_items('cn/' + p) if 'edmond' in i['source_url']
    }
    for fname in os.listdir(data_file('cn/images')):
        image_id = fname.split('.')[0]  # filename stem is the image id
        if image_id in existing:
            count += 1
            os.remove(data_file('cn', 'images', fname))
    print(count)
def read_csv(name, unique='id'):
    """Read rows from csv file *name*, checking uniqueness of one column.

    Returns a list of (line_number, row) pairs; line numbers start at 2
    (presumably to account for a header line — the csv itself is read via
    csv_items).  Rows missing the *unique* key are reported via error()
    and dropped; rows with a duplicate value are reported but kept.
    Pass unique=None/'' to skip all checking.
    """
    seen_values = set()
    result = []
    for index, row in enumerate(csv_items(name)):
        lineno = index + 2  # 1-based line number, offset past the header
        if unique:
            if unique not in row:
                error('unique key missing: %s' % unique, name, lineno)
                continue
            if row[unique] in seen_values:
                error('non-unique id: %s' % row[unique], name, lineno)
            seen_values.add(row[unique])
        result.append((lineno, row))
    return result
Beispiel #7
0
def read_csv(name, unique='id'):
    """Load csv rows as (line_number, row) pairs with a uniqueness check.

    Line numbers begin at 2 (apparently skipping a header row).  A row
    missing the *unique* column is reported through error() and skipped;
    a duplicated value is reported but the row is still returned.  A falsy
    *unique* disables validation entirely.
    """
    values_seen = set()
    collected = []
    for offset, record in enumerate(csv_items(name)):
        line_no = offset + 2  # header-adjusted, 1-based
        if unique:
            if unique not in record:
                error('unique key missing: %s' % unique, name, line_no)
                continue
            if record[unique] in values_seen:
                error('non-unique id: %s' % record[unique], name, line_no)
            values_seen.add(record[unique])
        collected.append((line_no, record))
    return collected
Beispiel #8
0
def update(p):
    """Fetch images listed in cn/<p> that are not yet in cn/images.json.

    The json file maps '<taxa__id>-<tags>' keys to image metadata.  It is
    rewritten on success, and also flushed before re-raising on any error,
    so partial progress is never lost.
    """
    data = jsonload(data_file('cn', 'images.json'), default={})
    # Pre-bind both names: if an exception fires before the first loop
    # iteration, the handler's print(img) must not raise NameError and
    # mask the original traceback.
    img = None
    info = None
    try:
        for img in csv_items('cn/' + p):
            key = '%s-%s' % (img['taxa__id'], img['tags'])
            if key in data:
                # already fetched — '+++' marks a skipped row
                print('+++', img['id'] or img['source'], data[key]['source'])
                continue
            info = get_image_info(img)
            if info:
                data[key] = get_image(info, data_file('cn', 'images'))
    except BaseException:
        # Explicit BaseException keeps the old bare-except semantics
        # (save state even on Ctrl-C), then re-raise unconditionally.
        print('----->')
        print(img)
        if info:
            print(info)
        jsondump(data, data_file('cn', 'images.json'), indent=4)
        raise
    jsondump(data, data_file('cn', 'images.json'), indent=4)
Beispiel #9
0
def update(p):
    """Fetch images listed in cn/<p> that are not yet in cn/images.json.

    Keys in the json file are '<taxa__id>-<tags>'.  Progress is dumped to
    the json file both on success and before re-raising on failure.
    """
    data = jsonload(data_file('cn', 'images.json'), default={})
    # Bind img/info up front so the except handler below never hits a
    # NameError (which would mask the real exception) when csv_items
    # fails before the first iteration.
    img = None
    info = None
    try:
        for img in csv_items('cn/' + p):
            key = '%s-%s' % (img['taxa__id'], img['tags'])
            if key in data:
                # already present — report and skip
                print('+++', img['id'] or img['source'], data[key]['source'])
                continue
            info = get_image_info(img)
            if info:
                data[key] = get_image(info, data_file('cn', 'images'))
    except BaseException:
        # BaseException preserves the original bare-except behavior
        # (state is saved even on KeyboardInterrupt) before re-raising.
        print('----->')
        print(img)
        if info:
            print(info)
        jsondump(data, data_file('cn', 'images.json'), indent=4)
        raise
    jsondump(data, data_file('cn', 'images.json'), indent=4)
Beispiel #10
0
            for key in 'kingdom phylum class order genus family'.split():
                if result.get(key):
                    taxon[key] = result[key]
            if 'taxonRank' in result:
                taxon['taxonRank'] = result['taxonRank'].lower()


def save_occurrences(sid, sname):
    """Fetch and cache GBIF occurrence info for one taxon.

    Results are cached in external/gbif/<sid>.json.  Returns the parsed
    result, or None when the fetch failed or the cached file could not be
    read (an unreadable cache file is deleted so the next run retries).
    """
    api = GBIF()
    out = data_file('external', 'gbif', '%s.json' % sid)
    if not os.path.exists(out):
        try:
            res = api.get_info(api.get_id(sname))
            jsondump(res, out)
            print('%s: %s occurrences' % (sname, min(res['count'], res['limit'])))
        except Exception:
            # Narrowed from a bare except so Ctrl-C/SystemExit still
            # propagate; we'll have to try again next time!
            res = None
    else:
        try:
            res = jsonload(out)
        except Exception:
            # corrupt cache file: drop it so it gets re-fetched next run
            os.remove(out)
            res = None
    return res


# Script entry point: cache GBIF occurrence data for every taxon listed
# in taxa.csv (ids and scientific names come from that file).
if __name__ == '__main__':
    for item in csv_items('taxa.csv'):
        save_occurrences(item['id'], item['scientific_name'])
Beispiel #11
0
def do_check(fname):
    """Visit *fname* with a RemoveUploaded visitor.

    The visitor is seeded with a (taxa__id, tags) -> row mapping built
    from the edmond-hosted rows of images.csv.
    """
    uploaded = {}
    for row in csv_items('images.csv'):
        if 'edmond' in row['source_url']:
            uploaded[(row['taxa__id'], row['tags'])] = row
    visit(fname, RemoveUploaded(uploaded))
Beispiel #12
0
                if result.get(key):
                    taxon[key] = result[key]
            if 'taxonRank' in result:
                taxon['taxonRank'] = result['taxonRank'].lower()


def save_occurrences(sid, sname):
    """Fetch and cache GBIF occurrence info for a single taxon.

    Caches under external/gbif/<sid>.json.  Returns the parsed payload,
    or None when the remote fetch failed or the cache file was unreadable
    (unreadable cache files are removed so a later run retries).
    """
    api = GBIF()
    out = data_file('external', 'gbif', '%s.json' % sid)
    if not os.path.exists(out):
        try:
            res = api.get_info(api.get_id(sname))
            jsondump(res, out)
            print('%s: %s occurrences' %
                  (sname, min(res['count'], res['limit'])))
        except Exception:
            # was a bare except: narrowed so KeyboardInterrupt/SystemExit
            # propagate; we'll have to try again next time!
            res = None
    else:
        try:
            res = jsonload(out)
        except Exception:
            # unreadable cache: delete it so the next run re-fetches
            os.remove(out)
            res = None
    return res


# Script entry point: fetch/cache GBIF occurrence data for each row of
# taxa.csv, keyed by the row's id and looked up by scientific name.
if __name__ == '__main__':
    for item in csv_items('taxa.csv'):
        save_occurrences(item['id'], item['scientific_name'])
Beispiel #13
0
def do_check(fname):
    """Run a RemoveUploaded visitor over *fname*.

    Seeds the visitor with every edmond-hosted images.csv row, keyed by
    the (taxa__id, tags) pair.
    """
    edmond_rows = {
        (entry['taxa__id'], entry['tags']): entry
        for entry in csv_items('images.csv')
        if 'edmond' in entry['source_url']
    }
    visit(fname, RemoveUploaded(edmond_rows))
Beispiel #14
0
def dedup():
    """Deduplicate cn/images.csv against the edmond-hosted ids in images.csv.

    Prints the Deduplicator's count after the visit.
    """
    already_uploaded = []
    for row in csv_items('images.csv'):
        if 'edmond' in row['source_url']:
            already_uploaded.append(row['id'])
    checker = Deduplicator(already_uploaded)
    visit('cn/images.csv', checker)
    print(checker.count)
Beispiel #15
0
        ('genus', item['genus'].capitalize() or None),
        ('ecoregions', split_ids(item.get('ecoregions__ids', ''))),
        ('countries', split_ids(item.get('countries__ids', ''))),
        ('wikipedia_url', wikipedia_url(item.get('wikipedia_url', ''))),
        ('eol_id', None),
        ('gbif_id', None),
        ('catalogueoflife_id', None),
    ]:
        spec[k] = v
    return spec


if __name__ == '__main__':
    # Merge stubs for new taxa.csv rows into taxa.json, let every external
    # provider update each spec, then rewrite the file.
    json_path = data_file('taxa.json')
    specs = jsonload(json_path, default=[], object_pairs_hook=OrderedDict)
    known_ids = {entry['id'] for entry in specs}

    # add stubs for new entries in taxa.csv, preserving their csv order:
    for pos, row in enumerate(csv_items('taxa.csv')):
        if row['id'] not in known_ids:
            specs.insert(pos, item2spec(row))

    for provider_cls in [CatalogueOfLife, GBIF, EOL]:
        with provider_cls() as provider:
            for pos, entry in enumerate(specs):
                if pos % 500 == 0:
                    print(pos)  # coarse progress indicator
                provider.update_taxon(entry)

    jsondump(specs, json_path, indent=4)