Example #1
def main():
    res = {}
    with open(DATA_FILE, encoding='utf8') as fp:
        for line in fp.read().split('\n'):
            if line:
                cols = line.split(',')
                res[cols[0]] = cols[1:]

    ecoregions = [(er['properties']['eco_code'], shape(er['geometry']))
                  for er in jsonload(data_file('ecoregions.json'))['features']
                  if er['geometry']
                  and er['properties']['eco_code'] not in INVALID_ECO_CODES]

    for fname in os.listdir(data_file('external', 'gbif')):
        sid = fname.split('.')[0]
        v = res.get(sid, ['', ''])
        if len(v) == 1:
            v.append('')
        if not v[0] or not v[1]:
            occurrences = jsonload(data_file('external', 'gbif',
                                             fname)).get('results', [])
            if not v[0]:
                v[0] = format_ids(match(occurrences, ecoregions))
            if not v[1]:
                v[1] = format_ids(r.get('countryCode') for r in occurrences)
        res[sid] = v

    with open(DATA_FILE, 'w', encoding='utf8') as fp:
        for key in sorted(res.keys()):
            fp.write('%s,%s\r\n' % (key, ','.join(res[key])))
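The match() helper called above is not shown on this page. A plausible sketch of it, assuming standard GBIF occurrence fields (decimalLongitude/decimalLatitude) and the (eco_code, shape) pairs built in main(); the error handling follows the shapely imports visible in Example #20, but this is not the project's actual implementation:

def match(occurrences, ecoregions):
    # Hypothetical sketch: yield the eco_code of every ecoregion polygon
    # that contains an occurrence point.
    for occ in occurrences:
        if occ.get('decimalLongitude') is None or occ.get('decimalLatitude') is None:
            continue
        point = Point(occ['decimalLongitude'], occ['decimalLatitude'])
        for eco_code, polygon in ecoregions:
            try:
                if polygon.contains(point):
                    yield eco_code
            except (PredicateError, TopologicalError):
                continue  # skip broken geometries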
Example #2
def check(p):
    count = 0
    existing = [i['id'] for i in csv_items('cn/' + p) if 'edmond' in i['source_url']]
    for id, fname in [(n.split('.')[0], n) for n in os.listdir(data_file('cn/images'))]:
        if id in existing:
            count += 1
            os.remove(data_file('cn', 'images', fname))
    print(count)
Example #4
def test():
    data = {n: read_csv(n) for n in CSV}
    ids = {n: {r[1]['id'] for r in rows} for n, rows in data.items()}

    ids['ecoregions'] = set()
    for ecoregion in jsonload(data_file('ecoregions.json'))['features']:
        ids['ecoregions'].add(ecoregion['properties']['eco_code'])

    ids['sources'] = set()
    with io.open(data_file('sources.bib'), encoding='utf8') as fp:
        for line in fp:
            match = BIB_ID_PATTERN.match(line.strip())
            if match:
                ids['sources'].add(match.group('id'))

    ids['countries'] = {country.alpha2 for country in countries}

    def check_ref(name, line, item):
        for ref in item['refs__ids'].split(';'):
            if ref:
                if '[' in ref:
                    source_id, pages = ref.split('[', 1)
                    if not pages.endswith(']'):
                        error('invalid reference %s' % (ref, ), name, line)
                else:
                    source_id = ref
                if source_id not in ids['sources']:
                    error('invalid sources id referenced: %s' % (source_id, ),
                          name, line)

    for name in ['names', 'taxa']:
        for line, item in data[name]:
            check_ref(name, line, item)

    for name, items in data.items():
        for line, item in items:
            for col in item.keys():
                if '__' in col:
                    ref, card = col.split('__', 1)
                    if ref not in ids:
                        continue
                    for v in split_ids(item[col]):
                        if v not in ids[ref]:
                            error('invalid %s id referenced: %s' % (ref, v),
                                  name, line)

    if not SUCCESS:
        raise ValueError('integrity checks failed!')
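BIB_ID_PATTERN is not defined in this snippet. Assuming sources.bib is an ordinary BibTeX file, a pattern of roughly this shape, with a named group 'id' as required by match.group('id') above, would work; the project's exact regex may differ:

import re

# Assumed shape only: match a BibTeX entry header such as
# "@book{smith2001," and capture the key as the named group 'id'.
BIB_ID_PATTERN = re.compile(r'@[a-zA-Z]+\{(?P<id>[^,\s]+),')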
Example #5
def update_taxa():
    parser = argparse.ArgumentParser(
        description="""\
Update the supplemental data for taxa from external sources.

We go through the taxa listed in taxa.csv and look for additional information at
GBIF, EOL and Catalogue Of Life.""")
    parser.add_argument("--distribution-only", action="store_true")
    args = parser.parse_args()

    if not args.distribution_only:
        fname = data_file('taxa.json')
        taxa = jsonload(fname, default=[], object_pairs_hook=OrderedDict)
        ids = set(spec['id'] for spec in taxa)

        # add stubs for new entries in taxa.csv:
        for i, item in enumerate(csv_items('taxa.csv')):
            if item['id'] not in ids:
                taxa.insert(i, item2spec(item))

        for cls in [CatalogueOfLife, GBIF, EOL]:
            with cls() as provider:
                for i, spec in enumerate(taxa):
                    if i % 500 == 0:
                        print(i)
                    provider.update_taxon(spec)

        jsondump(taxa, fname, indent=4)

    main()
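CatalogueOfLife, GBIF and EOL are used interchangeably here: each is instantiated, entered as a context manager, and asked to update_taxon() every spec. A minimal sketch of the interface this loop relies on; the base class is hypothetical, not from the project:

class DataProvider(object):
    # Hypothetical interface sketch. __enter__/__exit__ would typically
    # open and close an HTTP session or a local cache.
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

    def update_taxon(self, spec):
        # look up spec['id'] or the taxon name at the external source
        # and fill in fields such as spec['gbif_id']
        raise NotImplementedError()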
Example #8
def update(p):
    data = jsonload(data_file('cn', 'images.json'), default={})
    try:
        info = None
        for img in csv_items('cn/' + p):
            key = '%s-%s' % (img['taxa__id'], img['tags'])
            if key in data:
                print('+++', img['id'] or img['source'], data[key]['source'])
                continue
            info = get_image_info(img)
            if info:
                data[key] = get_image(info, data_file('cn', 'images'))
    except:
        print('----->')
        print(img)
        if info:
            print(info)
        jsondump(data, data_file('cn', 'images.json'), indent=4)
        raise
    jsondump(data, data_file('cn', 'images.json'), indent=4)
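The try/except here is a checkpoint pattern: if anything fails mid-loop, the partial image index is written to disk before the exception propagates, so already-downloaded images are not fetched again on the next run. The same pattern, distilled into a hypothetical helper:

def run_checkpointed(items, process, dump):
    # Persist partial progress both on failure and on success.
    try:
        for item in items:
            process(item)
    except BaseException:  # the original uses a bare except: and re-raises
        dump()
        raise
    dump()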
Example #10
def save_occurrences(sid, sname):
    api = GBIF()
    out = data_file('external', 'gbif', '%s.json' % sid)
    if not os.path.exists(out):
        try:
            res = api.get_info(api.get_id(sname))
            jsondump(res, out)
            print('%s: %s occurrences' % (sname, min([res['count'], res['limit']])))
        except Exception:
            # we'll have to try again next time!
            res = None
    else:
        try:
            res = jsonload(out)
        except Exception:
            os.remove(out)
            res = None
    return res
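The GBIF wrapper itself is not part of these examples. Judging by the keys accessed here and in Example #1 ('count', 'limit', 'results'), it wraps the public GBIF REST API; a rough, hypothetical sketch, likely differing from the project's real class:

import requests

class GBIF(object):
    base = 'https://api.gbif.org/v1'

    def get_id(self, name):
        # species/match resolves a scientific name to a numeric taxon key
        res = requests.get(self.base + '/species/match', params={'name': name})
        return res.json().get('usageKey')

    def get_info(self, key, limit=300):
        # occurrence/search responses carry 'count', 'limit' and 'results'
        res = requests.get(
            self.base + '/occurrence/search',
            params={'taxonKey': key, 'limit': limit})
        return res.json()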
Example #12
def rewrite(p):
    visit('cn/' + p, JSON2CSV(data_file('cn', 'images.json')))
Example #13
def select(p):
    shutil.copy(data_file('cn', p), data_file('cn', 'staged_images.csv'))
    visit('cn/staged_images.csv', Selector())
    with open(data_file('cn', 'staged_images.csv'), encoding='utf8') as fp:
        print(len(fp.read().split('\n')) - 1)
Example #14
 def __init__(self):
     self._data = {i['id']: i for i in jsonload(data_file('taxa.json'))}
Example #18
        ('order', item['order'].capitalize() or None),
        ('family', item['family'].capitalize() or None),
        ('genus', item['genus'].capitalize() or None),
        ('ecoregions', split_ids(item.get('ecoregions__ids', ''))),
        ('countries', split_ids(item.get('countries__ids', ''))),
        ('wikipedia_url', wikipedia_url(item.get('wikipedia_url', ''))),
        ('eol_id', None),
        ('gbif_id', None),
        ('catalogueoflife_id', None),
    ]:
        spec[k] = v
    return spec


if __name__ == '__main__':
    fname = data_file('taxa.json')
    taxa = jsonload(fname, default=[], object_pairs_hook=OrderedDict)
    ids = set(spec['id'] for spec in taxa)

    # add stubs for new entries in taxa.csv:
    for i, item in enumerate(csv_items('taxa.csv')):
        if item['id'] not in ids:
            taxa.insert(i, item2spec(item))

    for cls in [CatalogueOfLife, GBIF, EOL]:
        with cls() as provider:
            for i, spec in enumerate(taxa):
                if i % 500 == 0:
                    print(i)
                provider.update_taxon(spec)
Example #19
 def __init__(self):
     self.cols = {}
     with open(data_file('images_md.json'), 'rb') as fp:
         self.md = json.load(fp)
     self.count = 0
Example #20
from __future__ import print_function, unicode_literals
import os
from io import open

from shapely.geometry import shape, Point
from shapely.geos import PredicateError, TopologicalError

from tsammalexdata.util import data_file, jsonload, unique

INVALID_ECO_CODES = {'AA0803', 'Lake', 'AT1202', 'IM1303'}
DATA_FILE = data_file('distribution.csv')


def format_ids(iterable):
    return ';'.join(unique(iterable))


def main():
    res = {}
    with open(DATA_FILE, encoding='utf8') as fp:
        for line in fp.read().split('\n'):
            if line:
                cols = line.split(',')
                res[cols[0]] = cols[1:]

    ecoregions = [(er['properties']['eco_code'], shape(er['geometry']))
                  for er in jsonload(data_file('ecoregions.json'))['features']
                  if er['geometry']
                  and er['properties']['eco_code'] not in INVALID_ECO_CODES]

    for fname in os.listdir(data_file('external', 'gbif')):
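format_ids() above relies on the unique() helper imported from tsammalexdata.util. For ';'.join() to work it must yield the input strings deduplicated, which only makes sense order-preserving; a sketch under that assumption, not the project's confirmed implementation:

def unique(iterable):
    # keep the first occurrence of each item, preserving input order
    seen = set()
    for item in iterable:
        if item not in seen:
            seen.add(item)
            yield item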
Example #21
 def __init__(self):
     self.edmond_urls = file_urls(data_file('Edmond.xml'))
     self.cols = {}
     self.count = 0
Example #22
    def __init__(self):
        self.edmond_urls = file_urls(data_file('Edmond.xml'))
        self.cols = {}
        self.count = 0

    def __call__(self, index, row):
        if index == 0:
            self.cols = {col: i for i, col in enumerate(row)}
            return row

        _id = row[self.cols['id']]

        if _id in self.edmond_urls:
            row[self.cols['source_url']] = self.edmond_urls[_id]['full']
            self.count += 1
        else:
            #
            # FIXME: check whether source_url is an Edmond image URL, if not, upload the
            # image to Edmond, insert the URL here! Depends on the imeji API being
            # available on Edmond.
            #
            print(_id, row)
        return row


if __name__ == '__main__':
    with open(data_file('Edmond.xml'), 'w', encoding='utf8') as fp:
        fp.write(requests.get(URL).text)
    v = Visitor()
    visit(sys.argv[1] if len(sys.argv) > 1 else 'images.csv', v)
    print(v.count)
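visit() and the visitor classes (Visitor, Selector, JSON2CSV) follow one protocol throughout these examples: the visitor is called with (index, row) for every CSV row, may modify the row, and its return value replaces the row. A hypothetical sketch of visit(), under the additional assumption that returning None drops a row (which would explain how Selector shrinks staged_images.csv in Example #13):

import csv

def visit(name, visitor):
    # Hypothetical sketch: stream a CSV through the visitor and write
    # the surviving rows back to the same file.
    path = data_file(name)
    with open(path, encoding='utf8', newline='') as fp:
        rows = [visitor(i, row) for i, row in enumerate(csv.reader(fp))]
    with open(path, 'w', encoding='utf8', newline='') as fp:
        csv.writer(fp).writerows(row for row in rows if row is not None)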