Example #1
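Removes from the index everything that came from a given source: the function scans the 'datasets' and 'pending' indexes for documents whose materialize.identifier or source field matches, then deletes each hit from Elasticsearch and from the Lazo server via delete_dataset_from_index().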
def clear(source):
    es = PrefixedElasticsearch()
    lazo_client = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']))
    hits = es.scan(
        index='datasets,pending',
        query={
            'query': {
                'bool': {
                    'should': [
                        {
                            'term': {
                                'materialize.identifier': source,
                            },
                        },
                        {
                            'term': {
                                'source': source,
                            },
                        },
                    ],
                    'minimum_should_match': 1,
                },
            },
        },
        _source=False,
        size=SIZE,
    )
    for h in hits:
        delete_dataset_from_index(es, h['_id'], lazo_client)
Example #2
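Deletes an explicit list of dataset IDs, calling delete_dataset_from_index() for each one with both the Elasticsearch client and the Lazo client.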
def delete(datasets):
    es = PrefixedElasticsearch()
    lazo_client = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']))
    for dataset in datasets:
        delete_dataset_from_index(es, dataset, lazo_client)
Example #3
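Re-imports a dump directory: file names starting with 'lazo.' are loaded into Lazo storage, the rest are treated as dataset documents and re-indexed with a delete-then-add cycle; on an elasticsearch.TransportError the code sleeps 10 seconds and retries once.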
async def import_all(folder):
    es = PrefixedElasticsearch()
    if 'LAZO_SERVER_HOST' in os.environ:
        lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT']))
    else:
        lazo_client = None

    dataset_docs = []
    lazo_docs = []
    for name in os.listdir(folder):
        if name.startswith('lazo.'):
            lazo_docs.append(name)
        else:
            dataset_docs.append(name)

    for i, name in enumerate(dataset_docs):
        if i % 50 == 0:
            print(
                "\nImporting to Elasticsearch, %d/%d" % (i, len(dataset_docs)),
                flush=True,
            )
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)

        dataset_id = decode_dataset_id(name)
        try:
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        print('.', end='', flush=True)

    for i, name in enumerate(lazo_docs):
        if i % 500 == 0:
            print(
                "\nImporting to Lazo, %d/%d" % (i, len(lazo_docs)),
                flush=True,
            )
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)

        # Strip the 'lazo.' prefix and the trailing number to recover the dataset ID
        dataset_id = decode_dataset_id(name[5:]).rsplit('.', 1)[0]
        lazo_es_id = obj.pop('_id')
        assert lazo_es_id.split('__.__')[0] == dataset_id
        try:
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        if i % 10 == 0:
            print('.', end='', flush=True)
Example #4
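Queues datasets for re-profiling when their recorded version predates a given Git revision: the revision is resolved with git rev-parse, the 'datasets' index is scanned, and a profiling message is published to the AMQP 'profile' fan-out exchange for every dataset that is not recent enough.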
async def freshen(version):
    # Check that it's a valid version
    version_hash = subprocess.check_output(['git', 'rev-parse', version])
    version_hash = version_hash.decode('ascii').strip()
    logger.warning("Reprocessing datasets profiled before %s", version_hash)

    es = PrefixedElasticsearch()

    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    hits = es.scan(
        index='datasets',
        query={
            'query': {
                'match_all': {},
            },
        },
        size=SIZE,
    )
    reprocessed = 0
    for h in hits:
        obj = h['_source']
        dataset_version = obj['version']
        if is_version_more_recent(version, dataset_version):
            logger.debug("%s is recent enough (version=%r)",
                         h['_id'], dataset_version)
            continue

        reprocessed += 1
        logger.info("Reprocessing %s, version=%r",
                    h['_id'], dataset_version)
        metadata = dict(name=obj['name'],
                        materialize=obj['materialize'],
                        source=obj.get('source', 'unknown'))
        if obj.get('description'):
            metadata['description'] = obj['description']
        if obj.get('date'):
            metadata['date'] = obj['date']
        if obj.get('manual_annotations'):
            metadata['manual_annotations'] = obj['manual_annotations']
        await amqp_profile_exchange.publish(
            json2msg(dict(id=h['_id'], metadata=metadata)),
            '',
        )
    logger.info("Reprocessed %d datasets", reprocessed)
Example #5
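Finds oversized datasets with a range query on the size field (larger than 10 GB) and prints each dataset ID with its size in gigabytes.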
def search():
    es = PrefixedElasticsearch()
    hits = es.scan(
        index='datasets',
        query={
            'query': {
                'range': {
                    "size": {
                        "gt": 10000000000,  # 10 GB
                    },
                },
            },
        },
        _source='size',
        size=SIZE,
    )
    for h in hits:
        print("%s %.1f GB" % (h['_id'], h['_source']['size'] / 1000000000.0))
Example #6
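Exports the index to JSON files: each 'datasets' document is written to a file named after its encoded dataset ID, and each 'lazo' document goes to a 'lazo.<dataset_id>.<n>' file that also keeps the original Elasticsearch _id.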
def export():
    es = PrefixedElasticsearch()

    print("Dumping datasets", end='', flush=True)
    hits = es.scan(
        index='datasets',
        query={
            'query': {
                'match_all': {},
            },
        },
        size=SIZE,
    )
    for h in hits:
        # Use dataset ID as file name
        with open(encode_dataset_id(h['_id']), 'w') as fp:
            json.dump(h['_source'], fp, sort_keys=True, indent=2)

    print("Dumping Lazo data", end='', flush=True)
    hits = es.scan(
        index='lazo',
        query={
            'query': {
                'match_all': {},
            },
        },
        size=SIZE,
    )
    for h in hits:
        # Use "lazo." dataset_id ".NB" as file name
        dataset_id = h['_id'].split('__.__')[0]
        fname = unique_filename(
            'lazo.{0}.{{0}}'.format(encode_dataset_id(dataset_id))
        )
        with open(fname, 'w') as fp:
            json.dump(
                dict(h['_source'], _id=h['_id']),
                fp,
                sort_keys=True,
                indent=2,
            )
Example #7
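Re-queues specific datasets for profiling at a given priority: each ID is looked up in 'datasets' (falling back to 'pending'), its metadata is rebuilt, and a message is published to the AMQP 'profile' exchange.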
async def freshen(datasets, priority):
    es = PrefixedElasticsearch()

    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    for dataset_id in datasets:
        try:
            obj = es.get('datasets', dataset_id)['_source']
        except elasticsearch.NotFoundError:
            obj = es.get('pending', dataset_id)['_source']['metadata']
            dataset_version = None
        else:
            dataset_version = obj['version']

        logger.info("Reprocessing %s, version=%r", dataset_id, dataset_version)
        metadata = dict(name=obj['name'],
                        materialize=obj['materialize'],
                        source=obj.get('source', 'unknown'))
        if obj.get('description'):
            metadata['description'] = obj['description']
        if obj.get('date'):
            metadata['date'] = obj['date']
        if obj.get('manual_annotations'):
            metadata['manual_annotations'] = obj['manual_annotations']
        await amqp_profile_exchange.publish(
            json2msg(
                dict(id=dataset_id, metadata=metadata),
                priority=priority,
            ),
            '',
        )
Example #8
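Counts indexed datasets per source by scanning the 'datasets' index with only the source field and printing the totals in descending order.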
def count():
    es = PrefixedElasticsearch()
    sources = {}
    hits = es.scan(
        index='datasets',
        query={
            'query': {
                'match_all': {},
            },
        },
        _source='source',
        size=SIZE,
    )
    for h in hits:
        source = h['_source']['source']

        try:
            sources[source] += 1
        except KeyError:
            sources[source] = 1

    for identifier, count in sorted(sources.items(), key=lambda p: -p[1]):
        print('{: 6d} {}'.format(count, identifier))
Example #9
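Constructor of a service class (a profiler, judging by the profile semaphore): it sets up the Elasticsearch and Lazo clients, optional Nominatim address resolution, cached geo data and an asyncio task, then checks that the 'datasets' index exists, retrying a few times in case the Elasticsearch container is not up yet.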
    def __init__(self):
        self.profile_semaphore = threading.Semaphore(MAX_CONCURRENT_PROFILE)
        self.es = PrefixedElasticsearch()
        self.lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT'])
        )
        if os.environ.get('NOMINATIM_URL'):
            self.nominatim = os.environ['NOMINATIM_URL']
        else:
            self.nominatim = None
            logger.warning(
                "$NOMINATIM_URL is not set, not resolving addresses"
            )
        self.geo_data = GeoData.from_local_cache()
        self.channel = None

        assert os.path.isdir('/cache/datasets')

        self.loop = asyncio.get_event_loop()
        log_future(self.loop.create_task(self._run()), logger,
                   should_never_exit=True)

        # Retry a few times, in case the Elasticsearch container is not yet up
        for i in itertools.count():
            try:
                if not self.es.index_exists('datasets'):
                    raise RuntimeError("'datasets' index does not exist")
            except Exception:
                logger.warning("Can't connect to Elasticsearch, retrying...")
                if i == 5:
                    raise
                else:
                    time.sleep(5)
            else:
                break