def extract_from_file(source_settings, destination):
    """Read records from the file named by *source_settings.filename* and
    bulk-load them into *destination*.

    Parameters:
        source_settings - settings object with a .filename to read from
        destination     - object with an .add(list) method (e.g. an ES index)

    Each group of BATCH_SIZE lines is JSON-decoded (via fix_json /
    CNV.JSON2object), normalized, and added as {"id":..., "value":...} pairs.
    A block that fails conversion is dumped to a random Error_*.txt file and
    a warning is logged; processing continues with the next block.
    """
    with File(source_settings.filename).iter() as handle:
        for block_num, lines in Q.groupby(handle, size=BATCH_SIZE):
            try:
                # decode each line, normalize it, then wrap for bulk insert
                records = [
                    {"id": x.id, "value": x}
                    for x in [
                        transform_bugzilla.normalize(CNV.JSON2object(fix_json(line)))
                        for line in lines
                    ]
                ]
                destination.add(records)
            except Exception as e:  # `as` form works on Py2.6+ and Py3 (was `except Exception, e`)
                error_file = "Error_" + Random.hex(20) + ".txt"
                File(error_file).write(lines)
                # BUG FIX: template said {{host}} but the supplied parameter
                # is "filename" — the placeholder could never be substituted
                D.warning(
                    "Can not convert block {{block}} (file={{filename}})",
                    {"block": block_num, "filename": error_file},
                    e
                )
def main(settings):
    """ETL entry point.

    If settings.source.filename is set, load the whole file into a fresh,
    timestamp-suffixed Elasticsearch index and swap the alias to it.
    Otherwise, incrementally sync recently-modified bugs from a source ES
    index into the destination index.
    """
    # USE A FILE
    if settings.source.filename is not None:
        # keep the stable name as the alias; the real index gets a timestamp suffix
        settings.destination.alias = settings.destination.index
        settings.destination.index = settings.destination.alias + CNV.datetime2string(
            datetime.utcnow(), "%Y%m%d_%H%M%S")
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        dest = ElasticSearch.create_index(settings.destination, schema)
        dest.set_refresh_interval(-1)  # disable refresh during the bulk load
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)
        # retire older timestamped indexes, then point the alias at the new one
        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    # attribute access for consistency with the rest of this function
    # (was settings["destination"])
    destination = get_or_create_index(settings.destination, source)

    # look back 7 days so late-arriving modifications are not missed
    last_updated = get_last_updated(destination) - timedelta(days=7)
    pending = get_pending(source, last_updated)  # pending IS IN {"bug_id":b, "count":c} FORM

    # MAIN ETL LOOP
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        data = source.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"terms": {"bug_id": bugs}},
                    {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        })

        # normalize each hit and wrap for bulk insert
        # (comprehension replaces Py2-only `lambda (x):` syntax)
        records = [
            {"id": x.id, "value": x}
            for x in [
                transform_bugzilla.normalize(transform_bugzilla.rename_attachments(hit))
                for hit in data.hits.hits
            ]
        ]
        destination.add(records)