def replicate(source, destination, pending, last_updated):
    """
    COPY source RECORDS TO destination
    """
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
            # PULL ALL VERSIONS OF THE PENDING BUGS THAT CHANGED SINCE last_updated
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"terms": {"bug_id": set(bugs)}},
                        {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": []
            })

            # NORMALIZE EACH HIT, THEN WRAP IN THE {"id", "value"} FORM THE DESTINATION EXPECTS
            d2 = map(
                lambda x: {"id": x.id, "value": x},
                map(
                    lambda x: transform_bugzilla.normalize(transform_bugzilla.rename_attachments(x._source), old_school=True),
                    data.hits.hits
                )
            )
            destination.extend(d2)
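# A minimal sketch of wiring replicate() to two indexes, modeled on the ES
# branch of main() below. The settings object is assumed to be the same
# pyLibrary-style struct main() receives, and get_last_updated()/get_pending()
# are the same helpers main() uses (defined elsewhere in this project):
#
#   source = ElasticSearch(settings.source)
#   destination = ElasticSearch(settings.destination)
#   last_updated = get_last_updated(destination) - timedelta(days=7)
#   pending = get_pending(source, last_updated)   # [{"bug_id": b, "count": c}, ...]
#   replicate(source, destination, pending, last_updated)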
def extract_from_file(source_settings, destination):
    with File(source_settings.filename).iter() as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda x: {"id": x.id, "value": x},
                    map(
                        lambda x: transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception, e:
                # DUMP THE OFFENDING BLOCK TO FILE FOR LATER INSPECTION
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                Log.warning("Can not convert block {{block}} (file={{filename}})", {
                    "block": g,
                    "filename": filename
                }, e)
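# Each line of the input file is expected to parse (after fix_json() repairs
# it) to one bug version carrying an "id" property, which the mapping above
# uses as the document id, plus the bug fields themselves. Illustrative shape
# only; the exact fields come from transform_bugzilla.normalize():
#
#   {"id": ..., "bug_id": 123456, "modified_ts": 1384219200000, ...}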
def main(settings):
    # USE A FILE
    if settings.source.filename is not None:
        settings.destination.alias = settings.destination.index
        settings.destination.index = settings.destination.alias + CNV.datetime2string(datetime.utcnow(), "%Y%m%d_%H%M%S")
        schema = CNV.JSON2object(File(settings.source.schema_filename).read())
        dest = ElasticSearch.create_index(settings.destination, schema)
        dest.set_refresh_interval(-1)   # DISABLE REFRESH DURING BULK LOAD
        extract_from_file(settings.source, dest)
        dest.set_refresh_interval(1)
        dest.delete_all_but(settings.destination.alias, settings.destination.index)
        dest.add_alias(settings.destination.alias)
        return

    # SYNCH WITH source ES INDEX
    source = ElasticSearch(settings.source)
    destination = get_or_create_index(settings["destination"], source)
    last_updated = get_last_updated(destination) - timedelta(days=7)
    pending = get_pending(source, last_updated)  # pending IS IN {"bug_id":b, "count":c} FORM

    # MAIN ETL LOOP
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        data = source.search({
            "query": {"filtered": {
                "query": {"match_all": {}},
                "filter": {"and": [
                    {"terms": {"bug_id": bugs}},
                    {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
                ]}
            }},
            "from": 0,
            "size": 200000,
            "sort": []
        })

        d2 = map(
            lambda x: {"id": x.id, "value": x},
            map(
                lambda x: transform_bugzilla.normalize(transform_bugzilla.rename_attachments(x)),
                data.hits.hits
            )
        )
        destination.add(d2)
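# get_pending() is called above but defined elsewhere in this project. Given
# the stated contract (pending IS IN {"bug_id":b, "count":c} FORM), a hedged
# sketch of what it might do: ask the source index which bug_ids changed since
# last_updated. The facet query below is an assumption in ES 0.90-era syntax
# (matching the "filtered"/"and" filters used above), not the project's code:
#
#   def get_pending(source, last_updated):
#       result = source.search({
#           "query": {"filtered": {
#               "query": {"match_all": {}},
#               "filter": {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
#           }},
#           "facets": {"bugs": {"terms": {"field": "bug_id", "size": 200000}}},
#           "size": 0
#       })
#       return [{"bug_id": t.term, "count": t.count} for t in result.facets.bugs.terms]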