def replicate(source, destination, pending, last_updated):
    """
    COPY source RECORDS TO destination
    """
    for g, bugs in Q.groupby(pending, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_bugs}} bug versions", {"num_bugs": len(bugs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        {"terms": {"bug_id": set(bugs)}},
                        {"range": {"modified_ts": {"gte": CNV.datetime2milli(last_updated)}}}
                    ]}
                }},
                "from": 0,
                "size": 200000,
                "sort": []
            })

            d2 = map(
                lambda x: {"id": x.id, "value": x},
                map(
                    lambda x: transform_bugzilla.normalize(transform_bugzilla.rename_attachments(x._source), old_school=True),
                    data.hits.hits
                )
            )
            destination.extend(d2)
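# A minimal, self-contained sketch of the query body replicate() sends to
# ElasticSearch (the 0.90-style "filtered" syntax used above). The
# build_replication_query name and the epoch-millisecond conversion are
# illustrative stand-ins for CNV.datetime2milli; only the field names and
# query shape come from the function above.
import time


def build_replication_query(bug_ids, last_updated):
    # epoch milliseconds, standing in for CNV.datetime2milli(last_updated)
    modified_since = int(time.mktime(last_updated.timetuple()) * 1000)
    return {
        "query": {"filtered": {
            "query": {"match_all": {}},
            "filter": {"and": [
                {"terms": {"bug_id": list(bug_ids)}},  # lists serialize to JSON directly; sets do not
                {"range": {"modified_ts": {"gte": modified_since}}}
            ]}
        }},
        "from": 0,
        "size": 200000,
        "sort": []
    }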
def incremental_etl(settings, param, db, es, es_comments, output_queue):
    ####################################################################
    ## ES TAKES TIME TO DELETE RECORDS, DO DELETE FIRST WITH HOPE THE
    ## INDEX GETS A REWRITE DURING ADD OF NEW RECORDS
    ####################################################################

    #REMOVE PRIVATE BUGS
    private_bugs = get_private_bugs_for_delete(db, param)
    Log.note("Ensure the following private bugs are deleted:\n{{private_bugs|indent}}", {"private_bugs": private_bugs})
    for g, delete_bugs in Q.groupby(private_bugs, size=1000):
        still_existing = get_bug_ids(es, {"terms": {"bug_id": delete_bugs}})
        if still_existing:
            Log.note("The following private bugs still exist and will be deleted:\n{{private_bugs|indent}}", {"private_bugs": still_existing})
        es.delete_record({"terms": {"bug_id": delete_bugs}})
        es_comments.delete_record({"terms": {"bug_id": delete_bugs}})

    #RECENT PUBLIC BUGS
    possible_public_bugs = get_recent_private_bugs(db, param)
    if param.allow_private_bugs:
        #PRIVATE BUGS
        #    A CHANGE IN PRIVACY INDICATOR MEANS THE WHITEBOARD IS AFFECTED, REDO
        es.delete_record({"terms": {"bug_id": possible_public_bugs}})
    else:
        #PUBLIC BUGS
        #    IF ADDING GROUP THEN private_bugs ALREADY DID THIS
        #    IF REMOVING GROUP THEN NO RECORDS TO DELETE
        pass

    #REMOVE **RECENT** PRIVATE ATTACHMENTS
    private_attachments = get_recent_private_attachments(db, param)
    bugs_to_refresh = set(Q.select(private_attachments, "bug_id"))
    es.delete_record({"terms": {"bug_id": bugs_to_refresh}})

    #REBUILD BUGS THAT GOT REMOVED
    bug_list = (possible_public_bugs | bugs_to_refresh) - private_bugs  # REMOVE PRIVATE BUGS
    if bug_list:
        refresh_param = param.copy()
        refresh_param.bug_list = bug_list
        refresh_param.start_time = 0
        refresh_param.start_time_str = extract_bugzilla.milli2string(db, 0)

        try:
            etl(db, output_queue, refresh_param.copy(), please_stop=None)
            etl_comments(db, es_comments, refresh_param.copy(), please_stop=None)
        except Exception, e:
            Log.error("Problem with etl using parameters {{parameters}}", {"parameters": refresh_param}, e)
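# A tiny sketch of the set arithmetic incremental_etl() uses to decide which
# bugs to rebuild: anything whose privacy indicator or attachments changed,
# minus the bugs that are private and must stay deleted. The literal bug ids
# below are made up for illustration.
possible_public_bugs = {1, 2, 3}    # privacy indicator changed recently
bugs_to_refresh = {3, 4}            # had a private attachment removed
private_bugs = {2}                  # must not be re-added

bug_list = (possible_public_bugs | bugs_to_refresh) - private_bugs
assert bug_list == {1, 3, 4}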
def etl_comments(db, es, param, please_stop):
    # CONNECTIONS ARE EXPENSIVE, CACHE HERE
    with comment_db_cache_lock:
        if not comment_db_cache:
            comment_db = DB(db)
            comment_db_cache.append(comment_db)

    with comment_db_cache_lock:
        Log.note("Read comments from database")
        comments = get_comments(comment_db_cache[0], param)

    for g, c in Q.groupby(comments, size=500):
        with Timer("Write {{num}} comments to ElasticSearch", {"num": len(c)}):
            es.extend({"id": cc.comment_id, "value": cc} for cc in c)
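# A self-contained sketch of the cache-once pattern etl_comments() uses for
# its database connection: the lock guarantees only the first caller pays the
# connection cost, and every later caller reuses the same handle. The
# get_cached_connection name and make_connection callable are hypothetical
# stand-ins for the DB(db) / comment_db_cache pair above.
import threading

connection_cache = []
connection_cache_lock = threading.Lock()


def get_cached_connection(make_connection):
    with connection_cache_lock:
        if not connection_cache:                      # first caller creates the connection
            connection_cache.append(make_connection())
        return connection_cache[0]                    # everyone else reuses it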
def extract_from_file(source_settings, destination):
    with File(source_settings.filename) as handle:
        for g, d in Q.groupby(handle, size=BATCH_SIZE):
            try:
                d2 = map(
                    lambda x: {"id": x.id, "value": x},
                    map(
                        lambda x: transform_bugzilla.normalize(CNV.JSON2object(fix_json(x))),
                        d
                    )
                )
                destination.add(d2)
            except Exception, e:
                filename = "Error_" + unicode(g) + ".txt"
                File(filename).write(d)
                Log.warning("Can not convert block {{block}} (file={{filename}})", {
                    "block": g,
                    "filename": filename
                }, e)
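# A minimal pure-Python equivalent of the Q.groupby(handle, size=BATCH_SIZE)
# batching used above: yield (group_number, batch) pairs of at most `size`
# items, so one bad batch can be dumped to an error file without losing the
# rest of the stream. The groupby_size name is hypothetical; Q.groupby is the
# real helper.
def groupby_size(iterable, size):
    group_number = 0
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) == size:
            yield group_number, batch
            group_number += 1
            batch = []
    if batch:
        yield group_number, batch


# Example: list(groupby_size("abcde", 2)) == [(0, ['a', 'b']), (1, ['c', 'd']), (2, ['e'])]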