def make_processed_list(s):
    processed_list = utils.read_from_s3(
        "processed_list", seed=[], directory=s["s3dir"])
    # if not processed_list:
    #     statuses_enhanced = utils.read_from_s3(
    #         utils.file_name(sufix="_enhanced"), directory=s["s3dir"])
    #     processed_list = [int(status["id"]) for status in statuses_enhanced]
    return processed_list
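# utils.read_from_s3 is used throughout with a `seed` argument: the call is
# expected to return the stored object, or the seed when the key does not
# exist yet. A minimal sketch of that assumed contract using boto3; the
# bucket name and JSON encoding here are illustrative, not the real utils code.
import json

import boto3
from botocore.exceptions import ClientError


def read_from_s3_sketch(name, seed=None, directory="", bucket="example-bucket"):
    s3 = boto3.client("s3")
    try:
        obj = s3.get_object(Bucket=bucket, Key=directory + name)
        return json.loads(obj["Body"].read())
    except ClientError as e:
        if e.response["Error"]["Code"] == "NoSuchKey":
            return seed  # first run: fall back to the caller-supplied seed
        raise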
def control(batch_enhanced, s):
    batch_enhanced_full = utils.read_from_s3(
        utils.file_name(sufix="_batch_enhanced_full"),
        seed=[], directory=s["s3dir"])
    print("Length of enhanced pre batch: " + str(len(batch_enhanced_full)))
    enhanced_w_batch = add_batch(batch_enhanced_full, batch_enhanced)
    utils.write_to_s3(
        enhanced_w_batch,
        utils.file_name(sufix="_batch_enhanced_full"),
        directory=s["s3dir"])
    # the label dict is built from the accumulated list read above,
    # not the merged result
    filtered_label_dict = make_filtered_label_dict(batch_enhanced_full)
    return filtered_label_dict
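# add_batch is defined elsewhere; from its use above it folds the new batch
# into the accumulated full list. A sketch of one plausible merge rule,
# assuming statuses are deduplicated by their "id" field (an assumption,
# not the confirmed behavior):
def add_batch_sketch(full, batch):
    seen = {status["id"] for status in full}
    return full + [status for status in batch if status["id"] not in seen]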
    # all_users_lookup = [u for u in all_users if u not in user_data]
    ## Cache code ##
    user_data, all_users_lookup = utils.get_from_cache_m(all_users, "user_data")
    utils.log(len(all_users), "Number total users in set: ")
    utils.log(len(all_users_lookup), "Number users needing lookup: ")
    if len(all_users_lookup) > 0:
        user_chunks = make_user_chunks(all_users_lookup, 100)
        for this_lookup in user_chunks:
            user_dict_lookup = do_user_lookup(this_lookup, s)
            user_data.update(user_dict_lookup)
    return user_data, filtered_label_dict


def control(filtered_label_dict, s):
    user_data, filtered_label_dict = update_user_info(filtered_label_dict, s)
    filtered_label_dict = update_dict_with_user_info(filtered_label_dict, user_data)
    utils.write_to_s3(
        filtered_label_dict,
        utils.file_name(prefix='_batch_filt_label_dict_enhanced_'),
        directory=s["s3dir"])
    return filtered_label_dict


if __name__ == "__main__":
    sd = utils.getDefaultSettings()
    filtered_label_dict = utils.read_from_s3(
        utils.file_name(prefix='_batch_filt_label_dict_enhanced_'),
        directory=sd["s3dir"])
    control(filtered_label_dict, sd)
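# make_user_chunks is not defined in this file; the lookup loop above only
# needs it to split the user list into pages of 100, presumably matching the
# 100-id limit of Twitter's users/lookup endpoint. A minimal sketch of that
# assumed behavior:
def make_user_chunks_sketch(users, size=100):
    return [users[i:i + size] for i in range(0, len(users), size)]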
# if "thumb_fetch_successful" not in titles[url]["news_image"] and 'media_url_news' in titles[url]["news_image"]: if 'media_url_news' in titles[url]["news_image"]: top_image = titles[url]["news_image"]['media_url_news'] print("Need image for " + top_image + ", " + url) ## CACHE CHECKING f_base = utils.get_from_cache(top_image, "thumbs", print_hits=True) if f_base == None: f_base = make_thumb(top_image) titles[url]["news_image"] = f_base else: print("already tried fetching thumb for " + url) return titles def control(label_to_titles, s): utils.log(str(len(label_to_titles)) + " titles loaded") label_to_titles = add_image_thumbs(label_to_titles, s) utils.write_to_s3(label_to_titles, 'batch_titles_fd_' + utils.file_date() + '.json', directory=s["s3dir"]) return label_to_titles if __name__ == "__main__": sd = utils.getDefaultSettings() label_to_titles = utils.read_from_s3('batch_titles_fd_' + utils.file_date() + '.json', directory=sd["s3dir"]) control(label_to_titles, sd)
                max_embed = this_embed
            # for links, add to "labels" so we can do the short-url processing
            # after; for refs, create the entry "labels_proc" so we preserve the
            # original ref and can store the quoted status
            root["satellite_enhanced"]["labels"]["quoted_labels_links_deep"] = max_url
            root["satellite_enhanced"]["labels"]["quoted_labels_twrefs_deep"] = max_embed
            if max_embed is not None and max_embed != []:
                root["satellite_enhanced"]["labels"]["quoted_labels_twrefs_deep_status"] = qt
        else:
            existing_cnt += 1
    utils.log(existing_cnt, "Existing Count: ")
    return statuses_enhanced


def control(batch_enhanced, s):
    utils.log("", "Starting deep trace")
    # batch_enhanced = utils.read_from_s3(
    #     utils.file_name(sufix="_enhanced"), directory="data-aws/gen_two/")
    batch_enhanced = trace_links_down(batch_enhanced, s)
    # utils.write_to_s3(
    #     batch_enhanced,
    #     utils.file_name(sufix="_batch_enhanced_c"),
    #     directory=s["s3dir"])
    return batch_enhanced


if __name__ == "__main__":
    sd = utils.getDefaultSettings()
    batch_enhanced = utils.read_from_s3(
        utils.file_name(sufix="_batch_enhanced"),
        directory=sd["s3dir"])
    control(batch_enhanced, sd)
    label_data = process_label_dict(filt_label_dict, label_to_titles)
    sorted_label_data_list = label_data_to_list(label_data)
    meta = make_meta(label_data)
    final_data = {"label_data": sorted_label_data_list, "meta_data": meta}
    # name = "d3-" + utils.file_date() + "-label_format" + ".json"
    # utils.write_to_s3(
    #     json.dumps(final_data),
    #     name,
    #     directory=s["s3dir"] + 'production/',
    #     public=True)
    normalized_name = s["name"] + ".json"
    utils.write_to_s3(
        json.dumps(final_data),
        normalized_name,
        directory='data-aws/shared_data/production/' + utils.file_date() + "/",
        public=True)
    return None


if __name__ == "__main__":
    debug = True
    sd = utils.getDefaultSettings()
    filt_label_dict = utils.read_from_s3(
        utils.file_name(prefix='_batch_filt_label_dict_enhanced_'),
        directory=sd["s3dir"])
    label_to_titles = utils.read_from_s3(
        'batch_titles_fd_' + utils.file_date() + '.json',
        directory=sd["s3dir"])
    control(filt_label_dict, label_to_titles, sd)
def filter_on_day(stat):
    current_day_key = utils.file_date()
    dd = utils.make_local(stat["created_at"])
    key = str(dd.day) + "-" + str(dd.month) + "-" + str(dd.year)
    return key == current_day_key


def enhance(batch_enhanced):
    filtered_enhanced_batch = []
    utils.log(len(batch_enhanced), "Number batch statuses: ")
    for stat in batch_enhanced:
        enhanced = {}
        enhanced["created_at_tz"] = add_time_zone_date(stat)
        enhanced["labels"] = get_combined_labels(stat)
        ref_cnt = 0
        for key, val in enhanced["labels"].items():
            ref_cnt += len(val)
        if ref_cnt > 0:
            stat["satellite_enhanced"] = enhanced
            filtered_enhanced_batch.append(stat)
    utils.log(len(filtered_enhanced_batch), "Number enhanced batch statuses: ")
    return filtered_enhanced_batch


def control(date_filtered_batch, s):
    batch_enhanced = enhance(date_filtered_batch)
    return batch_enhanced


if __name__ == "__main__":
    sd = utils.getDefaultSettings()
    date_filtered_batch = utils.read_from_s3(
        utils.file_name(sufix="_batch_enhanced_full"),
        directory=sd["s3dir"])
    control(date_filtered_batch, sd)
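# filter_on_day compares its "day-month-year" key against utils.file_date(),
# so file_date presumably returns the current local date in that same format
# (it also stamps the batch_titles_fd_ file names). A sketch of that assumed
# format:
from datetime import datetime


def file_date_sketch():
    now = datetime.now()
    return str(now.day) + "-" + str(now.month) + "-" + str(now.year)  # e.g. "14-3-2021"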
def control(batch_enhanced, s):
    filtered_label_dict = utils.read_from_s3(
        utils.file_name(prefix="_batch_filt_label_dict_enhanced_fld"),
        directory=s["s3dir"])
    filtered_label_dict, processed_list = make_filtered_label_dict(
        batch_enhanced, filtered_label_dict)
    utils.write_to_s3(
        filtered_label_dict,
        utils.file_name(prefix="_batch_filt_label_dict_enhanced_fld"),
        directory=s["s3dir"])
    utils.write_to_s3(
        processed_list,
        utils.file_name(sufix="_processed_list_fld"),
        directory=s["s3dir"])
    return filtered_label_dict
def report(job_id, x_axis, y_axis, min_members=None):
    """
    Generate a report for a job.

    Parameters
    ----------
    job_id: str
    x_axis: str
        Name of column from user dataset to be used for the x axis of the plot
    y_axis: str
        Name of column from user dataset to be used for the y axis of the plot
    min_members: int, optional
        Minimum number of members required in all clusters in an experiment
        to consider the experiment for the report.

    Returns
    -------
    dict or str
        Template context for the report page, or an error message if the job
        is missing or still running.
    """
    # job_id is valid
    tasks = dynamo_get_tasks(job_id)
    if len(tasks) == 0:
        return 'No task found'
    n_tasks_done = len([x for x in tasks if x['task_status'] == 'done'])
    if len(tasks) != n_tasks_done:
        return 'All tasks not completed yet for job ID: {} {}/{}'.format(
            job_id, n_tasks_done, len(tasks))

    # all tasks are done
    if min_members is None:
        min_members = 10
    tasks = filter_by_min_members(tasks, min_members=min_members)
    start_time_date, start_time_clock = format_date_time(
        tasks[0]['created_time'])
    covar_types, covar_tieds, ks, labels, bics, task_ids = tasks_to_best_results(
        tasks)

    data, columns = read_from_s3(job_id, 0, S3_BUCKET, tasks[0]['s3_file_key'])
    if x_axis is None or y_axis is None:
        # Default to the first two dataset columns that are not on the
        # exclude list.
        viz_columns = [
            c for c in columns if c.lower().strip() not in EXCLUDE_COLUMNS
        ][:2]
    else:
        viz_columns = [x_axis, y_axis]
    spatial_columns = [c for c in columns if c.lower() in SPATIAL_COLUMNS][:2]

    # recommended k for all covariance types
    covar_type_tied_k = {}
    for covar_type in covar_types:
        covar_type_tied_k[covar_type.capitalize()] = {}
    for covar_type, covar_tied, k in zip(covar_types, covar_tieds, ks):
        covar_type_tied_k[covar_type.capitalize()][
            ['Untied', 'Tied'][covar_tied]] = k

    # task_id for all recommended assignments
    covar_type_tied_task_id = {}
    for covar_type in covar_types:
        covar_type_tied_task_id[covar_type.capitalize()] = {}
    for covar_type, covar_tied, task_id in zip(covar_types, covar_tieds, task_ids):
        covar_type_tied_task_id[covar_type.capitalize()][
            ['Untied', 'Tied'][covar_tied]] = task_id

    result = dict(job_id=job_id, min_members=min_members,
                  covar_type_tied_k=covar_type_tied_k,
                  covar_type_tied_task_id=covar_type_tied_task_id,
                  columns=columns, viz_columns=viz_columns,
                  spatial_columns=spatial_columns,
                  start_time_date=start_time_date,
                  start_time_clock=start_time_clock)
    return result
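# Example usage with a hypothetical job ID and column names; report() returns
# either an error string or a dict of template context, so a caller would
# typically hand the dict to a template engine rather than serve it directly:
if __name__ == "__main__":
    result = report("example-job-id", x_axis="longitude", y_axis="latitude",
                    min_members=10)
    if isinstance(result, str):
        print(result)  # no tasks found, or tasks still running
    else:
        print(result["covar_type_tied_k"])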