def make_highlighting_report_for_each_session(part='digital reading', redo=False, use_edit_dir=True):
    # get the session names
    session_names = get_session_names()
    for sess_name in session_names:
        sess = Session(sess_name)
        # skip sessions where the highlights have not been completely calculated
        if not os.path.isfile(sess.dir_name + os.sep + 'time_to_find_highlights.json'):
            continue
        # don't recalculate
        if not redo and os.path.isfile(sess.dir_name + os.sep + 'time_to_make_highlight_report.json'):
            continue
        time_to_make_report = {}
        t0 = time.time()
        report = make_report(sess, part=part, use_edit_dir=use_edit_dir)
        with open(sess.dir_name + os.sep + settings.highlighting_report, 'w') as fp:
            json.dump(report, fp, indent=4, sort_keys=False)
        time_to_make_report['make_report'] = time.time() - t0
        with open(sess.dir_name + os.sep + 'time_to_make_highlight_report.json', 'w') as fp:
            json.dump(time_to_make_report, fp)

def find_highlights_for_each_session(no_bolding=False, img_dir_path=settings.frame_images_dir,
                                     part='digital reading', dir_to_read_from=settings.global_id_dir,
                                     dir_to_save_to=settings.highlights_dir, recalculate=False):
    # get the session names
    session_names = get_session_names()
    for sess_name in session_names:
        sess = Session(sess_name)
        # skip sessions where the global ids have not been completely assigned
        if not os.path.isfile(sess.dir_name + os.sep + 'time_to_assign_ids.json'):
            continue
        # don't recalculate
        if not recalculate and os.path.isfile(sess.dir_name + os.sep + 'time_to_find_highlights.json'):
            continue
        time_to_find_highlights = {}
        t0 = time.time()
        find_all_highlights(sess, part=part, no_bolding=no_bolding, img_dir_path=img_dir_path,
                            dir_to_read_from=dir_to_read_from, dir_to_save_to=dir_to_save_to,
                            recalculate=recalculate)
        time_to_find_highlights['find_all_highlights'] = time.time() - t0
        with open(sess.dir_name + os.sep + 'time_to_find_highlights.json', 'w') as fp:
            json.dump(time_to_find_highlights, fp)

def assign_global_ids_to_each_session(redo=False, part='digital reading', error_dir=settings.error_dir):
    # get the session names
    session_names = get_session_names()
    for sess_name in session_names:
        sess = Session(sess_name)
        # avoid sessions where the corrections have not been completely calculated
        if not os.path.isfile(sess.dir_name + os.sep + 'time_to_cleanup_ocr.json'):
            continue
        # don't recalculate
        if not redo and os.path.isfile(sess.dir_name + os.sep + 'time_to_assign_ids.json'):
            continue
        time_to_assign_ids = {}
        t0 = time.time()
        assign_global_ids_from_correct_file(sess, part=part, redo=redo, error_dir=error_dir)
        time_to_assign_ids['assign_global_ids_from_correct_file'] = time.time() - t0
        with open(sess.dir_name + os.sep + 'time_to_assign_ids.json', 'w') as fp:
            json.dump(time_to_assign_ids, fp)

def run_initial_ocr_and_time_on_each_session(redo=False, cutoff_parts=True):
    session_names = get_session_names()
    for sess_name in session_names:
        sess = Session(sess_name)
        # don't recalculate
        if not redo and os.path.isfile(sess.dir_name + os.sep + 'time_to_run_initial_ocr.json'):
            continue
        time_to_build = run_initial_ocr_and_time(sess, cutoff_parts=cutoff_parts)
        with open(sess.dir_name + os.sep + 'time_to_run_initial_ocr.json', 'w') as fp:
            json.dump(time_to_build, fp)

def assign_global_ids_to_corrected_files_for_each_session():
    # get the session names
    session_names = get_session_names()
    for sess_name in session_names:
        sess = Session(sess_name)
        # assign the global ids, reading from the corrected files in the editor directory
        assign_global_ids_from_correct_file(
            sess, part='digital reading', redo=True, error_dir=settings.error_dir,
            source_dir_name=settings.editor_dir, alt_dir_name=settings.editor_dir,
            start_from_scratch=True)

def make_highlighting_images(part='digital reading', redo=False):
    session_names = get_session_names()
    for sess_name in session_names:
        sess = Session(sess_name)
        # avoid sessions where there is no highlighting report
        if not os.path.isfile(sess.dir_name + os.sep + settings.highlighting_report):
            continue
        # don't recalculate
        if not redo and os.path.isfile(sess.dir_name + os.sep + 'time_to_make_highlight_image.json'):
            continue
        time_to_make_matrix = {}
        t0 = time.time()
        get_highlight_visualization_matrix(sess, part=part, redo=redo)
        time_to_make_matrix['make_matrix'] = time.time() - t0
        with open(sess.dir_name + os.sep + 'time_to_make_highlight_image.json', 'w') as fp:
            json.dump(time_to_make_matrix, fp)

def get_reading_times():
    all_session_names = get_session_names()
    all_times = []
    for sess_name in all_session_names:
        sess = Session(sess_name)
        reading_times = [x for x in sess.metadata if x['part'] == 'digital reading']
        if len(reading_times) == 0:
            start_time = None
            end_time = None
        else:
            start_time = min([x['start_time'] for x in reading_times])
            end_time = max([x['end_time'] for x in reading_times])
        all_times.append([sess_name, start_time, end_time])
    with open('reading_times.csv', 'w') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['sess_name', 'start_time', 'end_time'])
        for row in all_times:
            writer.writerow(row)

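# A minimal sketch (not part of the original pipeline) of how the CSV written by
# get_reading_times() could be consumed. It assumes start_time/end_time are
# numeric timestamps; sessions with no digital reading part have empty fields,
# since csv.writer serializes None as an empty string.
def summarize_reading_times(csv_path='reading_times.csv'):
    durations = {}
    with open(csv_path) as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            # skip sessions that had no digital reading part
            if row['start_time'] and row['end_time']:
                durations[row['sess_name']] = float(row['end_time']) - float(row['start_time'])
    return durations
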
def run_cleanup_session_and_time_on_each_session(redo=False, part='digital reading'):
    # get the correct bags of words and a dictionary matching words to documents
    correct_bags = get_correct_bags()
    word_to_doc = make_matching_dictionary(correct_bags)
    # get the session names
    session_names = get_session_names()
    for sess_name in session_names:
        sess = Session(sess_name)
        # don't recalculate
        if not redo and os.path.isfile(sess.dir_name + os.sep + 'time_to_cleanup_ocr.json'):
            continue
        time_to_cleanup = cleanup_session(sess, correct_bags, word_to_doc, redo=redo,
                                          stop_at_lines=False, alt_dir_name=None, part=part)
        with open(sess.dir_name + os.sep + 'time_to_cleanup_ocr.json', 'w') as fp:
            json.dump(time_to_cleanup, fp)

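# The 'time_to_*.json' guards above chain the stages into a fixed order: each
# stage skips sessions whose previous stage has not written its timing file.
# A hypothetical driver (assuming all of these functions are importable from
# one place) would run the stages like this:
def run_full_pipeline(redo=False):
    run_initial_ocr_and_time_on_each_session(redo=redo)      # writes time_to_run_initial_ocr.json
    run_cleanup_session_and_time_on_each_session(redo=redo)  # writes time_to_cleanup_ocr.json
    assign_global_ids_to_each_session(redo=redo)             # writes time_to_assign_ids.json
    find_highlights_for_each_session(recalculate=redo)       # writes time_to_find_highlights.json
    make_highlighting_report_for_each_session(redo=redo)     # writes the highlighting report
    make_highlighting_images(redo=redo)                      # writes time_to_make_highlight_image.json
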
                                 y_bottom - y_top, facecolor='b', alpha=1.0 / len(data))
        # pyplot has no add_patch; add the rectangle to the current axes instead
        plt.gca().add_patch(rect)
    # scale the plot to fit over the image
    plt.xlim([0, width])
    plt.ylim([height, 0])
    # get rid of the x and y ticks
    plt.yticks([], [])
    plt.xticks([], [])
    plt.savefig(sess.dir_name + os.sep + 'dwell_heatmap.png', dpi=800)


if __name__ == '__main__':
    for sess_name in get_session_names():
        print(sess_name)
        sess = Session(sess_name)
        plot_scroll(sess, xml_dir_extention=None, start_time=None, end_time=None,
                    reset_time=False, filename='scrolling.png')
        if not os.path.isdir(sess.dir_name + os.sep + 'hocr-files'):
            print('no hocr')
            continue
        if not os.path.isdir(sess.dir_name + os.sep + 'xml-files'):
            print('no xml')
            continue
        hocr_files = len(os.listdir(sess.dir_name + os.sep + 'hocr-files'))
def save_scrolling_for_all_sessions(part='digital reading'):
    for sess_name in get_session_names():
        sess = Session(sess_name)
        calculate_scrolling(sess, part=part, redo=True)

    for filename in os.listdir(dir_name):
        if filename.startswith('.'):
            continue
        # infer the zero padding of the frame numbers from the first real filename
        length_of_zeros = len(filename[:filename.find('.')])
        break
    # if there is nothing in the frames directory, exit
    if length_of_zeros is None:
        return
    # set the matching string for ffmpeg's image sequence input
    frame_matching_string = dir_name + '%0' + str(length_of_zeros) + 'd.png'
    command = [
        'ffmpeg', '-r', str(1 / settings.little_t), '-i', frame_matching_string,
        '-vcodec', 'mpeg4', '-y', sess.dir_name + os.sep + settings.explanation_video
    ]
    # build the video
    subprocess.call(command)


if __name__ == '__main__':
    session_names = get_session_names()
    # only make an explanation for the first session
    # be aware this takes a VERY long time per video
    for sess_name in session_names[:1]:
        sess = Session(sess_name)
        # make a visualization
        make_explanation_frames_for_session(sess)
        make_explanation_video(sess)
def make_highlighting_viz_for_all_sessions(part='digital reading'):
    session_names = get_session_names()
    for sess_name in session_names:
        sess = Session(sess_name)
        # make a visualization
        make_highlight_viz(sess, part=part)

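# A hypothetical entry point, mirroring the __main__ blocks in the other
# modules above; it assumes this function lives in its own runnable module.
if __name__ == '__main__':
    make_highlighting_viz_for_all_sessions(part='digital reading')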