def save_doc():
    global reading_index
    if request.method == 'POST':
        # get time spent editing in seconds
        time_spent_editting = time.time() - edit_start_time
        if len(request.form['changes_dict']) == 0:
            changes = {}
        else:
            changes = eval(request.form['changes_dict'])
        button = request.form['button']
        print('changes', changes)
        # open document
        filetime = reading_times[reading_index]
        # open the already edited document if it exists
        to_save_to = editor_folder + os.sep + time_to_filename(filetime, extension='hocr')
        print('saving_to', to_save_to)
        if os.path.isfile(to_save_to):
            filepath = to_save_to
        else:
            filepath = sess.dir_name + os.sep + source_dirs[source_dir_index] + os.sep + time_to_filename(filetime, extension='hocr')
        doc = Document(filepath, output_dir=editor_folder)
        if 'seconds_spent_editting' in doc.attrs:
            time_spent_editting += eval(doc.attrs['seconds_spent_editting'])
        doc.attrs['seconds_spent_editting'] = time_spent_editting
        # record the path of the file that was edited
        if not os.path.isfile(to_save_to):
            doc.attrs['editted_from_path'] = filepath
        # get the words with the same order and filters as the page
        img_path = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + time_to_filename(filetime, extension='jpg')
        img = mpimg.imread(img_path)
        all_words = word_list(doc, img.shape)
        # make changes
        for id_key in changes:
            index = int(id_key)
            all_words[index].text = changes[id_key][0]
            all_words[index].attrs['highlight'] = changes[id_key][1]
            all_words[index].attrs['editted_by_human'] = 'True'
        # save changes
        doc.save()
        # iterate the index
        if button == 'Next':
            if reading_index + 1 < len(reading_times):
                reading_index += 1
                return redirect('/doc?reading_index={}'.format(reading_index))
            else:
                return redirect('/')
        else:
            if reading_index > 0:
                reading_index -= 1
                return redirect('/doc?reading_index={}'.format(reading_index))
            else:
                return redirect('/')
    # if no data was sent, go home
    return redirect('/')

def get_mapping_function(sess, frame_document, static_list_of_lines, include_low_confidence=False, debug_mode=False):
    doc_lines = [l for l in frame_document.lines if l.attrs['updated_line'] != '']
    if not include_low_confidence:
        doc_lines = doc_lines[1:-1]
    static_line_dict = dict([(line_text, (top, bottom)) for line_text, top, bottom in static_list_of_lines])
    # don't count documents without sufficient lines
    if len(doc_lines) <= 1:
        return lambda x: None
    multipliers = []
    adders = []
    rows = []
    # look at a bunch of line pairs
    for first_line_index in range(0, (len(doc_lines)-1)):
        first_line = doc_lines[first_line_index]
        for last_line_index in range((first_line_index + 1), len(doc_lines)):
            last_line = doc_lines[last_line_index]
            m, a = get_possible_map_values(first_line, last_line, static_line_dict)
            multipliers.append(m)
            adders.append(a)
            rows.append([first_line.attrs['updated_line'], last_line.attrs['updated_line'], m, a])
    # save for debugging
    if debug_mode:
        df = pd.DataFrame(rows, columns=['first', 'last', 'multiplier', 'adder'])
        if not os.path.isdir(sess.dir_name + os.sep + settings.mapping_dir):
            os.mkdir(sess.dir_name + os.sep + settings.mapping_dir)
        df.to_csv(sess.dir_name + os.sep + settings.mapping_dir + os.sep + time_to_filename(frame_document.time, extension='csv'), index=False)
    # final values
    final_m = np.median(multipliers)
    final_a = np.median(adders)
    return lambda x: (x * final_m) + final_a

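# Illustrative sketch (not part of the pipeline): get_mapping_function above estimates a single
# linear map y_document = m * y_frame + a by fitting every pair of matched lines and taking the
# median of the candidate (multiplier, adder) values. The synthetic coordinates and the helper
# name _demo_pairwise_mapping_fit below are assumptions for illustration only.
def _demo_pairwise_mapping_fit():
    frame_tops = [120.0, 340.0, 560.0, 790.0]        # line tops seen in one frame (pixels)
    static_tops = [1000.0, 1110.0, 1220.0, 1335.0]   # the same lines' tops in the stitched document
    multipliers, adders = [], []
    for i in range(len(frame_tops) - 1):
        for j in range(i + 1, len(frame_tops)):
            # each pair of lines gives one candidate scale and offset
            m = (static_tops[j] - static_tops[i]) / (frame_tops[j] - frame_tops[i])
            a = static_tops[i] - m * frame_tops[i]
            multipliers.append(m)
            adders.append(a)
    # the median keeps the fit robust to a few badly matched lines
    final_m, final_a = np.median(multipliers), np.median(adders)
    mapping = lambda y: y * final_m + final_a
    # approximate document coordinates of the top and bottom of a 1080px-tall frame
    return mapping(0.0), mapping(1080.0)
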
def get_last_frame_files(sess, dir_name='last_frames', part='digital reading'):
    # get the times for this session
    reading_times = [x for x in sess.metadata if x['part'] == part]
    reading_times = [t for reading_interval in reading_times for t in reading_interval['transitions']]
    if len(reading_times) == 0:
        return
    last_time = max(reading_times)
    # copy the hocr and image files from the last frame of this session to the directory
    path_to_hocr = sess.dir_name + os.sep + settings.global_id_dir + os.sep + time_to_filename(last_time, extension='hocr')
    path_to_image = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + time_to_filename(last_time, extension='jpg')
    sess_name = sess.id
    # make the directory if it doesn't exist
    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)
    # copy the files
    copyfile(path_to_hocr, dir_name + os.sep + sess_name + '.hocr')
    copyfile(path_to_image, dir_name + os.sep + sess_name + '.jpg')

def get_documents(sess, redo=False, alt_dir_name=None, part='digital reading', source_dir_name=settings.hocr_dir, edit_dir=None):
    # need the session directory path to all the documents
    dir_name = sess.dir_name + os.sep + source_dir_name
    if edit_dir is not None:
        edit_dir_name = sess.dir_name + os.sep + edit_dir
    # if there are no hocr files to clean, we should move on
    if not os.path.isdir(dir_name):
        return []
    # get the times for this session
    reading_times = [x for x in sess.metadata if x['part'] == part]
    reading_times = [t for reading_interval in reading_times for t in reading_interval['transitions']]
    # get the documents for this session
    documents = []
    bad_filepaths = []
    for time in reading_times:
        filename = time_to_filename(time, extension='hocr')
        if edit_dir is not None and os.path.isfile(edit_dir_name + os.sep + filename):
            filepath = edit_dir_name + os.sep + filename
        else:
            filepath = dir_name + os.sep + filename
        # don't re-calculate already finished files
        if not redo:
            alt_dir_name = alt_dir_name if alt_dir_name is not None else source_dir_name
            xml_path = sess.dir_name + os.sep + alt_dir_name + os.sep + filename
            if os.path.isfile(xml_path):
                continue
        # check to make sure the filepath is a valid document
        try:
            doc = Document(filepath, output_dir=alt_dir_name, output_dir_relative=True, time_in_seconds=filename_to_time(filepath))
            documents.append(doc)
        except Exception as e:
            bad_filepaths.append(filepath)
    # get rid of any documents which don't have lines
    # (print out how many of these there are)
    have_lines = [d for d in documents if (len(d.lines) > 0 and 'raised_error' not in d.attrs)]
    if len(bad_filepaths) > 0 or len(have_lines) < len(documents):
        print(len(bad_filepaths) + len(documents) - len(have_lines), 'bad documents in', dir_name)
    documents = have_lines
    return documents

def run_tesseract(sess, redo=False, part='digital reading'):
    reading_times = [x for x in sess.metadata if x['part'] == part]
    for reading_interval in reading_times:
        for image_time in reading_interval['transitions']:
            filename = time_to_filename(image_time)
            image_dir = sess.dir_name + os.sep + settings.images_ready_for_ocr
            image_path = image_dir + os.sep + filename
            hocr_dir = sess.dir_name + os.sep + settings.hocr_dir
            if not os.path.isdir(hocr_dir):
                os.mkdir(hocr_dir)
            hocr_path = hocr_dir + os.sep + '.'.join(filename.split('.')[:-1])
            run_tesseract_on_image(image_path, hocr_path, redo)

def edit_doc():
    global edit_start_time, source_dir_index, reading_index
    if request.args.get('reading_index') is not None:
        try:
            index = int(request.args['reading_index'])
        except Exception as e:
            return redirect('/')
        if index < 0 or index >= len(reading_times):
            return redirect('/')
        reading_index = index
    print('reading_index', reading_index)
    print('reading_time', reading_times[reading_index])
    print('highlights')
    for l in highlight_data[reading_times[reading_index]]:
        print(l)
    edit_start_time = time.time()
    filetime = reading_times[reading_index]
    # display the already edited document if it exists
    to_save_to = editor_folder + os.sep + time_to_filename(filetime, extension='hocr')
    if os.path.isfile(to_save_to):
        filepath = to_save_to
    else:
        source_dir_index = randint(0, len(source_dirs)-1)
        filepath = sess.dir_name + os.sep + source_dirs[source_dir_index] + os.sep + time_to_filename(filetime, extension='hocr')
    print('serving from', filepath)
    doc = Document(filepath)
    img_path = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + time_to_filename(filetime, extension='jpg')
    img = mpimg.imread(img_path)
    img_save_path = 'static/imgs/' + sess.id + '_' + time_to_filename(filetime, extension='jpg')
    mpimg.imsave(img_save_path, img)
    # get the color hex numbers
    color_mapping = [[k, rgb2hex(highlight_color_pairs[k][0]), rgb2hex(highlight_color_pairs[k][1])] for k in highlight_color_pairs]
    # data to render in the template
    data = {'wordInfo': word_list_as_json(doc, img.shape), 'imgSize': list(img.shape), 'colors': color_mapping, 'imgPath': img_save_path}
    return render_template('view_doc.html', data=data)

def make_ocr_ready_images(sess, redo=False, part='digital reading', cutoff_parts=True):
    reading_times = [x for x in sess.metadata if x['part'] == part]
    for reading_interval in reading_times:
        for image_time in reading_interval['transitions']:
            filename = time_to_filename(image_time)
            full_path = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + filename
            # check if we already made this image
            # (this script may be run multiple times if it didn't finish)
            # saving on image creation and running tesseract will help a lot with time
            dir_for_bigger_images = sess.dir_name + os.sep + settings.images_ready_for_ocr
            if not os.path.isdir(dir_for_bigger_images):
                os.mkdir(dir_for_bigger_images)
            full_path_for_new_image = dir_for_bigger_images + os.sep + filename
            resize_image(full_path, full_path_for_new_image, redo=redo, part=part, cutoff_parts=cutoff_parts)

def calculate_scrolling(sess, part='digital reading', redo=False, save=True, include_low_confidence=False):
    # load if already calculated
    if redo is False:
        filepath = sess.dir_name + os.sep + settings.scrolling_csv_file
        if os.path.isfile(filepath):
            return pd.read_csv(filepath)
    # get the lines as a single list
    static_line_list = stitch_lines(sess, part=part)
    # get the list of documents
    documents = get_documents(sess, redo=True, alt_dir_name=None, part=part, source_dir_name=settings.xml_dir)
    # store all the output as rows of time, top_of_frame, bottom_of_frame
    top_value = None
    bottom_value = None
    rows = []
    # retrieve the time and top of page values for each document
    for doc in documents:
        t = doc.time
        # get dimensions of an example frame
        if top_value is None:
            frame_filepath = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + time_to_filename(t)
            img = np.array(misc.imread(frame_filepath))
            # not unscaling properly right now
            bottom_value = img.shape[0]
            top_value = 0
        mapping_function = get_mapping_function(sess, doc, static_line_list, include_low_confidence=include_low_confidence)
        y_top = mapping_function(top_value)
        y_bottom = mapping_function(bottom_value)
        # filter out unlikely scrolls (those with fewer than 4 lines recognized)
        # these times will be replaced with the times before them
        # (they are most likely to be times when the document is outside the corpus or the screen is very zoomed)
        if y_top is None:
            continue
        rows.append((t, y_top, y_bottom))
    df = pd.DataFrame(rows, columns=['Time', 'Top', 'Bottom'])
    if save:
        df.to_csv(sess.dir_name + os.sep + settings.scrolling_csv_file, index=False)
    return df

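# Illustrative sketch (assumed downstream usage, not called anywhere in the pipeline): the
# DataFrame returned by calculate_scrolling has one ['Time', 'Top', 'Bottom'] row per frame
# whose mapping succeeded. The helper name _demo_summarize_scrolling and the synthetic rows
# below are assumptions for illustration only.
def _demo_summarize_scrolling():
    rows = [(0.0, 0.0, 1080.0), (5.0, 400.0, 1480.0), (10.0, 900.0, 1980.0)]
    df = pd.DataFrame(rows, columns=['Time', 'Top', 'Bottom'])
    deepest_point_seen = df['Bottom'].max()                # lowest document pixel that appeared on screen
    total_scroll_distance = df['Top'].diff().abs().sum()   # rough total vertical scrolling in document pixels
    return deepest_point_seen, total_scroll_distance
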
def visualize_scrolling(sess, part='digital reading', picture_directory=None, include_labels=True, save_and_clear=True):
    # get the scrolling data
    data = list(calculate_scrolling(sess, part=part).apply(lambda x: (x['Time'], x['Top'], x['Bottom']), axis=1))
    if len(data) == 0:
        return
    # expand the data to cover the full time period for each frame
    data += [(data[i][0]-.05, data[i-1][1], data[i-1][2]) for i in range(1, len(data))]
    data.sort()
    x_vals, y_top_vals, y_bottom_vals = zip(*data)
    # for scaling
    # older idea: scale according to static lines
    # problem: this list is scaled arbitrarily by whatever scale the first line is chosen to be
    # static_lines = stitch_lines(sess, part=part)
    # final_line_difference = static_lines[-1][2] - static_lines[-2][2]
    # currently this is just the maximum value measured
    # would like to fix so the viz still works if the user didn't scroll to the bottom
    max_height = max(y_bottom_vals)
    # if there is a picture, load it and scale the image values
    if picture_directory is not None:
        # get the picture name
        first_reading_time = data[0][0]
        # get a sample document to find the correction document
        dir_name = sess.dir_name + os.sep + settings.highlights_dir
        sample_doc_path = dir_name + os.sep + time_to_filename(first_reading_time, extension='hocr')
        sample_doc = Document(sample_doc_path, output_dir=None)
        correction_filename = sample_doc.correct_filepath.split(os.sep)[-1]
        # get the picture associated with the correction document
        picture_directory = picture_directory if picture_directory.endswith(os.sep) else picture_directory + os.sep
        image_path = picture_directory + correction_filename[:-len('.txt')] + '.png'
        img = np.array(misc.imread(image_path))
        height, width, _ = img.shape
        plt.imshow(img)
        # scale the plot to fit over the image
        plt.xlim([0, width])
        max_width = width
        plt.ylim([height, 0])
        # save instructions for unscaling
        unscale_x = lambda x: (x/width*(max(x_vals)-min(x_vals)))+min(x_vals)
        # scale values
        x_scaled_vals = [(x-min(x_vals))/(max(x_vals)-min(x_vals))*width for x in x_vals]
        y_scaled_top_vals = [y/max_height*height for y in y_top_vals]
        y_scaled_bottom_vals = [y/max_height*height for y in y_bottom_vals]
        # plot data on top
        plt.fill_between(x_scaled_vals, y_scaled_top_vals, y_scaled_bottom_vals, alpha=.2)
    else:
        unscale_x = lambda x: x
        # scale the plot to fit the data
        plt.xlim([0, max(x_vals)])
        max_width = max(x_vals)
        plt.ylim([max_height, 0])
        # plot data on top
        plt.fill_between(x_vals, y_top_vals, y_bottom_vals, alpha=.2)
    # add the labels
    if include_labels:
        plt.xlabel('Time Since Session Started')
        plt.ylabel('Vertical Position of Screen')
    # fix x ticks
    x_tick_vals = [x for x in plt.xticks()[0] if x <= max_width]
    x_tick_vals_unscaled = [unscale_x(x) for x in x_tick_vals]
    num_to_str = lambda num: str(int(num)) if num >= 10 else '0'+str(int(num))
    to_time_str = lambda t: num_to_str(int(t/60)) + ':' + num_to_str(t-(int(t/60)*60))
    plt.xticks(x_tick_vals, [to_time_str(xt) for xt in x_tick_vals_unscaled])
    # get rid of y ticks
    plt.yticks([], [])
    if save_and_clear:
        plt.savefig(sess.dir_name + os.sep + settings.scrolling_viz_file, dpi=800)
        # make sure there is nothing still in the figure
        plt.clf()

def make_explanation_frames_for_session(sess, part='digital reading'):
    # get the longest reading interval
    # (in the future it might be good to make multiple visualizations)
    reading_times = max([x for x in sess.metadata if x['part'] == part],
                        key=lambda x: max(x['transitions']) - min(x['transitions']))
    reading_times = reading_times['transitions']
    if len(reading_times) == 0:
        return
    reading_times.sort()
    all_times = list(range(int(reading_times[0] / settings.little_t), int(reading_times[-1] / settings.little_t)))
    length_of_indexes = len(str(len(all_times)))
    # get all the frame times that frames exist for, not just reading times (this will make the video less choppy)
    frame_times = [filename_to_time(f) for f in os.listdir(sess.dir_name + os.sep + settings.frame_images_dir) if not f.startswith('.')]
    frame_times.sort()
    # set the max and min times to scale the time appropriately for the scrolling visual
    min_time = reading_times[0]
    max_time = reading_times[-1]

    # a function to turn a frame index into a zero-padded filename
    def index_to_filename(index):
        output = str(index)
        while len(output) < length_of_indexes:
            output = '0' + output
        return output + '.png'

    # the indexes for the two types of frames
    explanatory_frame_index = 0
    frame_times_index = 0
    # go through the times and make frames for the explanation video
    for time_div_little_t in all_times:
        # set the time
        time = time_div_little_t * settings.little_t
        # increment the index to the frames if applicable
        if (frame_times_index + 1) < len(frame_times) and frame_times[frame_times_index + 1] <= time:
            frame_times_index += 1
        # set the filenames for the explanatory frame and the screen shot frame
        series_filename = index_to_filename(explanatory_frame_index)
        frame_filename = time_to_filename(frame_times[frame_times_index])
        # make the explanatory frame
        make_explanation_frame(sess, time, frame_filename, series_filename, min_time, max_time, part=part)
        # increment the explanatory frame index
        explanatory_frame_index += 1

def make_highlight_viz(sess, words_per_label=3, part='digital reading', save_and_clear=True, include_labels=True):
    matrix = get_highlight_visualization_matrix(sess, part=part)
    max_height, max_width, _ = matrix.shape
    plt.imshow(matrix)
    # set the x ticks (words)
    # get the longest reading interval
    # (in the future it might be good to make multiple visualizations)
    reading_times = max([x for x in sess.metadata if x['part'] == part],
                        key=lambda x: max(x['transitions']) - min(x['transitions']))
    reading_times = reading_times['transitions']
    if len(reading_times) == 0:
        return
    # get a sample document to find the correction document
    dir_name = sess.dir_name + os.sep + settings.highlights_dir
    sample_doc_path = dir_name + os.sep + time_to_filename(reading_times[0], extension='hocr')
    sample_doc = Document(sample_doc_path, output_dir=None)
    correction_filename = sample_doc.correct_filepath.split(os.sep)[-1]
    # get the word labels
    word_labels = get_word_labels(correction_filename)
    # map the pixel values to words to get the labels
    word_lengths = [len(w) for w in word_labels]
    char_index_to_word_index_mapping = {}
    word_index = 0
    for char_index in range(sum(word_lengths)):
        if sum(word_lengths[:(word_index + 1)]) < char_index:
            word_index += 1
        char_index_to_word_index_mapping[char_index] = word_index
    x_tick_vals = [x for x in plt.xticks()[0] if (x >= 0) and (x <= max_width)]
    x_tick_labels = []
    for x_tick in x_tick_vals:
        word_index = char_index_to_word_index_mapping[x_tick]
        last_index = min([word_index + words_per_label, len(word_labels)])
        x_tick_labels.append(' '.join(word_labels[word_index:last_index]))
    plt.xticks(x_tick_vals, x_tick_labels, rotation=15, fontsize=5)
    # set the y ticks (time)
    y_tick_vals = [y for y in plt.yticks()[0] if (y >= 0) and (y <= max_height)]
    num_to_str = lambda num: str(int(num)) if num >= 10 else '0' + str(int(num))
    to_time_str = lambda t: num_to_str(int(t / 60)) + ':' + num_to_str(t - (int(t / 60) * 60))
    plt.yticks(y_tick_vals, [to_time_str(yt * settings.little_t) for yt in y_tick_vals])
    if include_labels:
        plt.xlabel('Article Text')
        plt.ylabel('Time')
    # add space for labels and tick marks
    plt.tight_layout()
    if save_and_clear:
        plt.savefig(sess.dir_name + os.sep + settings.highlighting_viz_file, dpi=800)
        plt.clf()

def get_highlight_visualization_matrix(sess, part='digital reading', redo=False):
    # load from image if not redoing and it exists
    if not redo and os.path.isfile(sess.dir_name + os.sep + settings.highlighting_image_file):
        return np.array(Image.open(sess.dir_name + os.sep + settings.highlighting_image_file))
    # get the longest reading interval
    # (in the future it might be good to make multiple visualizations)
    reading_times = max([x for x in sess.metadata if x['part'] == part],
                        key=lambda x: max(x['transitions']) - min(x['transitions']))
    reading_times = reading_times['transitions']
    if len(reading_times) == 0:
        return
    # get a sample document to find the correction document
    dir_name = sess.dir_name + os.sep + settings.highlights_dir
    sample_doc_path = dir_name + os.sep + time_to_filename(reading_times[0], extension='hocr')
    sample_doc = Document(sample_doc_path, output_dir=None)
    correction_filename = sample_doc.correct_filepath.split(os.sep)[-1]
    # get the lengths of all words
    word_lengths = get_word_lengths(correction_filename)
    total_word_length = sum(word_lengths)
    # get highlighting report
    data = get_user_data(sess)
    # calculate mapping from (time, word) -> color
    min_time = int(min(reading_times) / settings.little_t)
    max_time = int(max(reading_times) / settings.little_t)
    mapping = {}
    for time in data:
        for word_obj in data[time]:
            for w_id in word_obj['id_group']:
                mapping[(int(float(time) * 10), int(w_id))] = word_obj['color']
    # the output image (initialize to zeros)
    # this helps catch bugs as long as black is not one of the highlight colors
    matrix = np.zeros(((max_time - min_time), total_word_length, 3), 'uint8')
    for row in range((max_time - min_time)):
        for word_id in range(len(word_lengths)):
            color_string = get_color(row + min_time, word_id, mapping, min_time)
            # get the pixel start and end of a word
            word_start = 0 if word_id == 0 else sum(word_lengths[:word_id])
            word_end = word_start + word_lengths[word_id] + 1
            for i in range(3):
                matrix[row, word_start:word_end, i] = settings.highlight_viz_colors[color_string][i]
    # save for future calls
    img = Image.fromarray(matrix).convert('RGB')
    img.save(sess.dir_name + os.sep + settings.highlighting_image_file)
    # return the image
    return matrix

def get_num_highlights(filetime):
    filepath = sess.dir_name + os.sep + source_dirs[0] + os.sep + time_to_filename(filetime, extension='hocr')
    doc = Document(filepath, calc_width=False)
    highlights = Counter([w.attrs['highlight'] for l in doc.lines for w in l.children if len(w.text) > 0])
    return highlights