Ejemplo n.º 1
0
def save_doc():
	"""Persist editor changes POSTed for the current document.

	Reads the serialized change dict and the navigation button from the
	form, applies the word-level edits, accumulates time spent editing,
	saves the document, and redirects to the next/previous document
	(or home when the index runs off either end).
	"""
	global reading_index
	if request.method == 'POST':
		# get time spent editting in seconds
		time_spent_editting = time.time() - edit_start_time

		# parse the serialized change set; ast.literal_eval accepts only
		# Python literals, unlike eval() which would execute arbitrary
		# code contained in the (untrusted) form field
		import ast
		if len(request.form['changes_dict']) == 0:
			changes = {}
		else:
			changes = ast.literal_eval(request.form['changes_dict'])
		button = request.form['button']

		print('changes', changes)

		# open document
		filetime = reading_times[reading_index]
		# open the already editted document if it exists
		to_save_to = editor_folder + os.sep + time_to_filename(filetime, extension='hocr')
		print('saving_to', to_save_to)
		if os.path.isfile(to_save_to):
			filepath = to_save_to
		else:
			filepath = sess.dir_name + os.sep + source_dirs[source_dir_index] + os.sep + time_to_filename(filetime, extension='hocr')
		doc = Document(filepath, output_dir=editor_folder)
		if 'seconds_spent_editting' in doc.attrs:
			# attrs round-trip through the file as strings; parse the prior
			# total numerically instead of eval()-ing file contents
			time_spent_editting += float(doc.attrs['seconds_spent_editting'])
		doc.attrs['seconds_spent_editting'] = time_spent_editting
		# record the path of the file that was editted
		if not os.path.isfile(to_save_to):
			doc.attrs['editted_from_path'] = filepath

		# get the words with the same order and filters as the page
		img_path = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + time_to_filename(filetime, extension='jpg')
		img = mpimg.imread(img_path)
		all_words = word_list(doc, img.shape)

		# make changes: each entry maps word index -> (new text, highlight)
		for id_key in changes:
			index = int(id_key)
			all_words[index].text = changes[id_key][0]
			all_words[index].attrs['highlight'] = changes[id_key][1]
			all_words[index].attrs['editted_by_human'] = 'True'
		# save changes
		doc.save()

		# iterate index
		if button == 'Next':
			if reading_index + 1 < len(reading_times):
				reading_index += 1
				return redirect('/doc?reading_index={}'.format(reading_index))
			else:
				return redirect('/')
		else:
			if reading_index > 0:
				reading_index -= 1
				return redirect('/doc?reading_index={}'.format(reading_index))
			else:
				return redirect('/')
	# if no data was sent, go home
	return redirect('/')
Ejemplo n.º 2
0
def get_mapping_function(sess, frame_document, static_list_of_lines, include_low_confidence=False, debug_mode=False):
	"""Estimate a linear map from frame y-coordinates to static-document coordinates.

	Every pair of matched lines in the frame yields one (multiplier, adder)
	estimate; the medians of those estimates define the returned function
	x -> x * m + a.  When the frame has fewer than two usable lines a
	function that always returns None is returned instead.
	"""
	# only lines that were matched to an updated (static) line are usable
	doc_lines = [l for l in frame_document.lines if l.attrs['updated_line'] != '']
	if not include_low_confidence:
		# drop the first and last lines, which are low-confidence matches
		doc_lines = doc_lines[1:-1]
	# map each static line's text to its (top, bottom) position
	static_line_dict = {line_text: (top, bottom) for line_text, top, bottom in static_list_of_lines}
	# don't count documents without sufficient lines
	if len(doc_lines) <= 1:
		return lambda x: None
	multipliers = []
	adders = []
	rows = []
	# collect a (multiplier, adder) estimate from every pair of lines
	for first_line_index in range(0, (len(doc_lines)-1)):
		first_line = doc_lines[first_line_index]
		for last_line_index in range((first_line_index + 1), len(doc_lines)):
			last_line = doc_lines[last_line_index]
			m, a = get_possible_map_values(first_line, last_line, static_line_dict)
			multipliers.append(m)
			adders.append(a)
			rows.append([first_line.attrs['updated_line'], last_line.attrs['updated_line'], m, a])
	# save for debugging
	if debug_mode:
		df = pd.DataFrame(rows, columns=['first', 'last', 'multiplier', 'adder'])
		if not os.path.isdir(sess.dir_name + os.sep + settings.mapping_dir):
			os.mkdir(sess.dir_name + os.sep + settings.mapping_dir)
		df.to_csv(sess.dir_name + os.sep + settings.mapping_dir + os.sep + time_to_filename(frame_document.time, extension='csv'), index=False)
	# the median is robust to the occasional mismatched line pair
	final_m = np.median(multipliers)
	final_a = np.median(adders)
	return lambda x: (x * final_m) + final_a
Ejemplo n.º 3
0
def get_last_frame_files(sess, dir_name='last_frames', part='digital reading'):
	"""Copy the hocr file and screenshot of the session's final frame into dir_name."""
	# collect every transition time recorded for the requested part
	intervals = [entry for entry in sess.metadata if entry['part'] == part]
	times = [t for interval in intervals for t in interval['transitions']]
	if not times:
		return
	final_time = max(times)
	# source paths for the last frame's OCR output and screenshot
	hocr_src = sess.dir_name + os.sep + settings.global_id_dir + os.sep + time_to_filename(final_time, extension='hocr')
	image_src = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + time_to_filename(final_time, extension='jpg')
	# make the destination directory if it doesn't exist
	if not os.path.isdir(dir_name):
		os.mkdir(dir_name)
	# copy both files, named after the session id
	copyfile(hocr_src, dir_name + os.sep + sess.id + '.hocr')
	copyfile(image_src, dir_name + os.sep + sess.id + '.jpg')
Ejemplo n.º 4
0
def get_documents(sess,
                  redo=False,
                  alt_dir_name=None,
                  part='digital reading',
                  source_dir_name=settings.hocr_dir,
                  edit_dir=None):
    """Load a Document for every reading-transition frame in a session.

    Prefers human-edited copies from edit_dir when present, skips frames
    whose output already exists (unless redo), and filters out documents
    that failed to load or contain no lines.  Returns a list of Documents.
    """
    # need the session directory path to all the documents
    dir_name = sess.dir_name + os.sep + source_dir_name
    if edit_dir is not None:
        edit_dir_name = sess.dir_name + os.sep + edit_dir
    # if there are no hocr files to clean, we should move on
    if not os.path.isdir(dir_name):
        return []
    # get the times for this session
    reading_times = [x for x in sess.metadata if x['part'] == part]
    reading_times = [
        t for reading_interval in reading_times
        for t in reading_interval['transitions']
    ]
    # get the documents for this session
    documents = []
    bad_filepaths = []
    # loop variable renamed from `time` so it no longer shadows the
    # time module used elsewhere in this file
    for frame_time in reading_times:
        filename = time_to_filename(frame_time, extension='hocr')
        # prefer the human-edited copy when one exists
        if edit_dir is not None and os.path.isfile(edit_dir_name + os.sep +
                                                   filename):
            filepath = edit_dir_name + os.sep + filename
        else:
            filepath = dir_name + os.sep + filename
        # don't re-calculate already finished files
        if not redo:
            # NOTE: this deliberately rebinds alt_dir_name, which is later
            # passed to Document as output_dir
            alt_dir_name = alt_dir_name if alt_dir_name is not None else source_dir_name
            xml_path = sess.dir_name + os.sep + alt_dir_name + os.sep + filename
            if os.path.isfile(xml_path):
                continue
        # check to make sure the filepath is a valid document
        try:
            doc = Document(filepath,
                           output_dir=alt_dir_name,
                           output_dir_relative=True,
                           time_in_seconds=filename_to_time(filepath))
            documents.append(doc)
        except Exception:
            bad_filepaths.append(filepath)
    # get rid of any documents which don't have lines
    # (print out how many of these there are)
    have_lines = [
        d for d in documents
        if (len(d.lines) > 0 and 'raised_error' not in d.attrs)
    ]
    if len(bad_filepaths) > 0 or len(have_lines) < len(documents):
        print(
            len(bad_filepaths) + len(documents) - len(have_lines),
            'bad documents in', dir_name)
        documents = have_lines
    return documents
Ejemplo n.º 5
0
def run_tesseract(sess, redo=False, part='digital reading'):
    """Run OCR over every transition frame recorded for the given session part."""
    for interval in [x for x in sess.metadata if x['part'] == part]:
        for frame_time in interval['transitions']:
            filename = time_to_filename(frame_time)
            source_dir = sess.dir_name + os.sep + settings.images_ready_for_ocr
            output_dir = sess.dir_name + os.sep + settings.hocr_dir
            if not os.path.isdir(output_dir):
                os.mkdir(output_dir)
            # tesseract appends its own extension, so strip the image's
            output_stem = '.'.join(filename.split('.')[:-1])
            run_tesseract_on_image(source_dir + os.sep + filename,
                                   output_dir + os.sep + output_stem, redo)
Ejemplo n.º 6
0
def edit_doc():
	"""Serve the document editor page for the frame selected by reading_index."""
	global edit_start_time, source_dir_index, reading_index
	# an explicit reading_index in the query string overrides the global one
	if request.args.get('reading_index') is not None:
		try:
			candidate = int(request.args['reading_index'])
		except Exception:
			return redirect('/')
		if candidate < 0 or candidate >= len(reading_times):
			return redirect('/')
		reading_index = candidate
	print('reading_index', reading_index)
	print('reading_time', reading_times[reading_index])
	print('highlights')
	for highlight_line in highlight_data[reading_times[reading_index]]:
		print(highlight_line)
	# start the editing timer for this page view
	edit_start_time = time.time()
	filetime = reading_times[reading_index]
	# display the already editted document if it exists
	to_save_to = editor_folder + os.sep + time_to_filename(filetime, extension='hocr')
	if os.path.isfile(to_save_to):
		filepath = to_save_to
	else:
		# otherwise serve the frame from a randomly chosen source directory
		source_dir_index = randint(0, len(source_dirs)-1)
		filepath = sess.dir_name + os.sep + source_dirs[source_dir_index] + os.sep + time_to_filename(filetime, extension='hocr')
	print('serving from', filepath)
	doc = Document(filepath)
	# copy the frame screenshot somewhere the static file server can reach it
	img_path = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + time_to_filename(filetime, extension='jpg')
	img = mpimg.imread(img_path)
	img_save_path = 'static/imgs/' + sess.id + '_' + time_to_filename(filetime, extension='jpg')
	mpimg.imsave(img_save_path, img)
	# hex color pairs (light/dark) for each highlight category
	color_mapping = [[name, rgb2hex(highlight_color_pairs[name][0]), rgb2hex(highlight_color_pairs[name][1])] for name in highlight_color_pairs]
	# payload consumed by the editor template's javascript
	data = {'wordInfo': word_list_as_json(doc, img.shape), 'imgSize': list(img.shape), 'colors': color_mapping, 'imgPath':img_save_path}
	return render_template('view_doc.html', data=data)
Ejemplo n.º 7
0
def make_ocr_ready_images(sess,
                          redo=False,
                          part='digital reading',
                          cutoff_parts=True):
    """Produce enlarged copies of each reading frame for tesseract to consume."""
    intervals = [x for x in sess.metadata if x['part'] == part]
    for interval in intervals:
        for frame_time in interval['transitions']:
            filename = time_to_filename(frame_time)
            source_path = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + filename
            # this script may be re-run if a previous pass didn't finish;
            # resize_image skips images that already exist unless redo is set,
            # which saves a lot of time on image creation and tesseract
            ocr_image_dir = sess.dir_name + os.sep + settings.images_ready_for_ocr
            if not os.path.isdir(ocr_image_dir):
                os.mkdir(ocr_image_dir)
            resize_image(source_path,
                         ocr_image_dir + os.sep + filename,
                         redo=redo,
                         part=part,
                         cutoff_parts=cutoff_parts)
Ejemplo n.º 8
0
def calculate_scrolling(sess, part='digital reading', redo=False, save=True, include_low_confidence=False):
	"""Build (Time, Top, Bottom) rows giving the visible page span for each frame."""
	# reuse the cached csv unless a recalculation was requested
	if redo is False:
		cached_path = sess.dir_name + os.sep + settings.scrolling_csv_file
		if os.path.isfile(cached_path):
			return pd.read_csv(cached_path)
	# the stitched static lines give a fixed coordinate system for the page
	static_line_list = stitch_lines(sess, part=part)
	documents = get_documents(sess, redo=True, alt_dir_name=None, part=part, source_dir_name=settings.xml_dir)
	top_value = None
	bottom_value = None
	rows = []
	# retrieve the time and page-span values for each document
	for doc in documents:
		t = doc.time
		# measure frame dimensions once, from the first document's frame
		if top_value is None:
			frame_filepath = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + time_to_filename(t)
			frame = np.array(misc.imread(frame_filepath))
			# not unscalling properly right now
			bottom_value = frame.shape[0]
			top_value = 0
		mapping_function = get_mapping_function(sess, doc, static_line_list, include_low_confidence=include_low_confidence)
		y_top = mapping_function(top_value)
		y_bottom = mapping_function(bottom_value)
		# skip unlikely scrolls (those with too few recognized lines);
		# these frames usually mean the document was off-corpus or the
		# screen was very zoomed, so the previous frame's span carries over
		if y_top is None:
			continue
		rows.append((t, y_top, y_bottom))
	df = pd.DataFrame(rows, columns=['Time', 'Top', 'Bottom'])
	if save:
		df.to_csv(sess.dir_name + os.sep + settings.scrolling_csv_file, index=False)
	return df
Ejemplo n.º 9
0
def visualize_scrolling(sess, part='digital reading', picture_directory = None, include_labels=True, save_and_clear=True):
	"""Plot the vertical span of the screen over time as a filled band.

	When picture_directory is given, the band is drawn on top of the
	article image found there and both axes are rescaled to the image;
	otherwise the band is drawn on plain axes.  When save_and_clear is
	True the figure is saved to the session's scrolling viz file and
	the pyplot state is cleared.
	"""
	# get the scrolling data
	data = list(calculate_scrolling(sess, part=part).apply(lambda x: (x['Time'], x['Top'], x['Bottom']), axis=1))
	if len(data) == 0:
		return
	# expand the data to cover the full time period for each frame
	# (insert a point just before each frame that holds the previous frame's span)
	data += [(data[i][0]-.05, data[i-1][1], data[i-1][2]) for i in range(1,len(data))]
	data.sort()
	x_vals, y_top_vals, y_bottom_vals = zip(*data)
	# for scaling

	# older idea: scale according to static lines
	# problem: this list is scaled arbitrarily by whatever scale the first line is chosen to be
	# static_lines = stitch_lines(sess, part=part)
	# final_line_difference = static_lines[-1][2] - static_lines[-2][2]

	# currently this is just the maximum value measured
	# would like to fix so the viz still works if the user didn't scroll to the bottom
	max_height = max(y_bottom_vals)
	# if there is a picture, load it and scale the image values
	if picture_directory is not None:

		# get the picture name
		first_reading_time = data[0][0]

		# get a sample document to find the correction document
		dir_name = sess.dir_name + os.sep + settings.highlights_dir
		sample_doc_path = dir_name + os.sep + time_to_filename(first_reading_time, extension='hocr')
		sample_doc = Document(sample_doc_path, output_dir=None)
		correction_filename = sample_doc.correct_filepath.split(os.sep)[-1]

		# get the picture associated with the correction document
		picture_directory = picture_directory if picture_directory.endswith(os.sep) else picture_directory + os.sep
		image_path = picture_directory + correction_filename[:-len('.txt')] + '.png'
		img = np.array(misc.imread(image_path))
		height, width, _ = img.shape
		plt.imshow(img)

		# scale the plot to fit over the image
		plt.xlim([0, width])
		max_width = width
		plt.ylim([height, 0])

		# save instructions for unscaling (pixel x -> session seconds)
		unscale_x = lambda x: (x/width*(max(x_vals)-min(x_vals)))+min(x_vals)

		# scale values
		x_scaled_vals = [(x-min(x_vals))/(max(x_vals)-min(x_vals))*width for x in x_vals]
		y_scaled_top_vals = [y/max_height*height for y in y_top_vals]
		y_scaled_bottom_vals = [y/max_height*height for y in y_bottom_vals]

		# plot data on top
		plt.fill_between(x_scaled_vals, y_scaled_top_vals, y_scaled_bottom_vals, alpha=.2)
	else:
		unscale_x = lambda x: x
		# scale the plot to fit over the image
		plt.xlim([0, max(x_vals)])
		max_width = max(x_vals)
		plt.ylim([max_height, 0])
		# plot data on top
		plt.fill_between(x_vals, y_top_vals, y_bottom_vals, alpha=.2)

	# add the labels
	if include_labels:
		plt.xlabel('Time Since Session Started')
		plt.ylabel('Vertical Position of Screen')
	# fix x ticks: relabel tick positions as mm:ss of unscaled session time
	x_tick_vals = [x for x in plt.xticks()[0] if x <= max_width]
	x_tick_vals_unscaled = [unscale_x(x) for x in x_tick_vals]
	num_to_str = lambda num: str(int(num)) if num >= 10 else '0'+str(int(num))
	to_time_str = lambda t: num_to_str(int(t/60)) + ':' + num_to_str(t-(int(t/60)*60))
	plt.xticks(x_tick_vals, [to_time_str(xt) for xt in x_tick_vals_unscaled])
	# get rid of y ticks
	plt.yticks([], [])

	if save_and_clear:
		plt.savefig(sess.dir_name + os.sep + settings.scrolling_viz_file, dpi=800)
		# make sure there is nothing still in the figure
		plt.clf()
Ejemplo n.º 10
0
def make_explanation_frames_for_session(sess, part='digital reading'):
    """Render the per-timestep frames of the explanation video for a session.

    Only the longest reading interval is visualized.
    (In the future it might be good to make multiple visualizations.)
    """
    # pick the reading interval that spans the most time
    longest_interval = max(
        [x for x in sess.metadata if x['part'] == part],
        key=lambda x: max(x['transitions']) - min(x['transitions']))
    reading_times = longest_interval['transitions']

    if len(reading_times) == 0:
        return

    reading_times.sort()

    # one output frame per little_t step between the first and last transition
    all_times = list(
        range(int(reading_times[0] / settings.little_t),
              int(reading_times[-1] / settings.little_t)))
    length_of_indexes = len(str(len(all_times)))

    # use every captured frame, not just reading transitions,
    # so the video is less choppy
    frame_times = sorted(
        filename_to_time(f)
        for f in os.listdir(sess.dir_name + os.sep + settings.frame_images_dir)
        if not f.startswith('.'))

    # time bounds used to scale the scrolling visual
    min_time = reading_times[0]
    max_time = reading_times[-1]

    def index_to_filename(index):
        # zero-pad so the frame filenames sort lexicographically
        return str(index).zfill(length_of_indexes) + '.png'

    # index into frame_times of the screenshot currently on screen
    frame_times_index = 0

    # go through the times and make frames for the explanation video
    for explanatory_frame_index, time_step in enumerate(all_times):
        current_time = time_step * settings.little_t

        # advance to the latest captured frame at or before current_time
        if (frame_times_index + 1) < len(frame_times) and frame_times[
                frame_times_index + 1] <= current_time:
            frame_times_index += 1

        # make the explanatory frame for this timestep
        make_explanation_frame(sess,
                               current_time,
                               time_to_filename(frame_times[frame_times_index]),
                               index_to_filename(explanatory_frame_index),
                               min_time,
                               max_time,
                               part=part)
Ejemplo n.º 11
0
def make_highlight_viz(sess,
                       words_per_label=3,
                       part='digital reading',
                       save_and_clear=True,
                       include_labels=True):
    """Plot the highlight matrix with article-text x labels and time y labels.

    X tick positions (character columns of the matrix) are labeled with the
    words at that position; y ticks are labeled as mm:ss.
    """
    matrix = get_highlight_visualization_matrix(sess, part=part)
    max_height, max_width, _ = matrix.shape
    plt.imshow(matrix)

    # set the x ticks (words)
    # get the largest reading time
    # (in the future it might be good to make multiple visualizations)
    reading_times = max(
        [x for x in sess.metadata if x['part'] == part],
        key=lambda x: max(x['transitions']) - min(x['transitions']))
    reading_times = reading_times['transitions']

    # NOTE(review): this early return happens after plt.imshow above, so the
    # figure may already hold the image when we bail out — confirm intended
    if len(reading_times) == 0:
        return

    # get a sample document to find the correction document
    dir_name = sess.dir_name + os.sep + settings.highlights_dir
    sample_doc_path = dir_name + os.sep + time_to_filename(reading_times[0],
                                                           extension='hocr')
    sample_doc = Document(sample_doc_path, output_dir=None)
    correction_filename = sample_doc.correct_filepath.split(os.sep)[-1]

    # get the word labels
    word_labels = get_word_labels(correction_filename)
    # map the pixel values to words to get the labels
    # (each matrix column corresponds to one character of the article text)
    word_lengths = [len(w) for w in word_labels]
    char_index_to_word_index_mapping = {}
    word_index = 0
    # NOTE(review): the '<' comparison (vs '<=') and the repeated prefix
    # sum look like a possible off-by-one; confirm the intended word
    # boundary before changing anything here
    for char_index in range(sum(word_lengths)):
        if sum(word_lengths[:(word_index + 1)]) < char_index:
            word_index += 1
        char_index_to_word_index_mapping[char_index] = word_index

    # assumes matplotlib's tick values are integral and within the mapped
    # character range — TODO confirm, otherwise this raises KeyError
    x_tick_vals = [x for x in plt.xticks()[0] if (x >= 0) and (x <= max_width)]
    x_tick_labels = []
    for x_tick in x_tick_vals:
        word_index = char_index_to_word_index_mapping[x_tick]
        last_index = min([word_index + words_per_label, len(word_labels)])
        x_tick_labels.append(' '.join(word_labels[word_index:last_index]))
    plt.xticks(x_tick_vals, x_tick_labels, rotation=15, fontsize=5)

    # set the y ticks (time), formatted as mm:ss with zero padding
    y_tick_vals = [
        y for y in plt.yticks()[0] if (y >= 0) and (y <= max_height)
    ]
    num_to_str = lambda num: str(int(num)) if num >= 10 else '0' + str(int(num)
                                                                       )
    to_time_str = lambda t: num_to_str(int(t / 60)) + ':' + num_to_str(t - (
        int(t / 60) * 60))
    plt.yticks(y_tick_vals,
               [to_time_str(yt * settings.little_t) for yt in y_tick_vals])

    if include_labels:
        plt.xlabel('Article Text')
        plt.ylabel('Time')

    # add space for labels and tick marks
    plt.tight_layout()

    if save_and_clear:
        plt.savefig(sess.dir_name + os.sep + settings.highlighting_viz_file,
                    dpi=800)
        plt.clf()
Ejemplo n.º 12
0
def get_highlight_visualization_matrix(sess,
                                       part='digital reading',
                                       redo=False):
    """Build an RGB matrix of highlight colors: rows are time steps, columns
    are characters of the article text.

    The result is cached as an image file in the session directory and
    reloaded on later calls unless redo is True.
    """
    # load from image if not redoing and it exists
    if not redo and os.path.isfile(sess.dir_name + os.sep +
                                   settings.highlighting_image_file):
        return np.array(
            Image.open(sess.dir_name + os.sep +
                       settings.highlighting_image_file))

    # get the largest reading time
    # (in the future it might be good to make multiple visualizations)
    reading_times = max(
        [x for x in sess.metadata if x['part'] == part],
        key=lambda x: max(x['transitions']) - min(x['transitions']))
    reading_times = reading_times['transitions']

    if len(reading_times) == 0:
        return

    # get a sample document to find the correction document
    dir_name = sess.dir_name + os.sep + settings.highlights_dir
    sample_doc_path = dir_name + os.sep + time_to_filename(reading_times[0],
                                                           extension='hocr')
    sample_doc = Document(sample_doc_path, output_dir=None)
    correction_filename = sample_doc.correct_filepath.split(os.sep)[-1]

    # get the lengths of all words (one matrix column per character)
    word_lengths = get_word_lengths(correction_filename)
    total_word_length = sum(word_lengths)

    # get highlighting report
    data = get_user_data(sess)

    # calculate mapping from (time, word) -> color
    min_time = int(min(reading_times) / settings.little_t)
    max_time = int(max(reading_times) / settings.little_t)
    mapping = {}
    for time in data:
        for word_obj in data[time]:
            for w_id in word_obj['id_group']:
                # NOTE(review): the hard-coded 10 presumably equals
                # 1/settings.little_t — verify they stay in sync
                mapping[(int(float(time) * 10), int(w_id))] = word_obj['color']

    # the output image (initalize to zeros)
    # this helps catch bugs as long as black is not one of the highlight colors
    matrix = np.zeros(((max_time - min_time), total_word_length, 3), 'uint8')

    # paint each (time row, word) cell with its highlight color
    for row in range((max_time - min_time)):
        for word_id in range(len(word_lengths)):
            color_string = get_color(row + min_time, word_id, mapping,
                                     min_time)
            # get the pixel start and end of a word
            word_start = 0 if word_id == 0 else sum(word_lengths[:word_id])
            word_end = word_start + word_lengths[word_id] + 1
            for i in range(3):
                matrix[row, word_start:word_end,
                       i] = settings.highlight_viz_colors[color_string][i]

    # save for future calls
    img = Image.fromarray(matrix).convert('RGB')
    img.save(sess.dir_name + os.sep + settings.highlighting_image_file)

    # return the image
    return matrix
Ejemplo n.º 13
0
def get_num_highlights(filetime):
	"""Count words per highlight category in the document for this frame time."""
	filepath = sess.dir_name + os.sep + source_dirs[0] + os.sep + time_to_filename(filetime, extension='hocr')
	doc = Document(filepath, calc_width=False)
	# tally the highlight attribute of every non-empty word
	return Counter(word.attrs['highlight']
		for line in doc.lines
		for word in line.children
		if len(word.text) > 0)