Example #1
def assign_global_ids_from_correct_file(sess,
                                        part='digital reading',
                                        redo=False,
                                        error_dir=settings.error_dir,
                                        source_dir_name=settings.xml_dir,
                                        alt_dir_name=settings.global_id_dir,
                                        edit_dir=None,
                                        start_from_scratch=False):
    # get the documents to assign global ids to
    documents = get_documents(sess,
                              redo=True,
                              source_dir_name=source_dir_name,
                              alt_dir_name=alt_dir_name,
                              edit_dir=edit_dir)
    if len(documents) == 0:
        return
    # map each word in the corrected text to its list of global ids
    mapping = get_word_to_id_assignment(settings.correct_text_dir + os.sep +
                                        documents[0].attrs['filename'])
    # the largest global id that can be assigned
    max_possible_value = max(
        [g_id for id_list in mapping.values() for g_id in id_list])
    for doc in documents:
        # unless redoing, skip documents whose output already exists
        if not redo:
            if os.path.isfile(doc.output_file):
                continue
        assign_global_ids_to_doc(doc,
                                 mapping,
                                 max_possible_value,
                                 error_dir,
                                 start_from_scratch=start_from_scratch)
Example #2
def find_all_highlights(sess,
                        part='digital reading',
                        img_dir_path=settings.frame_images_dir,
                        no_bolding=False,
                        dir_to_read_from=settings.global_id_dir,
                        dir_to_save_to=settings.highlights_dir,
                        recalculate=False):
    # get documents
    documents = get_documents(sess,
                              redo=recalculate,
                              alt_dir_name=dir_to_save_to,
                              source_dir_name=dir_to_read_from,
                              part=part)
    # get the right path to images
    img_dir_path = sess.dir_name + os.sep + img_dir_path
    # find highlights for each document
    for doc in documents:
        find_highlights(doc, img_dir_path, no_bolding=no_bolding)
Example #3
def stitch_lines(sess, part='digital reading', save=True, redo=False):
	# if already calculated, load (as long as not redoing)
	if not redo and os.path.isfile(sess.dir_name + os.sep + settings.stitched_together_json_file):
		with open(sess.dir_name + os.sep + settings.stitched_together_json_file, 'r') as fp:
			return json.load(fp)
	lines_dict = defaultdict(list)
	# for each line put the pair of the line and the previous line in the lines_dict
	documents = get_documents(sess, redo=True, alt_dir_name=None, part=part, source_dir_name=settings.xml_dir)
	if len(documents) == 0:
		return []
	lines_in_order = get_lines(documents[0])
	for doc in documents:
		prev_line = None
		prev_key = None
		for line in doc.lines:
			line_key = line.attrs['updated_line']
			if line_key == '':
				continue
			if keys_math(line_key, prev_key, lines_in_order):
				lines_dict[line_key].append((line, prev_line))
			prev_line = line
			prev_key = line_key
	# go through the lines and build a list of line positions as output
	output = []
	prev_line_key = None
	prev_height = None
	prev_top = 0
	for next_line_key in lines_in_order:
		next_line = build_line_info(lines_dict[next_line_key], next_line_key, prev_height, prev_top)
		output.append(next_line)
		prev_line_key = next_line_key
		prev_height = next_line[2] - next_line[1]
		prev_top = next_line[1]
	# save so we don't need to calculate this again
	if save:
		with open(sess.dir_name + os.sep + settings.stitched_together_json_file, 'w') as fp:
			json.dump(output, fp)
	return output
Example #4
def calculate_scrolling(sess, part='digital reading', redo=False, save=True, include_low_confidence=False):
	# load if already calculated
	if redo is False:
		filepath = sess.dir_name + os.sep + settings.scrolling_csv_file
		if os.path.isfile(filepath):
			return pd.read_csv(filepath)
	# get the lines as a single list
	static_line_list = stitch_lines(sess, part=part)
	# get the list of documents
	documents = get_documents(sess, redo=True, alt_dir_name=None, part=part, source_dir_name=settings.xml_dir)
	# store all the output as rows of time, top_of_frame, bottom_of_frame
	top_value = None
	bottom_value = None
	rows = []
	# retrieve the time and top of page values for each document
	for doc in documents:
		t = doc.time
		# get dimensions of an example frame
		if top_value is None:
			frame_filepath = sess.dir_name + os.sep + settings.frame_images_dir + os.sep + time_to_filename(t)
			img = np.array(misc.imread(frame_filepath))
			# not unscaling properly right now
			bottom_value = img.shape[0]
			top_value = 0
		mapping_function = get_mapping_function(sess, doc, static_line_list, include_low_confidence=include_low_confidence)
		y_top = mapping_function(top_value)
		y_bottom = mapping_function(bottom_value)
		# filter out unlikely scrolls (those with fewer than 4 lines recognized)
		# these times will be replaced with the times before them
		# (they are most likely to be times when the document is outside the corpus or the screen is very zoomed)
		if y_top is None:
			continue
		rows.append((t, y_top, y_bottom))
	df = pd.DataFrame(rows, columns=['Time', 'Top', 'Bottom'])
	if save:
		df.to_csv(sess.dir_name + os.sep + settings.scrolling_csv_file, index=False)
	return df
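The resulting DataFrame has one row per frame with Time, Top and Bottom columns, as built above. A minimal sketch of deriving per-frame scroll movement from it; the rows are made up and pandas is the only dependency:

import pandas as pd

# rows in the shape calculate_scrolling produces: one per frame
scrolling = pd.DataFrame(
    [(0.0, 0.0, 900.0), (0.5, 0.0, 900.0), (1.0, 120.0, 1020.0)],
    columns=['Time', 'Top', 'Bottom'])
# how far the top of the visible page moved between consecutive frames
scrolling['Scroll'] = scrolling['Top'].diff().fillna(0)
print(scrolling)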
Example #5
def make_report(sess,
                part='digital reading',
                use_edit_dir=True,
                default_color='white'):
    # get the documents for this session
    if use_edit_dir:
        documents = get_documents(sess,
                                  redo=True,
                                  alt_dir_name=settings.highlights_dir,
                                  source_dir_name=settings.highlights_dir,
                                  part=part,
                                  edit_dir=settings.editor_dir)
    else:
        documents = get_documents(sess,
                                  redo=True,
                                  alt_dir_name=settings.highlights_dir,
                                  source_dir_name=settings.highlights_dir,
                                  part=part)
    times_and_documents = [(filename_to_time(doc.input_file), doc)
                           for doc in documents]
    times_and_documents.sort()
    report = defaultdict(list)
    # keep track of the current state of each word
    # assume the starting color is white
    current_state = defaultdict(lambda: default_color)
    for doc_time, doc in times_and_documents:
        new_highlight_ids = {}
        words = [
            w for l in doc.lines for w in l.children if 'highlight' in w.attrs
        ]
        # skip frames with no recognized words (avoids dividing by zero below)
        if len(words) == 0:
            continue
        # if most of the words are not the default color, throw the frame out
        frac_non_default = len([
            w for w in words if w.attrs['highlight'] != default_color
        ]) / len(words)
        if frac_non_default > .6:
            continue
        for w in words:
            # if there is no global id, skip
            if 'global_ids' not in w.attrs:
                continue
            # treat each global id as different
            id_group = [int(x) for x in w.attrs['global_ids'].split(' ')]
            # highlights only count if the color changed for all words
            changed = True
            for global_id in id_group:
                # if any word in the id group was already this color, no highlight is measured
                if w.attrs['highlight'] == current_state[global_id]:
                    changed = False
            # this is the case where we record stuff
            if changed:
                # record for each id that is changing its value
                for global_id in id_group:
                    if w.attrs['highlight'] != current_state[global_id]:
                        report[doc_time].append({
                            'id': global_id,
                            'text': str(w),
                            'id_group': id_group,
                            'color': w.attrs['highlight'],
                            'former colors': [
                                current_state[global_id]
                                for global_id in id_group
                            ]
                        })
                for global_id in id_group:
                    current_state[global_id] = w.attrs['highlight']
    return report
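make_report returns a defaultdict keyed by frame time; each value is a list of change records with 'id', 'text', 'id_group', 'color' and 'former colors' keys, as built above. A minimal sketch of walking such a report in time order; the report below is hand-built for illustration only:

report = {
    12.5: [{'id': 42, 'text': 'word', 'id_group': [42, 43],
            'color': 'yellow', 'former colors': ['white', 'white']}],
    30.0: [{'id': 42, 'text': 'word', 'id_group': [42, 43],
            'color': 'white', 'former colors': ['yellow', 'yellow']}],
}
for doc_time in sorted(report):
    for change in report[doc_time]:
        print('t=%.1fs: id %d (%r) -> %s (was %s)' %
              (doc_time, change['id'], change['text'], change['color'],
               ', '.join(change['former colors'])))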