def text_reducer(data_in, **kwargs):
    '''Reduce a list of text into an alignment table

    Parameters
    ----------
    data_in : list
        A list of tuples of the form `(index, text, gold_standard)` to be aligned

    Returns
    -------
    reduction : dict
        A dictionary with the following keys:

        * `aligned_text`: A list of lists containing the aligned text.
          There is one list for each identified word, and each of those lists
          contains one item for each user that entered text. If the user did
          not transcribe a word an empty string is used.
        * `number_views`: Number of volunteers who entered non-blank text
        * `consensus_score`: The average number of users whose text agreed.
          Note: if `consensus_score` is the same as `number_views` every user
          agreed with each other
        * `consensus_text`: The consensus text for the aligned transcriptions
        * `gold_standard`: A list of bools indicating if each transcription was
          made in gold standard mode
        * `user_ids`: List of user ids in the same order as the transcriptions
    '''
    reduction = {}
    if len(data_in) > 0:
        user_ids_input = kwargs.pop('user_id')
        idx, data, gold_standard = zip(*data_in)
        user_ids = [user_ids_input[i] for i in idx]
        witness_keys = []
        aligned_text = []
        # collate all transcriptions into a single alignment table
        collation = col.Collation()
        for index, text in enumerate(data):
            key = str(index)
            witness_keys.append(key)
            collation.add_plain_witness(key, text)
        alignment_table = col.collate(collation, near_match=True, segmentation=False)
        # each column of the table holds one aligned word per witness
        for cols in alignment_table.columns:
            word_dict = cols.tokens_per_witness
            word_list = []
            for key in witness_keys:
                word_list.append(str(word_dict.get(key, [''])[0]))
            aligned_text.append(word_list)
        consensus_score_value, consensus_text = consensus_score(aligned_text)
        reduction = {
            'aligned_text': aligned_text,
            'number_views': len(data),
            'consensus_score': consensus_score_value,
            'consensus_text': consensus_text,
            'gold_standard': list(gold_standard),
            'user_ids': user_ids
        }
    return reduction
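
# Hypothetical usage sketch for text_reducer (not part of the original module):
# each data_in entry is an (extract index, transcribed text, gold-standard flag)
# tuple and `user_id` lists one volunteer id per extract index. Assumes
# collatex (`col`) and consensus_score are importable as above.
def _text_reducer_example():
    data_in = [
        (0, 'the quick brown fox', False),
        (1, 'the quick brown fox', False),
        (2, 'the quik brown fox', False),
    ]
    reduction = text_reducer(data_in, user_id=[101, 102, 103])
    # reduction['aligned_text'] holds one list per aligned word, e.g.
    # [['the', 'the', 'the'], ['quick', 'quick', 'quik'], ['brown', ...], ...]
    # reduction['number_views'] is 3 and reduction['user_ids'] is [101, 102, 103]
    return reduction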
def frd_collate(col_obj):
    df = chunks_to_df(col_obj.files())
    counter = 0
    # collate each chunk of the collation object separately
    for gr in df.groupby('chunk_nr'):
        counter += 1
        col_sample_id = f"{col_obj.hashes()}__{counter:03}"
        collation = collatex.Collation()
        cur_df = gr[1]
        for i, row in cur_df.iterrows():
            print(row['id'])
            collation.add_plain_witness(row['id'], row['text'])
        table = collatex.collate(collation)
        # store the collated chunk as an FrdCollationSample
        col_sample, _ = FrdCollationSample.objects.get_or_create(
            title_slug=col_sample_id, parent_col=col_obj)
        data = visualize_table_vertically_with_colors(table, collation)
        data_tei = export_alignment_table_as_tei(table, collation)
        col_sample.data_html = data
        col_sample.data_tei = data_tei
        col_sample.save()
    return col_obj
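
# Illustrative sketch (hypothetical values) of the per-chunk sample id scheme
# used by frd_collate above: the parent object's hash joined with a zero-padded
# chunk counter.
def _col_sample_id_example():
    parent_hash = 'abc123'   # stand-in for col_obj.hashes()
    counter = 7              # stand-in for the chunk counter
    return f"{parent_hash}__{counter:03}"   # -> 'abc123__007'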
def align_words(word_line, xy_line, text_line, kwargs_cluster, kwargs_dbscan):
    '''A function to take the annotations for one line of text, align the
    words, and find the end-points for the line.

    Parameters
    ----------
    word_line : np.array
        An nx1 array with the x-position of each dot in the rotated
        coordinate frame.
    xy_line : np.array
        An nx2 array with the non-rotated (x, y) positions of each dot.
    text_line : np.array
        An nx1 array with the text for each dot.
    kwargs_cluster : dict
        A dictionary containing the `eps_*` and `dot_freq` keywords
    kwargs_dbscan : dict
        A dictionary containing all the other DBSCAN keywords

    Returns
    -------
    clusters_x : list
        A list with the start and end x-position of the line
    clusters_y : list
        A list with the start and end y-position of the line
    clusters_text : list
        A list-of-lists with the words transcribed at each dot cluster found.
        One list per cluster. Note: the empty strings that were added to each
        annotation are stripped before returning the words.
    '''
    clusters_x = []
    clusters_y = []
    clusters_text = []
    # ignore min_samples when trying to find the end points of a line
    min_samples = kwargs_dbscan.pop('min_samples', 1)
    db_words = DBSCAN(eps=kwargs_cluster['eps_word'], min_samples=1, **kwargs_dbscan).fit(word_line)
    # put min_samples back in
    kwargs_dbscan['min_samples'] = min_samples
    word_labels = sort_labels(db_words.labels_, word_line)
    if len(word_labels) > 1:
        # keep only the first and last word clusters (the line end points)
        word_labels = [word_labels[0], word_labels[-1]]
    for word_label in word_labels:
        wdx = db_words.labels_ == word_label
        word_x, word_y = xy_line[wdx].mean(axis=0)
        clusters_x.append(float(word_x))
        clusters_y.append(float(word_y))
    # collate the text of each annotation so words line up across users
    collation = col.Collation()
    witness_key = []
    for tdx, t in enumerate(text_line):
        if t.strip() != '':
            key = str(tdx)
            collation.add_plain_witness(key, t)
            witness_key.append(key)
    if len(collation.witnesses) > 0:
        alignment_table = col.collate(collation, near_match=True, segmentation=False)
        for cols in alignment_table.columns:
            word_dict = cols.tokens_per_witness
            word_list = []
            for key in witness_key:
                if len(word_dict) >= kwargs_cluster['min_word_count']:
                    word_list.append(str(word_dict.get(key, [''])[0]))
                else:
                    word_list.append('')
            clusters_text.append(word_list)
        # fix memory leak by deleting this
        del alignment_table
    return clusters_x, clusters_y, clusters_text
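
# Hypothetical usage sketch for align_words (not part of the original module):
# four dots on one line, two transcribers, each dot carrying that user's text.
# Assumes numpy, scikit-learn's DBSCAN, and collatex (`col`) are imported as above.
def _align_words_example():
    import numpy as np
    word_line = np.array([[0.0], [0.2], [40.0], [40.3]])        # rotated x of each dot
    xy_line = np.array([[5, 10], [5, 11], [45, 10], [45, 11]])  # original (x, y) of each dot
    text_line = np.array(['John Smith', 'John Smith', 'Jon Smith', 'Jon Smith'])
    kwargs_cluster = {'eps_word': 5.0, 'min_word_count': 1}
    kwargs_dbscan = {'min_samples': 1}
    xs, ys, words = align_words(word_line, xy_line, text_line, kwargs_cluster, kwargs_dbscan)
    # xs and ys hold the mean start and end positions of the line; words holds
    # one list of per-user transcriptions for each aligned word column.
    return xs, ys, words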
def optics_line_text_reducer(data_by_frame, **kwargs_optics):
    '''Reduce the line-text extracts into a list of lines of text.

    Parameters
    ----------
    data_by_frame : dict
        A dictionary returned by :meth:`process_data`
    kwargs :
        * `See OPTICS <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html>`_
        * `min_samples` : The smallest number of transcribed lines needed to form a cluster.
          `auto` will set this value based on the number of volunteers who transcribed on a
          page within a subject.
        * `xi` : Determines the minimum steepness on the reachability plot that constitutes
          a cluster boundary.
        * `angle_eps` : How close the angle of two lines needs to be in order to be placed
          in the same angle cluster. Note: this will only change the order of the lines.
        * `gutter_eps` : How close the `x` position of the start of two lines needs to be
          in order to be placed in the same column cluster. Note: this will only change the
          order of the lines.
        * `min_line_length` : The minimum length a transcribed line of text needs to be in
          order to be used in the reduction.
        * `low_consensus_threshold` : The minimum consensus score allowed to be considered
          "done".
        * `minimum_views` : A value that is passed along to the front-end to set when lines
          should turn grey (has no effect on aggregation).

    Returns
    -------
    reduction : dict
        A dictionary with one key for each `frame` of the subject. Each value is a list
        where each item represents one transcribed line of text and is a dictionary with
        these keys:

        * `clusters_x` : The `x` position of each identified word
        * `clusters_y` : The `y` position of each identified word
        * `clusters_text` : A list of lists containing the text at each cluster position.
          There is one list for each identified word, and each of those lists contains one
          item for each user that identified the cluster. If the user did not transcribe
          the word an empty string is used.
        * `line_slope` : The slope of the line of text in degrees
        * `number_views` : The number of users that transcribed the line of text
        * `consensus_score` : The average number of users whose text agreed for the line.
          Note: if `consensus_score` is the same as `number_views` every user agreed with
          each other
        * `user_ids` : List of panoptes user ids in the same order as `clusters_text`
        * `gold_standard` : List of bools indicating if a transcription was made in the
          front-end's gold standard mode
        * `slope_label` : Integer indicating what slope cluster the line belongs to
        * `gutter_label` : Integer indicating what gutter cluster (i.e. column) the line
          belongs to
        * `low_consensus` : True if the `consensus_score` is less than the threshold set by
          the `low_consensus_threshold` keyword

        For the entire subject the following is also returned:

        * `low_consensus_lines` : The number of lines with low consensus
        * `transcribed_lines` : The total number of lines transcribed on the subject

        Note: the image coordinate system has y increasing downward.
    '''
    user_ids_input = np.array(kwargs_optics.pop('user_id'))
    low_consensus_threshold = kwargs_optics.pop('low_consensus_threshold')
    _ = kwargs_optics.pop('minimum_views')
    output = defaultdict(list)
    min_samples_orig = kwargs_optics.pop('min_samples')
    angle_eps = kwargs_optics.pop('angle_eps')
    gutter_eps = kwargs_optics.pop('gutter_eps')
    max_eps = kwargs_optics.pop('max_eps', np.inf)
    if max_eps is None:
        max_eps = np.inf
    low_consensus_lines = 0
    number_of_lines = 0
    for frame, value in data_by_frame.items():
        frame_unordered = []
        X = np.array(value['X'])
        data = np.array(value['data'])
        if X.size > 0:
            num_users = len(np.unique(X[:, 1]))
            ext_index = np.array(extractor_index(X[:, 1]))
        else:
            num_users = 0
            ext_index = np.array([])
        if min_samples_orig == 'auto':
            min_samples = get_min_samples(num_users)
        else:
            min_samples = max(2, min_samples_orig)
        if num_users >= min_samples:
            db = OPTICS(
                metric=metric,
                metric_params={'data_in': data},
                min_samples=min_samples,
                max_eps=max_eps,
                **kwargs_optics
            )
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', category=RuntimeWarning)
                db.fit(X)
            # drop duplicate annotations made by the same user within a cluster
            clean_labels = remove_user_duplication(
                db.labels_,
                db.core_distances_,
                X[:, 1]
            )
            for label in np.unique(clean_labels):
                cdx = clean_labels == label
                if label == -1:
                    # noise values are assigned to clusters of one
                    frame_unordered += cluster_of_one(
                        X[cdx],
                        data,
                        user_ids_input,
                        ext_index[cdx].tolist()
                    )
                else:
                    # take the median start and end points of the line
                    xs = [data[int(i)]['x'] for i in X[cdx, 0]]
                    ys = [data[int(i)]['y'] for i in X[cdx, 0]]
                    xm = np.median(xs, axis=0)
                    ym = np.median(ys, axis=0)
                    slope = np.rad2deg(np.arctan2(ym[-1] - ym[0], xm[-1] - xm[0]))
                    collation = col.Collation()
                    witness_keys = []
                    clusters_text = []
                    user_ids = []
                    gold_standard = []
                    for row in X[cdx]:
                        index = int(row[0])
                        user_index = int(row[1])
                        text = data[index]['text'][0]
                        gs = data[index]['gold_standard']
                        if text.strip() != '':
                            key = str(index)
                            witness_keys.append(key)
                            user_ids.append(user_ids_input[user_index])
                            gold_standard.append(gs)
                            collation.add_plain_witness(key, text)
                    if len(collation.witnesses) > 0:
                        # align the transcribed words across users
                        alignment_table = col.collate(collation, near_match=True, segmentation=False)
                        for cols in alignment_table.columns:
                            word_dict = cols.tokens_per_witness
                            word_list = []
                            for key in witness_keys:
                                word_list.append(str(word_dict.get(key, [''])[0]))
                            clusters_text.append(word_list)
                    consensus_score_value, consensus_text = consensus_score(clusters_text)
                    low_consensus = consensus_score_value < low_consensus_threshold
                    if low_consensus:
                        low_consensus_lines += 1
                    value = {
                        'clusters_x': xm.tolist(),
                        'clusters_y': ym.tolist(),
                        'clusters_text': clusters_text,
                        'number_views': cdx.sum(),
                        'line_slope': slope,
                        'consensus_score': consensus_score_value,
                        'consensus_text': consensus_text,
                        'user_ids': user_ids,
                        'extract_index': ext_index[cdx].tolist(),
                        'gold_standard': gold_standard,
                        'low_consensus': low_consensus,
                        'flagged': low_consensus
                    }
                    number_of_lines += 1
                    frame_unordered.append(value)
        else:
            # not enough data to cluster so assign each extract
            # to its own cluster
            frame_unordered += cluster_of_one(
                X,
                data,
                user_ids_input,
                ext_index.tolist()
            )
            if len(frame_unordered) > 0:
                low_consensus_lines += 1
                number_of_lines += 1
        output[frame] = order_lines(
            frame_unordered,
            angle_eps=angle_eps,
            gutter_eps=gutter_eps
        )
    output['low_consensus_lines'] = low_consensus_lines
    output['transcribed_lines'] = number_of_lines
    output['reducer'] = 'optics_line_text_reducer'
    return dict(output)
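
# Hypothetical sketch of calling the reducer and reading its output (keys follow
# the docstring above); the 'frame' key prefix and keyword values are assumptions.
def _optics_line_text_reducer_example(data_by_frame, user_ids):
    reduction = optics_line_text_reducer(
        data_by_frame,
        user_id=user_ids,
        min_samples='auto',
        xi=0.05,
        angle_eps=30,
        gutter_eps=150,
        low_consensus_threshold=3,
        minimum_views=5
    )
    for key, lines in reduction.items():
        if not key.startswith('frame'):
            continue  # skip the subject-level summary keys
        for line in lines:
            print(line['consensus_text'], line['consensus_score'], line['low_consensus'])
    return reduction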