Ejemplo n.º 1
0
def text_reducer(data_in, **kwargs):
    '''Collate a list of transcriptions into a word-by-word alignment table.

    Parameters
    ----------
    data_in : list
        A list of 3-item entries that unpack as ``(index, text, gold_standard)``.
        ``index`` is used to look up the entry's user in the `user_id` keyword.
    kwargs :
        * `user_id` : An indexable collection of user ids.  It is required
          (and removed from `kwargs`) whenever `data_in` is not empty.

    Returns
    -------
    reduction : dict
        An empty dictionary when `data_in` is empty, otherwise a dictionary
        with the following keys:

        *   `aligned_text`: A list of lists containing the aligned text.
            There is one list for each column of the alignment table, and each
            of those lists contains one item for each entry of `data_in`.  If an
            entry has no token in a column an empty string is used.
        *   `number_views`: The number of entries in `data_in`
        *   `consensus_score`: The first value returned by `consensus_score`
            for `aligned_text`
        *   `consensus_text`: The second value returned by `consensus_score`
            for `aligned_text`
        *   `gold_standard`: The `gold_standard` item of each entry, as a list
        *   `user_ids`: The user id looked up for each entry, in input order
    '''
    if len(data_in) == 0:
        return {}

    user_ids_input = kwargs.pop('user_id')
    idx, data, gold_standard = zip(*data_in)
    user_ids = [user_ids_input[i] for i in idx]

    # every transcription becomes a witness keyed by its position in the input
    collation = col.Collation()
    witness_keys = [str(position) for position in range(len(data))]
    for witness, transcription in zip(witness_keys, data):
        collation.add_plain_witness(witness, transcription)
    table = col.collate(collation, near_match=True, segmentation=False)

    aligned_text = []
    for column in table.columns:
        tokens = column.tokens_per_witness
        # a witness with no token in this column contributes an empty string
        aligned_text.append(
            [str(tokens.get(witness, [''])[0]) for witness in witness_keys])

    score, agreed_text = consensus_score(aligned_text)
    return {
        'aligned_text': aligned_text,
        'number_views': len(data),
        'consensus_score': score,
        'consensus_text': agreed_text,
        'gold_standard': list(gold_standard),
        'user_ids': user_ids
    }
Ejemplo n.º 2
0
def frd_collate(col_obj):
    """Collate every chunk of `col_obj` and store the result as a sample.

    The rows produced by ``chunks_to_df(col_obj.files())`` are grouped by
    their ``chunk_nr`` column.  For each group, every row is added to a fresh
    collation as a witness (``id`` is the witness name, ``text`` its content)
    and the group is collated.  The HTML and TEI renderings of the resulting
    table are saved on a ``FrdCollationSample`` that is fetched or created by
    its slug (``<hashes>__<3-digit running number>``) and parent collation.

    Each witness id is printed as it is added.

    Returns the `col_obj` that was passed in.
    """
    chunk_frame = chunks_to_df(col_obj.files())
    grouped = chunk_frame.groupby('chunk_nr')
    # samples are numbered from 1 in group iteration order
    for sample_nr, (_, chunk_rows) in enumerate(grouped, start=1):
        sample_slug = f"{col_obj.hashes()}__{sample_nr:03}"
        collation = collatex.Collation()
        for _, witness in chunk_rows.iterrows():
            print(witness['id'])
            collation.add_plain_witness(witness['id'], witness['text'])
        table = collatex.collate(collation)
        sample, _ = FrdCollationSample.objects.get_or_create(
            title_slug=sample_slug, parent_col=col_obj)
        html_view = visualize_table_vertically_with_colors(table, collation)
        tei_view = export_alignment_table_as_tei(table, collation)
        sample.data_html = html_view
        sample.data_tei = tei_view
        sample.save()

    return col_obj
Ejemplo n.º 3
0
def align_words(word_line, xy_line, text_line, kwargs_cluster, kwargs_dbscan):
    '''A function to take the annotations for one line of text, aligns the words,
    and finds the end-points for the line.

    Parameters
    ----------
    word_line : np.array
        An nx1 array with the x-position of each dot in the rotated coordinate frame.
    xy_line : np.array
        An nx2 array with the non-rotated (x, y) positions of each dot.
    text_line : np.array
        An nx1 array with the text for each dot.
    kwargs_cluster : dict
        A dictionary containing the `eps_*` and `dot_freq` keywords.  This
        function reads `eps_word` and `min_word_count` from it.
    kwargs_dbscan : dict
        A dictionary containing all the other DBSCAN keywords.  `min_samples` is
        withheld from the word clustering (which always uses `min_samples=1`)
        and is written back before the function exits, even when clustering
        raises.  The key is therefore always present afterwards, with a value
        of 1 if it was missing on entry.

    Returns
    -------
    clusters_x : list
        A list with the start and end x-position of the line
    clusters_y : list
        A list with the start and end y-position of the line
    clusters_text : list
        A list-of-lists with the aligned words.  There is one list per column
        of the alignment table, and each holds one item per non-blank entry of
        `text_line`.  A column with tokens from fewer than `min_word_count`
        annotations is returned as all empty strings.

    All three lists are empty when fewer than two word clusters are found.
    '''
    clusters_x = []
    clusters_y = []
    clusters_text = []
    # ignore min_samples when trying to find the end points of a line
    min_samples = kwargs_dbscan.pop('min_samples', 1)
    try:
        db_words = DBSCAN(eps=kwargs_cluster['eps_word'],
                          min_samples=1,
                          **kwargs_dbscan).fit(word_line)
    finally:
        # put min_samples back in, also when clustering fails, so the
        # caller's dictionary is never left without it
        kwargs_dbscan['min_samples'] = min_samples
    word_labels = sort_labels(db_words.labels_, word_line)
    if len(word_labels) <= 1:
        return clusters_x, clusters_y, clusters_text

    # only the first and last word clusters are needed for the end-points
    for word_label in (word_labels[0], word_labels[-1]):
        wdx = db_words.labels_ == word_label
        word_x, word_y = xy_line[wdx].mean(axis=0)
        clusters_x.append(float(word_x))
        clusters_y.append(float(word_y))

    # blank annotations are left out of the collation entirely
    collation = col.Collation()
    witness_key = []
    for tdx, t in enumerate(text_line):
        if t.strip() != '':
            key = str(tdx)
            collation.add_plain_witness(key, t)
            witness_key.append(key)
    if len(collation.witnesses) > 0:
        alignment_table = col.collate(collation,
                                      near_match=True,
                                      segmentation=False)
        for cols in alignment_table.columns:
            word_dict = cols.tokens_per_witness
            # the threshold test depends only on the column, so it is
            # evaluated once per column rather than once per witness
            if len(word_dict) >= kwargs_cluster['min_word_count']:
                word_list = [str(word_dict.get(key, [''])[0])
                             for key in witness_key]
            else:
                word_list = [''] * len(witness_key)
            clusters_text.append(word_list)
        # fix memory leak by deleting this
        del alignment_table
    return clusters_x, clusters_y, clusters_text
def optics_line_text_reducer(data_by_frame, **kwargs_optics):
    '''Reduce the line-text extracts as a list of lines of text.

    Parameters
    ----------
    data_by_frame : dict
        A dictionary returned by :meth:`process_data`.  Each value is read
        through its `X` and `data` keys.  Column 0 of `X` is used as an index
        into `data` and column 1 as an index into `user_id`.
    kwargs :
        * `See OPTICS <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html>`_
        * `user_id` : The user ids, indexed by column 1 of `X`.
        * `min_samples` : The smallest number of transcribed lines needed to form a cluster.
          `auto` will set this value based on the number of volunteers who transcribed on a page within a subject.
          Any other value is raised to at least 2.
        * `max_eps` : Forwarded to OPTICS.  Optional; a missing value or `None` means `np.inf`.
        * `xi` : Determines the minimum steepness on the reachability plot that constitutes a cluster boundary.
        * `angle_eps` : How close the angle of two lines need to be in order to be placed in the same angle cluster.
          Note: This will only change the order of the lines.
        * `gutter_eps` : How close the `x` position of the start of two lines need to be in order to be placed in the same column cluster.
          Note: This will only change the order of the lines.
        * `min_line_length` : The minimum length a transcribed line of text needs to be in order to be used in the reduction.
          NOTE(review): this function does not read it; presumably it is consumed before this call - confirm.
        * `low_consensus_threshold` : The minimum consensus score allowed to be considered "done".
        * `minimum_views` : A value that is passed along to the front-end to set when lines should turn grey (has no effect on aggregation)

        `user_id`, `low_consensus_threshold`, `minimum_views`, `min_samples`,
        `angle_eps` and `gutter_eps` are required; a `KeyError` is raised if
        one is missing.  Every keyword not listed as consumed here is passed
        to OPTICS unchanged.

    Returns
    -------
    reduction : dict
        A dictionary with one key for each `frame` of the subject that have lists as values.
        Each item of the list represents one line transcribed of text.  Lines built from an
        OPTICS cluster are dictionaries with these keys (lines that were not clustered are
        whatever `cluster_of_one` returns):

        * `clusters_x` : the `x` position of each identified word (median over the clustered lines)
        * `clusters_y` : the `y` position of each identified word (median over the clustered lines)
        * `clusters_text` : A list of lists containing the text at each cluster position
          There is one list for each identified word, and each of those lists contains
          one item for each user that entered non-blank text. If the user did not transcribe
          the word an empty string is used.
        * `line_slope`: The slope of the line of text in degrees
        * `number_views` : The number of extracts in the cluster
        * `consensus_score` : The average number of users whose text agreed for the line
          Note, if `consensus_score` is the same as `number_views` every user agreed with each other
        * `consensus_text` : The second value returned by `consensus_score` for `clusters_text`
        * `user_ids`: List of panoptes user ids in the same order as `clusters_text`
        * `extract_index`: The extract index of every extract in the cluster
        * `gold_standard`: List of bools indicating if a transcription was made in the
          front-end's gold standard mode
        * `slope_label`: integer indicating what slope cluster the line belongs to
        * `gutter_label`: integer indicating what gutter cluster (i.e. column) the line belongs to
        * `low_consensus` : True if the `consensus_score` is less than the threshold set by the
          `low_consensus_threshold` keyword
        * `flagged` : Same value as `low_consensus`

        For the entire subject the following is also returned (only when
        `data_by_frame` holds at least one frame):

        * `low_consensus_lines` : The number of lines with low consensus
        * `transcribed_lines` : The total number of lines transcribed on the subject
        * `reducer` : The string `optics_line_text_reducer`

        Note: the image coordinate system has y increasing downward.
    '''
    user_ids_input = np.array(kwargs_optics.pop('user_id'))
    low_consensus_threshold = kwargs_optics.pop('low_consensus_threshold')
    # only removed so that it is not forwarded to OPTICS
    kwargs_optics.pop('minimum_views')
    output = defaultdict(list)
    min_samples_orig = kwargs_optics.pop('min_samples')
    angle_eps = kwargs_optics.pop('angle_eps')
    gutter_eps = kwargs_optics.pop('gutter_eps')
    max_eps = kwargs_optics.pop('max_eps', np.inf)
    if max_eps is None:
        max_eps = np.inf
    low_consensus_lines = 0
    number_of_lines = 0
    for frame, value in data_by_frame.items():
        frame_unordered = []
        X = np.array(value['X'])
        data = np.array(value['data'])
        if X.size > 0:
            num_users = len(np.unique(X[:, 1]))
            ext_index = np.array(extractor_index(X[:, 1]))
        else:
            num_users = 0
            ext_index = np.array([])
        if min_samples_orig == 'auto':
            min_samples = get_min_samples(num_users)
        else:
            min_samples = max(2, min_samples_orig)
        if num_users >= min_samples:
            # max_eps must be passed explicitly: it was popped from the
            # keywords above so that `None` could be mapped to `np.inf`
            db = OPTICS(metric=metric,
                        metric_params={'data_in': data},
                        min_samples=min_samples,
                        max_eps=max_eps,
                        **kwargs_optics)
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', category=RuntimeWarning)
                db.fit(X)
            clean_labels = remove_user_duplication(db.labels_,
                                                   db.core_distances_, X[:, 1])
            for label in np.unique(clean_labels):
                cdx = clean_labels == label
                if label == -1:
                    # noise values are assigned to clusters of one
                    frame_unordered += cluster_of_one(X[cdx], data,
                                                      user_ids_input,
                                                      ext_index[cdx].tolist())
                else:
                    xs = [data[int(i)]['x'] for i in X[cdx, 0]]
                    ys = [data[int(i)]['y'] for i in X[cdx, 0]]
                    xm = np.median(xs, axis=0)
                    ym = np.median(ys, axis=0)
                    # slope of the segment joining the first and last
                    # median points of the line
                    slope = np.rad2deg(
                        np.arctan2(ym[-1] - ym[0], xm[-1] - xm[0]))
                    collation = col.Collation()
                    witness_keys = []
                    clusters_text = []
                    user_ids = []
                    gold_standard = []
                    for row in X[cdx]:
                        index = int(row[0])
                        user_index = int(row[1])
                        text = data[index]['text'][0]
                        gs = data[index]['gold_standard']
                        # blank transcriptions are left out of the collation
                        if text.strip() != '':
                            key = str(index)
                            witness_keys.append(key)
                            user_ids.append(user_ids_input[user_index])
                            gold_standard.append(gs)
                            collation.add_plain_witness(key, text)
                    if len(collation.witnesses) > 0:
                        alignment_table = col.collate(collation,
                                                      near_match=True,
                                                      segmentation=False)
                        for cols in alignment_table.columns:
                            word_dict = cols.tokens_per_witness
                            # a witness with no token in this column
                            # contributes an empty string
                            clusters_text.append(
                                [str(word_dict.get(key, [''])[0])
                                 for key in witness_keys])
                    consensus_score_value, consensus_text = consensus_score(
                        clusters_text)
                    low_consensus = consensus_score_value < low_consensus_threshold
                    if low_consensus:
                        low_consensus_lines += 1
                    # named `line` so the frame's `value` is not shadowed
                    line = {
                        'clusters_x': xm.tolist(),
                        'clusters_y': ym.tolist(),
                        'clusters_text': clusters_text,
                        'number_views': cdx.sum(),
                        'line_slope': slope,
                        'consensus_score': consensus_score_value,
                        'consensus_text': consensus_text,
                        'user_ids': user_ids,
                        'extract_index': ext_index[cdx].tolist(),
                        'gold_standard': gold_standard,
                        'low_consensus': low_consensus,
                        'flagged': low_consensus
                    }
                    number_of_lines += 1
                    frame_unordered.append(line)
        else:
            # not enough data to cluster so assign each extract
            # to its own cluster
            frame_unordered += cluster_of_one(X, data, user_ids_input,
                                              ext_index.tolist())
            # NOTE(review): both counters grow by one per frame here, however
            # many single-line clusters were added - confirm this is intended
            if len(frame_unordered) > 0:
                low_consensus_lines += 1
                number_of_lines += 1
        output[frame] = order_lines(frame_unordered,
                                    angle_eps=angle_eps,
                                    gutter_eps=gutter_eps)
        # the subject-level keys are refreshed after every frame, so they are
        # absent when there are no frames at all
        output['low_consensus_lines'] = low_consensus_lines
        output['transcribed_lines'] = number_of_lines
        output['reducer'] = 'optics_line_text_reducer'
    return dict(output)