def _score_by_text_frequencies(search, connection, feature, texts,
                               target_units, source_units, features, stoplist,
                               distance_basis, max_distance, tag_helper):
    """Score units using per-text inverse frequencies

    Inverse frequencies of ``feature`` are looked up for ``texts[0]`` (used
    as the source text) and ``texts[1]`` (used as the target text), wrapped
    with ``_lookup_wrapper``, and handed to ``_score`` together with the
    remaining arguments.

    Parameters
    ----------
    search
        Passed through to ``_score`` unchanged
    connection
        Database connection used to fetch the inverse text frequencies
    feature
        Feature type whose frequencies are fetched
    texts
        Sequence whose first two items expose an ``id`` attribute; item 0
        is treated as the source text and item 1 as the target text
    target_units, source_units, features, stoplist, distance_basis,
    max_distance, tag_helper
        Passed through to ``_score`` unchanged

    Returns
    -------
    Whatever ``_score`` returns
    """

    def _getter_for(position):
        # One wrapped lookup per text, keyed by its position in ``texts``.
        inverse_freqs = get_inverse_text_frequencies(
            connection, feature, texts[position].id)
        return _lookup_wrapper(inverse_freqs)

    source_getter = _getter_for(0)
    target_getter = _getter_for(1)
    return _score(search, connection, target_units, source_units, features,
                  stoplist, distance_basis, max_distance, source_getter,
                  target_getter, tag_helper)
def _get_inv_lemmata_freq_getter(conn, freq_basis, text_options,
                                 latin_units):
    """Build the inverse lemmata frequency getter for the chosen basis

    Parameters
    ----------
    conn
        Database connection used to fetch frequencies
    freq_basis : str
        ``'texts'`` selects frequencies of the single text referenced by
        ``text_options``; any other value selects corpus frequencies for
        that text's language
    text_options
        Object exposing ``text.id`` and ``text.language``
    latin_units
        Forwarded to ``_inverse_averaged_freq_getter`` when corpus
        frequencies are used

    Returns
    -------
    The getter produced by ``_lookup_wrapper`` (text basis) or by
    ``_inverse_averaged_freq_getter`` (any other basis)
    """
    if freq_basis == 'texts':
        text_inverse_freqs = get_inverse_text_frequencies(
            conn, 'lemmata', text_options.text.id)
        return _lookup_wrapper(text_inverse_freqs)

    corpus_freqs = get_corpus_frequencies(
        conn, 'lemmata', text_options.text.language)
    return _inverse_averaged_freq_getter(corpus_freqs, latin_units)
def test_mini_text_frequencies(minipop, mini_latin_metadata,
                               mini_greek_metadata, v3checker):
    """Check per-text inverse lemmata frequencies against the v3 values

    For every Latin and Greek mini text, each form index reported by
    ``get_inverse_text_frequencies`` must be present in the v3 stem
    frequencies, and the v3 value must be close to the reciprocal of the
    v5 value.
    """
    combined_metadata = list(
        itertools.chain(mini_latin_metadata, mini_greek_metadata))

    # Resolve database ids for all texts with a single query by title.
    wanted_titles = [entry['title'] for entry in combined_metadata]
    found_texts = minipop.find(Text.collection, title=wanted_titles)
    title2id = {text.title: text.id for text in found_texts}

    for entry in combined_metadata:
        decoded_text = Text.json_decode(entry)
        expected = _load_v3_mini_text_stem_freqs(minipop, decoded_text,
                                                 v3checker)
        observed = get_inverse_text_frequencies(minipop, 'lemmata',
                                                title2id[entry['title']])
        for form_index, inverse_freq in observed.items():
            assert form_index in expected
            # v5 stores the inverse, so invert it before comparing with v3.
            assert math.isclose(expected[form_index], 1.0 / inverse_freq)
def compute_inverse_frequencies(connection, feature_type, text_id):
    """Compute inverse text frequencies by specified feature type

    Parameters
    ----------
    connection : tesserae.db.mongodb.TessMongoConnection
    feature_type : str
        Feature category to be used in calculating frequencies
    text_id : bson.objectid.ObjectId
        ObjectId of the text whose feature frequencies are to be computed

    Returns
    -------
    1d np.array
        index by form index to obtain corresponding inverse text frequency;
        the array has length ``max form index + 1`` and holds 0.0 at every
        index for which no inverse frequency was found. An empty array is
        returned when no inverse frequencies were found at all.
    """
    inv_freqs_dict = get_inverse_text_frequencies(connection, feature_type,
                                                  text_id)
    if not inv_freqs_dict:
        # max() raises ValueError on an empty mapping; an empty array is the
        # natural "no frequencies" result.
        return np.zeros(0)

    inverse_frequencies = np.zeros(max(inv_freqs_dict) + 1)
    # Keys and values of a dict iterate in matching order, so the two lists
    # line up element by element.
    inverse_frequencies[list(inv_freqs_dict)] = list(inv_freqs_dict.values())
    return inverse_frequencies