Example #1
def lexical_diversity_mtld(
    doc: Doc, model_name: str = "spacy", ttr_segment: float = 0.72
) -> float:
    """Compute MTLD lexical diversity in a bi-directional fashion.

    :param doc: Processed text
    :type doc: NLP Doc
    :param model_name: Determines which model is used (spacy or stanza)
    :type model_name: str
    :param ttr_segment: Threshold for TTR mean computation
    :type ttr_segment: float
    :return: Bi-directional lexical diversity MTLD
    :rtype: float
    """
    # check model
    model = SupportedModels(model_name)

    word_list = []
    if model == SupportedModels.SPACY:
        for token in doc:
            if is_word(token.pos_):
                word_list.append(token.lemma_)
    elif model == SupportedModels.STANZA:
        for sent in doc.sentences:
            for word in sent.words:
                if is_word(word.upos):
                    word_list.append(word.lemma)
    return (
        one_side_lexical_diversity_mtld(word_list, model, ttr_segment)
        + one_side_lexical_diversity_mtld(word_list[::-1], model, ttr_segment)
    ) / 2
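A minimal usage sketch, assuming the ``es_core_news_sm`` spaCy model is installed (the model name and sentence are illustrative assumptions):

import spacy

nlp = spacy.load("es_core_news_sm")  # assumed Spanish pipeline
doc = nlp("El perro corre por el parque y el gato duerme en la casa.")
print(lexical_diversity_mtld(doc, model_name="spacy"))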
Example #2
def frequency_index(doc, frequency_dict):
    """Return frequency index.

    The frequency index is the base-10 logarithm of the frequency of the
    rarest word in each sentence, averaged over all sentences. Frequencies
    are looked up in ``frequency_dict``; for this Spanish implementation,
    the RAE CREA frequency dictionary is a suitable source.

    :param doc: Tokenized text.
    :type doc: Spacy Doc
    :param frequency_dict: Mapping from lowercased word to its frequency.
    :type frequency_dict: dict
    :return: Frequency index
    :rtype: float
    """
    n_sents = 0
    aggregate_frec = 0
    for sent in doc.sents:
        minimum = float("inf")
        for token in sent:
            if is_word(token.pos_):
                frec = frequency_dict.get(token.lower_, 0)
                if 0 < frec < minimum:
                    minimum = frec
        # Skip sentences with no dictionary hits; otherwise the sentinel
        # value would be folded into the average.
        if minimum != float("inf"):
            aggregate_frec += log(minimum, 10)
            n_sents += 1
    return aggregate_frec / n_sents
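A quick sketch with a made-up frequency dictionary (the counts and the model name are purely illustrative):

import spacy

nlp = spacy.load("es_core_news_sm")  # assumed Spanish pipeline
doc = nlp("El gato duerme. El ornitorrinco nada.")
freqs = {"el": 100000, "gato": 1200, "duerme": 800,
         "ornitorrinco": 3, "nada": 950}
print(frequency_index(doc, freqs))  # driven by the rarest word per sentence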
Example #3
def word_variation_index(doc: Doc) -> float:
    r"""Compute Word Variation Index.

    The word variation index can be thought of as the density
    of ideas in a text. It is computed as:

    .. math::
        WVI = \displaystyle\frac{\log\left(n(w)\right)}
        {\log\left(2 - \frac{\log(n(vw))}{\log(n(w))}\right)}

    Where `n(w)` is the number of words in the text, and `n(vw)` is
    the number of unique words in the text.

    :param doc: Document to be processed
    :type doc: Doc
    :return: Word variation index
    :rtype: float
    """
    token_list: List[str] = []
    for token in doc:
        if is_word(token.pos_):
            token_list.append(token.lemma_)

    number_of_words = len(token_list)
    number_of_types = len(set(token_list))
    return np.log(number_of_words) / np.log(2 - np.log(number_of_types) /
                                            np.log(number_of_words))
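As a quick sanity check, a text with 100 word tokens and 60 unique lemmas (illustrative numbers) yields:

import numpy as np

n_w, n_vw = 100, 60  # illustrative token and type counts
wvi = np.log(n_w) / np.log(2 - np.log(n_vw) / np.log(n_w))
print(round(wvi, 1))  # ~43.8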
Example #4
def yule_k(doc: Doc) -> float:
    r"""Compute Yule's K from a text.

    Yule's K is defined as follows :cite:`yule2014statistical`:

    .. math::
        K = 10^{4}\displaystyle\frac{\sum_{r}{r^2 V_r} - N}{N^2}

    Where `V_r` is the number of types occurring `r` times and `N` is
    the total number of tokens. This is a measurement of lexical
    diversity.

    :param doc: Processed spaCy Doc
    :type doc: Doc
    :return: Texts' Yule's K
    :rtype: float
    """
    counts: Dict[str, int] = defaultdict(int)
    N: int = 0
    for token in doc:
        if is_word(token.pos_):
            counts[token.lemma_] += 1
            N += 1

    rs: Dict[int, int] = defaultdict(int)
    for value in counts.values():
        rs[value] += 1

    return 1e4 * (sum(r**2 * vr for r, vr in rs.items()) - N) / N**2
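A quick check of the formula on a toy frequency spectrum (counts invented for illustration): with eight lemmas occurring once and two occurring twice, N = 12 and the sum of r^2 * V_r is 16, so K = 1e4 * (16 - 12) / 144, roughly 277.8.

rs = {1: 8, 2: 2}  # toy spectrum: eight lemmas occur once, two occur twice
N = sum(r * vr for r, vr in rs.items())  # 12 tokens in total
print(1e4 * (sum(r**2 * vr for r, vr in rs.items()) - N) / N**2)  # ~277.78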
Example #5
def d_estimate(doc: Doc,
               min_range: int = 35,
               max_range: int = 50,
               trials: int = 5) -> float:
    r"""Compute D measurement for lexical diversity.

    The measurement is based on :cite:`richards2000measuring`. We draw
    samples of ``n`` tokens, varying ``n`` from ``min_range`` up to
    ``max_range``. For each ``n`` we do the following:

    1. Sample ``n`` tokens without replacement
    2. Compute ``TTR``
    3. Repeat steps 1 and 2 ``trials`` times
    4. Compute the average ``TTR``

    At this point, we have a set of points ``(n, ttr)``. We then fit
    these observations to the following model:

    .. math::
        TTR = \displaystyle\frac{D}{N}\left[\sqrt{1 + 2\frac{N}{D}} - 1\right]

    The fit is done to estimate the ``D`` parameter, using least squares
    as the criterion. Squaring and rearranging the model gives the linear
    relation :math:`TTR^2 = 2D(1 - TTR)/N`, which is the system the code
    below solves for ``D``.

    :param doc: SpaCy doc of the text.
    :type doc: Doc
    :param min_range: Lower bound for n, defaults to 35
    :type min_range: int, optional
    :param max_range: Upper bound for n, defaults to 50
    :type max_range: int, optional
    :param trials: Number of trials to estimate TTR, defaults to 5
    :type trials: int, optional
    :raises ValueError: If invalid range is provided.
    :return: D metric
    :rtype: float
    """
    if min_range >= max_range:
        raise ValueError("max_range should be greater than min_range; "
                         f"you provided [{min_range}, {max_range}]")
    token_list: List[str] = []
    for token in doc:
        if is_word(token.pos_):
            token_list.append(token.lemma_)

    ns = np.arange(min_range, max_range + 1)
    ttrs = np.zeros(len(ns))
    for idx, sample_size in enumerate(ns):
        ttr = 0
        for _ in range(trials):
            word_list = np.random.choice(token_list,
                                         sample_size,
                                         replace=False)
            ttr += type_token_ratio(word_list)
        ttrs[idx] = ttr / trials
    A = np.vstack([2 * (1 - ttrs) / ns]).T
    y = ttrs**2
    d = np.linalg.lstsq(A, y, rcond=None)[0]
    return d[0]
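A usage sketch; the document must contain at least ``max_range`` word tokens, since sampling is done without replacement (the model and file name are assumptions):

import spacy

nlp = spacy.load("es_core_news_sm")  # assumed Spanish pipeline
with open("texto.txt") as fp:  # hypothetical input file
    doc = nlp(fp.read())
print(d_estimate(doc, min_range=35, max_range=50, trials=5))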
Example #6
def word_count(doc):
    """Return number of words in a text.

    :param doc: Text to be processed.
    :type doc: Spacy Doc
    :return: Word count
    :rtype: int
    """
    return sum(1 for token in doc if is_word(token.pos_))
Example #7
def test_pos_booleans():
    """Test POS boolean methods."""
    assert utils.is_adjective("ADJ")
    assert utils.is_adverb("ADV")
    assert utils.is_noun("NOUN")
    assert utils.is_noun("PROPN")
    assert utils.is_pronoun("PRON")
    assert utils.is_verb("VERB")
    assert utils.is_word("NOUN")
Example #8
def lexical_diversity_mtld(doc, ttr_segment=0.72):
    """Compute MTLD lexical diversity in a bi-directional fashion.

    :param doc: Processed text
    :type doc: Spacy Doc
    :param ttr_segment: Threshold for TTR mean computation
    :type ttr_segment: float
    :return: Bi-directional lexical diversity MTLD
    :rtype: float
    """
    word_list = []
    for token in doc:
        if is_word(token.pos_):
            word_list.append(token.lemma_)
    return (one_side_lexical_diversity_mtld(word_list, ttr_segment) +
            one_side_lexical_diversity_mtld(word_list[::-1], ttr_segment)) / 2
Example #9
def lexical_diversity_mtld(doc, model_name="spacy", ttr_segment=0.72):
    """Compute MTLD lexical diversity in a bi-directional fashion.

    :param doc: Processed text
    :type doc: NLP Doc
    :param model_name: Determines which model is used (spacy or stanza)
    :type model_name: str
    :param ttr_segment: Threshold for TTR mean computation
    :type ttr_segment: float
    :return: Bi-directional lexical diversity MTLD
    :rtype: float
    """
    # check model
    model = SupportedModels(model_name)

    word_list = []
    if model == SupportedModels.SPACY:
        for token in doc:
            if is_word(token.pos_):
                word_list.append(token.lemma_)
    elif model == SupportedModels.STANZA:
        for sent in doc.sentences:
            for word in sent.words:
                if is_word(word.upos):
                    word_list.append(word.lemma)
    return (one_side_lexical_diversity_mtld(word_list, model, ttr_segment) +
            one_side_lexical_diversity_mtld(word_list[::-1], model,
                                            ttr_segment)) / 2
Example #10
def connection_words_ratio(doc):
    """Get ratio of connecting words over total words of text.

    This function computes the ratio of connective words over the total
    number of words. This implementation is only supported in Spanish and
    we consider the following lemmas: ``y``, ``o``, ``no``, ``si``.

    :param doc: Tokenized text
    :type doc: Spacy Doc
    :return: Connection word ratio
    :rtype: float
    """
    return sum(
        1 for token in doc
        if token.lemma_.lower() in {"y", "o", "no", "si"}
        and is_word(token.pos_)
    ) / word_count(doc)
Example #11
def negation_density(doc):
    """Compute negation density.

    This is defined as the ratio of occurrences of
    ``TRUNAJOD.surface_proxies.NEGATION_WORDS`` in the text to the
    total word count.

    :param doc: Tokenized text
    :type doc: Spacy Doc
    :return: Negation density
    :rtype: float
    """
    negation_count = 0
    for token in doc:
        if is_word(token.pos_) and token.lemma_.lower() in NEGATION_WORDS:
            negation_count += 1

    return negation_count / word_count(doc)
Example #12
def pos_ratio(doc, pos_types):
    """Compute POS ratio given desired type of ratio.

    The ``pos_types`` might be a regular expression if a composed ratio
    is needed. An example of usage would be ``pos_ratio(doc, "VERB|AUX")``.

    :param doc: Spacy processed text
    :type doc: Spacy Doc
    :param pos_types: POS to get the ratio
    :type pos_types: string
    :return: Ratio over number of words
    :rtype: float
    """
    pos_regex = re.compile(pos_types)
    total_words = 0
    total_pos_tags = 0
    for token in doc:
        if is_word(token.pos_):
            total_words += 1
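            # Match against the fine-grained tag, which in the Spanish
            # spaCy models embeds the coarse POS (e.g. "VERB__Mood=Ind...").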
            if pos_regex.search(token.tag_):
                total_pos_tags += 1
    return total_pos_tags / total_words
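For example, the composed verb ratio mentioned in the docstring (the model name is an assumption):

import spacy

nlp = spacy.load("es_core_news_sm")  # assumed Spanish pipeline
doc = nlp("Los niños estaban jugando y habían comido temprano.")
print(pos_ratio(doc, "VERB|AUX"))  # fraction of words tagged VERB or AUX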
Example #13
def pronoun_density(doc):
    """Compute pronoun density.

    This is a measurement of text complexity, in the sense that a text
    with a higher pronoun density is harder to read than one with a
    lower pronoun density, due to the inferences needed to resolve the
    pronouns. It is computed as the ratio of third-person pronouns to
    total words in the text.

    :param doc: Document to be processed.
    :type doc: Spacy Doc
    :return: Pronoun density
    :rtype: float
    """
    word_counter = 0
    third_person_pronouns = 0
    for token in doc:
        if is_word(token.pos_):
            word_counter += 1
            if is_pronoun(token.pos_) and THIRD_PERSON_LABEL in token.tag_:
                third_person_pronouns += 1

    return float(third_person_pronouns) / word_counter
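The density and ratio metrics above share the same calling convention; a combined sketch (assumed Spanish pipeline, illustrative sentence):

import spacy

nlp = spacy.load("es_core_news_sm")  # assumed model
doc = nlp("Ella no vino porque él y su hermano no la llamaron.")
print(connection_words_ratio(doc))
print(negation_density(doc))
print(pronoun_density(doc))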