Example no. 1
def analyze_anaphore_overlap(prev_sentence: Span,
                             cur_sentence: Span,
                             language: str = 'es') -> int:
    '''
    This function analyzes whether there is anaphora overlap between two sentences.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    int: 1 if there's overlap between the two sentences, 0 otherwise.
    '''
    # Place the tokens in a set for O(1) membership checks
    prev_sentence_pronoun_tokens = {
        token.text.lower()
        for token in prev_sentence if is_word(token) and token.pos_ == 'PRON'
    }

    for token in cur_sentence:
        if language == 'es':
            if (is_word(token) and token.pos_ == 'PRON'
                    and token.text.lower() in prev_sentence_pronoun_tokens):
                return 1  # There's cohesion

    return 0  # No cohesion
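A minimal usage sketch for this analyzer (the same pattern applies to analyze_stem_overlap and analyze_argument_overlap below), assuming a Spanish spaCy model such as es_core_news_md is installed; the model name and sample text are illustrative only:

import spacy

nlp = spacy.load('es_core_news_md')  # assumed Spanish pipeline
doc = nlp('Ella llegó tarde. Ella pidió disculpas.')
sentences = list(doc.sents)

# Compare each sentence against the one before it
for prev, cur in zip(sentences, sentences[1:]):
    print(analyze_anaphore_overlap(prev, cur, language='es'))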
Example no. 2
    def get_noun_incidence(self,
                           text: str,
                           word_count: int = None,
                           workers: int = -1) -> float:
        '''
        This method calculates the incidence of nouns in a text per {self._incidence} words.

        Parameters:
        text(str): The text to be analyzed.
        word_count(int): The number of words in the text.
        workers(int): Number of threads that will perform this operation. If it's -1, all CPU cores will be used.

        Returns:
        float: The incidence of nouns per {self._incidence} words.
        '''
        noun_counter = lambda doc: sum(1 for token in doc if is_word(token) and
                                       token.pos_ in ['NOUN', 'PROPN'])
        disable_pipeline = [
            pipe for pipe in self._nlp.pipe_names
            if pipe not in ['tagger', 'feature counter']
        ]

        return self._get_word_type_incidence(text,
                                             disable_pipeline=disable_pipeline,
                                             counter_function=noun_counter,
                                             workers=workers)
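The helper _get_word_type_incidence is not shown in these examples. As a rough sketch of the convention, an incidence per N words is the raw count scaled to that window; the function below is hypothetical, not the project's implementation:

# Hypothetical sketch: incidence of a word type per `incidence_base` words.
def word_type_incidence(count: int, word_count: int,
                        incidence_base: int = 1000) -> float:
    return 0.0 if word_count == 0 else count / word_count * incidence_base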
Example no. 3
    def get_personal_pronoun_third_person_plural_form_incidence(
            self,
            text: str,
            word_count: int = None,
            workers: int = -1) -> float:
        '''
        This method calculates the incidence of personal pronouns in third person and plural form in a text per {self._incidence} words.

        Parameters:
        text(str): The text to be analyzed.
        word_count(int): The number of words in the text.
        workers(int): Number of threads that will perform this operation. If it's -1, all CPU cores will be used.

        Returns:
        float: The incidence of personal pronouns in third person and plural form per {self._incidence} words.
        '''
        if self.language == 'es':
            pronoun_counter = lambda doc: sum(
                1 for token in doc
                if is_word(token) and token.pos_ == 'PRON' and
                'Number=Plur' in token.tag_ and 'Person=3' in token.tag_)
        else:  # Fail fast instead of hitting a NameError below
            raise ValueError(f'Language "{self.language}" is not supported.')

        disable_pipeline = [
            pipe for pipe in self._nlp.pipe_names
            if pipe not in ['tagger', 'feature counter']
        ]

        return self._get_word_type_incidence(text,
                                             disable_pipeline=disable_pipeline,
                                             counter_function=pronoun_counter,
                                             workers=workers)
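The 'Number=Plur' and 'Person=3' substring checks assume morphology is packed into token.tag_, as in older spaCy Spanish models (tags like 'PRON__Number=Plur|Person=3|PronType=Prs'). On spaCy v3 pipelines the equivalent check would normally go through token.morph; a sketch under that assumption:

from spacy.tokens import Token

# Sketch for spaCy v3, where morphology lives in token.morph
# rather than in token.tag_ (verify against the installed model).
def is_third_person_plural_pronoun(token: Token) -> bool:
    return (token.pos_ == 'PRON'
            and token.morph.get('Number') == ['Plur']
            and token.morph.get('Person') == ['3'])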
Example no. 4

    def get_type_token_ratio_between_all_words(self, text: str, workers: int = -1) -> float:
        """
        This method returns the type token ratio between all words of a text.

        Parameters:
        text(str): The text to be anaylized.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        float: The type token ratio between all words of a text.
        """
        if len(text) == 0:
            raise ValueError('The text is empty.')
        elif workers == 0 or workers < -1:
            raise ValueError('Workers must be -1 or a positive integer.')
        else:
            paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
            threads = multiprocessing.cpu_count() if workers == -1 else workers
            disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe != 'tagger']

            tokens = [token.text.lower()
                      for doc in self._nlp.pipe(paragraphs, batch_size=threads, disable=disable_pipeline, n_process=threads)
                      for token in doc
                      if is_word(token)]

            return 0 if len(tokens) == 0 else len(set(tokens)) / len(tokens)
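For reference, the type-token ratio is just unique types over total tokens; a tiny worked example with made-up tokens:

# Worked example: 4 tokens, 3 distinct types -> TTR = 3 / 4 = 0.75
tokens = ['el', 'gato', 'y', 'el']
assert len(set(tokens)) / len(tokens) == 0.75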
Example no. 5
    def get_syllables_per_word(self,
                               text: str,
                               workers=-1) -> StatisticsResults:
        """
        This method returns the average amount and standard deviation of syllables in each word.

        Parameters:
        text(str): The text to be anaylized.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        StatisticsResults: The mean and standard deviation of the amount in syllables in each word.
        """
        count_syllables_per_word = lambda doc: [
            len(token._.syllables) for token in doc
            if is_word(token) and token._.syllables is not None
        ]

        disable_pipeline = [
            pipe for pipe in self._nlp.pipe_names
            if pipe not in ['syllable splitter', 'feature counter']
        ]

        return self._get_mean_std_of_metric(
            text,
            disable_pipeline=disable_pipeline,
            counter_function=count_syllables_per_word,
            statistic_type='all',
            workers=workers)
Example no. 6
    def get_length_of_sentences(self,
                                text: str,
                                workers: int = -1) -> StatisticsResults:
        """
        This method returns the average amount and standard deviation of words in each sentence.

        Parameters:
        text(str): The text to be anaylized.
        language(str): The language of the text to be analyzed.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        StatisticsResults: The mean and standard deviation of the amount in words in each sentence.
        """
        count_length_of_sentences = lambda doc: [
            sum(1 for token in sentence if is_word(token))
            for sentence in doc.sents
        ]

        disable_pipeline = [
            pipe for pipe in self._nlp.pipe_names
            if pipe not in ['sentencizer', 'feature counter']
        ]

        return self._get_mean_std_of_metric(
            text,
            disable_pipeline=disable_pipeline,
            counter_function=count_length_of_sentences,
            statistic_type='all',
            workers=workers)
Example no. 7
def analyze_stem_overlap(prev_sentence: Span,
                         cur_sentence: Span,
                         language: str = 'es') -> int:
    '''
    This function analyzes whether there is stem overlap between two sentences.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    int: 1 if there's overlap between the two sentences, 0 otherwise.
    '''
    # Place the lemmas in a set for O(1) membership checks
    prev_sentence_content_stem_tokens = {
        token.lemma_.lower()
        for token in prev_sentence if is_content_word(token)
    }

    for token in cur_sentence:
        if language == 'es':
            if (is_word(token) and token.pos_ in ['NOUN', 'PROPN'] and
                    token.lemma_.lower() in prev_sentence_content_stem_tokens):
                return 1  # There's cohesion

    return 0  # No cohesion
Example no. 8

    def __call__(self, doc: Doc) -> Doc:
        '''
        This method finds the syllables for each token that is a word.

        Parameters:
        doc(Doc): A Spacy document.

        Returns:
        Doc: The same document, with the syllables stored on each word token.
        '''
        for token in doc: # Iterate every token
            if is_word(token):
                token._.syllables = self._dic.inserted(token.text).split('-')
        
        return doc
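For this component to run, the 'syllables' token extension must be registered beforehand, and self._dic behaves like a pyphen dictionary (pyphen's inserted() returns the word with '-' at the hyphenation points). A minimal setup sketch, assuming pyphen is the hyphenation backend:

import pyphen
from spacy.tokens import Token

# Register the custom attribute once, before the pipe runs
if not Token.has_extension('syllables'):
    Token.set_extension('syllables', default=None)

dic = pyphen.Pyphen(lang='es')
print(dic.inserted('palabra').split('-'))  # ['pa', 'la', 'bra']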
Example no. 9
def analyze_argument_overlap(prev_sentence: Span,
                             cur_sentence: Span,
                             language: str = 'es') -> int:
    '''
    This function analyzes whether there is argument overlap between two sentences.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    int: 1 if there's overlap between the two sentences, 0 otherwise.
    '''
    # Place the tokens in sets for O(1) membership checks
    prev_sentence_noun_tokens = {
        token.lemma_.lower()
        for token in prev_sentence if is_word(token) and token.pos_ == 'NOUN'
    }

    prev_sentence_personal_pronouns_tokens = {
        token.text.lower()
        for token in prev_sentence
        if is_word(token) and 'PronType=Prs' in token.tag_
    }

    for token in cur_sentence:  # Iterate every token of the current sentence
        if language == 'es':
            if (is_word(token) and token.pos_ == 'NOUN' and
                    token.lemma_.lower() in prev_sentence_noun_tokens):
                return 1  # There's cohesion by noun lemma

            if (is_word(token) and 'PronType=Prs' in token.tag_ and
                    token.text.lower() in prev_sentence_personal_pronouns_tokens):
                return 1  # There's cohesion by personal pronoun

    return 0  # No cohesion
Example no. 10
def amount_of_words_before_main_verb(sentence: Span) -> int:
    '''
    This function counts the number of words before the main verb of a sentence.

    Parameters:
    sentence(Span): The sentence to analyze, identified by a Spacy span.

    Returns:
    int: The number of words before the main verb of the sentence.
    '''
    left_words = []

    for token in sentence:
        if token.pos_ in ['VERB', 'AUX'] and token.dep_ == 'ROOT':
            break  # The main verb was found

        if is_word(token):
            left_words.append(token.text)

    return len(left_words)
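A quick usage sketch, assuming a Spanish model with a dependency parser (the model name and sentence are illustrative):

import spacy

nlp = spacy.load('es_core_news_md')  # assumed model
doc = nlp('Ayer por la tarde el perro ladró.')
sentence = next(doc.sents)
# If the parser marks 'ladró' as ROOT, the six preceding words are counted
print(amount_of_words_before_main_verb(sentence))  # expected: 6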
Example no. 11
    def get_word_count_from_text(self, text: str, workers: int = -1) -> int:
        """
        This method counts how many words a text has.

        Parameters:
        text(str): The text to be anaylized.
        workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

        Returns:
        int: The amount of words.
        """
        if len(text) == 0:
            raise ValueError('The text is empty.')
        elif workers == 0 or workers < -1:
            raise ValueError('Workers must be -1 or a positive integer.')
        else:
            paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
            threads = multiprocessing.cpu_count() if workers == -1 else workers
            word_counter = lambda doc: sum(1 for token in doc
                                           if is_word(token))
            disable_pipeline = [
                pipe for pipe in self._nlp.pipe_names
                if pipe != 'feature counter'
            ]
            self._nlp.get_pipe(
                'feature counter').counter_function = word_counter

            total_words = sum(
                doc._.feature_count
                for doc in self._nlp.pipe(paragraphs,
                                          batch_size=threads,
                                          disable=disable_pipeline,
                                          n_process=threads))

            return total_words
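The 'feature counter' pipe used above is project-specific and not shown in these examples. Judging only from how it is called here (a pluggable counter_function attribute and a doc._.feature_count extension), a hypothetical spaCy v3-style reconstruction could look like this:

from spacy.language import Language
from spacy.tokens import Doc

if not Doc.has_extension('feature_count'):
    Doc.set_extension('feature_count', default=0)

class FeatureCounter:
    '''Hypothetical sketch: applies a pluggable counter to each document.'''

    def __init__(self):
        self.counter_function = lambda doc: 0  # Replaced by the caller

    def __call__(self, doc: Doc) -> Doc:
        doc._.feature_count = self.counter_function(doc)
        return doc

@Language.factory('feature counter')
def create_feature_counter(nlp, name):
    return FeatureCounter()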