def analyze_anaphore_overlap(prev_sentence: Span, cur_sentence: Span,
                             language: str = 'es') -> int:
    '''
    This function analyzes whether or not there's anaphora overlap between
    two sentences.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    int: 1 if there's overlap between the two sentences and 0 otherwise.
    '''
    # Place the pronouns of the previous sentence in a dictionary for
    # search efficiency.
    prev_sentence_pronoun_tokens = {
        token.text.lower(): None
        for token in prev_sentence
        if is_word(token) and token.pos_ == 'PRON'
    }

    for token in cur_sentence:
        if language == 'es':
            # A pronoun of the current sentence repeats a pronoun of the
            # previous sentence.
            if is_word(token) and token.pos_ == 'PRON' and token.text.lower() in prev_sentence_pronoun_tokens:
                return 1  # There's cohesion

    return 0  # No cohesion
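# A minimal usage sketch for analyze_anaphore_overlap. The model name
# 'es_core_news_sm' is an assumption (any Spanish spaCy model that assigns
# `pos_` and sentence boundaries works), and `is_word` is this repo's own
# token filter.
#
#     import spacy
#     nlp = spacy.load('es_core_news_sm')
#     doc = nlp('Ella llegó temprano. Ella trajo el informe.')
#     prev, cur = list(doc.sents)
#     analyze_anaphore_overlap(prev, cur)  # -> 1: the pronoun 'ella' repeats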
def get_syllables_per_word(self, text: str, workers: int = -1) -> StatisticsResults:
    """
    This method returns the average amount and standard deviation of
    syllables in each word.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    StatisticsResults: The mean and standard deviation of the amount of syllables in each word.
    """
    count_syllables_per_word = lambda doc: [
        len(token._.syllables) for token in doc
        if is_word(token) and token._.syllables is not None
    ]
    disable_pipeline = [
        pipe for pipe in self._nlp.pipe_names
        if pipe not in ['syllable splitter', 'feature counter']
    ]

    return self._get_mean_std_of_metric(
        text,
        disable_pipeline=disable_pipeline,
        counter_function=count_syllables_per_word,
        statistic_type='all',
        workers=workers)
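# Hedged usage sketch: `analyzer` stands for an instance of the class that
# owns this method, whose pipeline registers the custom 'syllable splitter'
# and 'feature counter' components; the `mean`/`std` attribute names on
# StatisticsResults are assumptions.
#
#     results = analyzer.get_syllables_per_word('El gato duerme en la alfombra.')
#     print(results.mean, results.std)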
def get_length_of_sentences(self, text: str, workers: int = -1) -> StatisticsResults:
    """
    This method returns the average amount and standard deviation of words
    in each sentence.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    StatisticsResults: The mean and standard deviation of the amount of words in each sentence.
    """
    count_length_of_sentences = lambda doc: [
        len([1 for token in sentence if is_word(token)])
        for sentence in doc.sents
    ]
    disable_pipeline = [
        pipe for pipe in self._nlp.pipe_names
        if pipe not in ['sentencizer', 'feature counter']
    ]

    return self._get_mean_std_of_metric(
        text,
        disable_pipeline=disable_pipeline,
        counter_function=count_length_of_sentences,
        statistic_type='all',
        workers=workers)
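# Hedged usage sketch, under the same `analyzer` assumption as above; here
# the 'sentencizer' pipe is kept enabled so the text is split into sentences
# before counting.
#
#     results = analyzer.get_length_of_sentences('Llueve. Hace frío hoy.')
#     print(results.mean)  # average words per sentence: (1 + 3) / 2 = 2.0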
def analyze_stem_overlap(prev_sentence: Span, cur_sentence: Span,
                         language: str = 'es') -> int:
    '''
    This function analyzes whether or not there's stem overlap between two
    sentences.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    int: 1 if there's overlap between the two sentences and 0 otherwise.
    '''
    # Place the lemmas of the previous sentence's content words in a
    # dictionary for search efficiency.
    prev_sentence_content_stem_tokens = {
        token.lemma_.lower(): None
        for token in prev_sentence
        if is_content_word(token)
    }

    for token in cur_sentence:
        if language == 'es':
            # A noun of the current sentence shares a lemma with a content
            # word of the previous sentence.
            if is_word(token) and token.pos_ in ['NOUN', 'PROPN'] and token.lemma_.lower() in prev_sentence_content_stem_tokens:
                return 1  # There's cohesion

    return 0  # No cohesion
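# Hedged sketch: stem overlap fires when a noun in the current sentence
# shares a lemma with any content word of the previous one. The model name
# is an assumption, as before.
#
#     doc = nlp('El perro corre por el parque. Los perros son veloces.')
#     prev, cur = list(doc.sents)
#     analyze_stem_overlap(prev, cur)  # -> 1: 'perros' shares lemma 'perro'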
def __call__(self, doc: Doc) -> Doc:
    '''
    This method will find the syllables for each token that is a word.

    Parameters:
    doc(Doc): A Spacy document.

    Returns:
    Doc: The same document, with the syllables of each word token stored in
    the 'syllables' custom extension.
    '''
    for token in doc:  # Iterate over every token
        if is_word(token):
            # Hyphenate the word and split on the hyphens to get syllables.
            token._.syllables = self._dic.inserted(token.text).split('-')

    return doc
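# Hedged sketch of wiring this component into a pipeline (spaCy 2.x style
# `add_pipe`; the extension name, pipe name, and `syllable_splitter` variable
# are assumptions, and `self._dic` appears to be a pyphen hyphenation
# dictionary).
#
#     from spacy.tokens import Token
#     Token.set_extension('syllables', default=None)
#     nlp.add_pipe(syllable_splitter, name='syllable splitter')
#     doc = nlp('paralelepípedo')
#     doc[0]._.syllables  # e.g. ['pa', 'ra', 'le', 'le', 'pí', 'pe', 'do']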
def analyze_argument_overlap(prev_sentence: Span, cur_sentence: Span,
                             language: str = 'es') -> int:
    '''
    This function analyzes whether or not there's argument overlap between
    two sentences.

    Parameters:
    prev_sentence(Span): The previous sentence to analyze.
    cur_sentence(Span): The current sentence to analyze.
    language(str): The language of the sentences.

    Returns:
    int: 1 if there's overlap between the two sentences and 0 otherwise.
    '''
    # Place the tokens of the previous sentence in dictionaries for search
    # efficiency.
    prev_sentence_noun_tokens = {
        token.lemma_.lower(): None
        for token in prev_sentence
        if is_word(token) and token.pos_ == 'NOUN'
    }
    prev_sentence_personal_pronouns_tokens = {
        token.text.lower(): None
        for token in prev_sentence
        if is_word(token) and 'PronType=Prs' in token.tag_
    }

    for token in cur_sentence:  # Iterate over every token of the current sentence
        if language == 'es':
            if is_word(token) and token.pos_ == 'NOUN' and token.lemma_.lower() in prev_sentence_noun_tokens:
                return 1  # There's cohesion by noun lemma

            if is_word(token) and 'PronType=Prs' in token.tag_ and token.text.lower() in prev_sentence_personal_pronouns_tokens:
                return 1  # There's cohesion by personal pronoun

    return 0  # No cohesion
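# Hedged sketch: argument overlap fires on either a repeated noun lemma or a
# repeated personal pronoun. The model name is an assumption, as before.
#
#     doc = nlp('Juan compró un libro. El libro era caro.')
#     prev, cur = list(doc.sents)
#     analyze_argument_overlap(prev, cur)  # -> 1: the noun 'libro' repeats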
def get_verb_incidence(self, text: str, word_count: int = None, workers: int = -1) -> float:
    '''
    This method calculates the incidence of verbs in a text per {self._incidence} words.

    Parameters:
    text(str): The text to be analyzed.
    word_count(int): The amount of words in the text.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The incidence of verbs per {self._incidence} words.
    '''
    verb_counter = lambda doc: sum(
        1 for token in doc if is_word(token) and token.pos_ == 'VERB')
    disable_pipeline = [
        pipe for pipe in self._nlp.pipe_names
        if pipe not in ['tagger', 'feature counter']
    ]

    return self._get_word_type_incidence(
        text,
        disable_pipeline=disable_pipeline,
        counter_function=verb_counter,
        workers=workers)
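# Hedged usage sketch, under the same `analyzer` assumption as above. The
# result is scaled per {self._incidence} words, a configuration value of the
# owning class (e.g. per 1000 words).
#
#     analyzer.get_verb_incidence('Los niños corren y saltan en el patio.')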
def get_personal_pronoun_third_person_plural_form_incidence(self, text: str, word_count: int = None, workers: int = -1) -> float:
    '''
    This method calculates the incidence of personal pronouns in third person and plural form in a text per {self._incidence} words.

    Parameters:
    text(str): The text to be analyzed.
    word_count(int): The amount of words in the text.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The incidence of personal pronouns in third person and plural form per {self._incidence} words.
    '''
    if self.language == 'es':
        pronoun_counter = lambda doc: sum(
            1 for token in doc
            if is_word(token) and token.pos_ == 'PRON'
            and 'Number=Plur' in token.tag_ and 'Person=3' in token.tag_)
        disable_pipeline = [
            pipe for pipe in self._nlp.pipe_names
            if pipe not in ['tagger', 'feature counter']
        ]

        return self._get_word_type_incidence(
            text,
            disable_pipeline=disable_pipeline,
            counter_function=pronoun_counter,
            workers=workers)
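# Hedged usage sketch, under the same `analyzer` assumption as above. Note
# that for a language other than 'es' this method falls through and
# implicitly returns None.
#
#     analyzer.get_personal_pronoun_third_person_plural_form_incidence(
#         'Ellos estudian juntos y ellas también.')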
def amount_of_words_before_main_verb(sentence: Span) -> int:
    '''
    This function counts the amount of words before the main verb of a sentence.

    Parameters:
    sentence(Span): The sentence to analyze, identified by a Spacy span.

    Returns:
    int: The amount of words before the main verb of a sentence.
    '''
    left_words = []
    for token in sentence:
        # Stop at the main verb: a VERB or AUX token that is the root of the
        # dependency tree.
        if token.pos_ in ['VERB', 'AUX'] and token.dep_ == 'ROOT':
            break
        elif is_word(token):
            left_words.append(token.text)

    return len(left_words)
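# Hedged sketch: the count stops at the first VERB/AUX token whose dependency
# label is ROOT. The model name is an assumption, and the expected value only
# holds if the parser picks 'leyó' as the root.
#
#     doc = nlp('Ayer por la tarde María leyó una novela.')
#     sentence = next(doc.sents)
#     amount_of_words_before_main_verb(sentence)  # -> 5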
def get_word_count_from_text(self, text: str, workers: int = -1) -> int:
    """
    This method counts how many words a text has.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    int: The amount of words.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive number.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        word_counter = lambda doc: sum(1 for token in doc if is_word(token))
        disable_pipeline = [
            pipe for pipe in self._nlp.pipe_names if pipe != 'feature counter'
        ]
        self._nlp.get_pipe('feature counter').counter_function = word_counter
        total_words = sum(
            doc._.feature_count
            for doc in self._nlp.pipe(paragraphs,
                                      batch_size=threads,
                                      disable=disable_pipeline,
                                      n_process=threads))

        return total_words
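# Hedged usage sketch: paragraphs are assumed to be separated by blank lines
# (via split_text_into_paragraphs), and punctuation is excluded by is_word.
#
#     analyzer.get_word_count_from_text('Hola mundo.\n\nSegundo párrafo aquí.')
#     # -> 5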
def get_type_token_ratio_between_all_words(self, text: str, workers: int = -1) -> float:
    """
    This method returns the type token ratio between all words of a text.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The type token ratio between all words of a text.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive number.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        disable_pipeline = [
            pipe for pipe in self._nlp.pipe_names if pipe != 'tagger'
        ]
        tokens = [
            token.text.lower()
            for doc in self._nlp.pipe(paragraphs,
                                      batch_size=threads,
                                      disable=disable_pipeline,
                                      n_process=threads)
            for token in doc if is_word(token)
        ]

        return 0 if len(tokens) == 0 else len(set(tokens)) / len(tokens)
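# Hedged usage sketch: types are lowercased surface forms, so the ratio for
# the text below is 4 unique words ('el', 'gato', 'y', 'perro') over 5 total.
#
#     analyzer.get_type_token_ratio_between_all_words('El gato y el perro.')
#     # -> 0.8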