def _get_syntactic_pattern_density(self, text: str, disable_pipeline: List, sp_counter_function: Callable = None, word_count: int = None, workers: int = -1) -> float:
    '''
    This method obtains the incidence of a syntactic pattern that exists in a text per {self._incidence} words.

    Parameters:
    text(str): The text to be analyzed.
    disable_pipeline(List): The pipeline elements to be disabled.
    sp_counter_function(Callable): The function that counts a syntactic pattern for a spaCy document. It returns an integer.
    word_count(int): The amount of words in the text.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The incidence of a syntactic pattern per {self._incidence} words.
    '''
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Find all paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        wc = word_count if word_count is not None else self._di.get_word_count_from_text(text)
        self._nlp.get_pipe('feature counter').counter_function = sp_counter_function
        density = sum(doc._.feature_count
                      for doc in self._nlp.pipe(paragraphs,
                                                batch_size=threads,
                                                disable=disable_pipeline,
                                                n_process=threads))  # Calculate with multiprocessing

        return (density / wc) * self._incidence
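# A minimal usage sketch for the helper above: a hypothetical public metric that
# measures the density of negation words. The method name 'get_negation_density'
# and the word list are illustrative assumptions, not part of the original code.
def get_negation_density(self, text: str, workers: int = -1) -> float:
    negation_words = {'no', 'nunca', 'jamás', 'tampoco'}  # Assumed lexical list
    negation_counter = lambda doc: sum(1 for token in doc
                                       if token.text.lower() in negation_words)
    # Only the custom counter component is needed for a purely lexical count
    disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                        if pipe != 'feature counter']
    return self._get_syntactic_pattern_density(text,
                                               disable_pipeline=disable_pipeline,
                                               sp_counter_function=negation_counter,
                                               workers=workers)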
async def convert_pdf_to_txt(pdf_path: str, save_dir: str) -> None:
    """
    This function converts a pdf file to a txt file and cleans the text.

    Parameters:
    pdf_path(str): The path where the pdf to convert is located.
    save_dir(str): The path where to save the converted pdf.

    Returns:
    None
    """
    if not hasattr(convert_pdf_to_txt, 'nlp'):  # Load the spaCy model only once
        convert_pdf_to_txt.nlp = spacy.load(ACCEPTED_LANGUAGES['es'])
        convert_pdf_to_txt.nlp.add_pipe(convert_pdf_to_txt.nlp.create_pipe('sentencizer'))

    tika.initVM()
    pdf_file = parser.from_file(pdf_path)

    async with AIOFile(save_dir, 'w') as text_file:
        doc = convert_pdf_to_txt.nlp(pdf_file['content'])
        # Fix sentences that have more newlines than they should
        text = ''.join(re.sub(r'[,;]\n+\b', '\n', re.sub(r'\b\n+\b', '\n', s.text))
                       for s in doc.sents)
        paragraphs = split_text_into_paragraphs(text)  # Eliminate extra newlines between paragraphs
        new_text = '\n\n'.join(paragraphs)
        new_text = re.sub(r'-\s*\n+', '', new_text)  # Join words that were hyphenated across a line break
        await text_file.write(new_text)
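# A usage sketch for the coroutine above, run from a synchronous entry point.
# The file paths are hypothetical; Tika starts (or connects to) its Java server
# on the first call, so the initial conversion can take a few seconds.
import asyncio

asyncio.run(convert_pdf_to_txt('article.pdf', 'article.txt'))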
def get_type_token_ratio_of_content_words(self, text: str, workers: int = -1) -> float:
    """
    This method returns the type-token ratio of the content words of a text. Content words are nouns, verbs, adjectives and adverbs.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The type-token ratio of the content words of a text.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe != 'tagger']
        tokens = [token.text.lower()
                  for doc in self._nlp.pipe(paragraphs,
                                            batch_size=threads,
                                            disable=disable_pipeline,
                                            n_process=threads)
                  for token in doc
                  if is_content_word(token)]

        return 0 if len(tokens) == 0 else len(set(tokens)) / len(tokens)
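# 'is_content_word' is imported from a utility module and is not shown in this
# section; a minimal sketch consistent with the docstring above (content words
# are nouns, verbs, adjectives and adverbs), using spaCy's universal POS tags:
def is_content_word(token) -> bool:
    return token.is_alpha and token.pos_ in ('NOUN', 'VERB', 'ADJ', 'ADV')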
def _get_mean_std_of_metric(self, text: str, disable_pipeline: List, counter_function: Callable, statistic_type: str = 'all', workers: int = -1) -> StatisticsResults:
    """
    This method returns the mean and/or standard deviation of a descriptive metric.

    Parameters:
    text(str): The text to be analyzed.
    disable_pipeline(List): The pipeline elements to be disabled.
    counter_function(Callable): This callable calculates the values that are added to the counter array, which is used to compute the statistics. It receives a spaCy Doc and it should return a list or a number.
    statistic_type(str): Whether to calculate the mean and/or the standard deviation. It accepts 'mean', 'std' or 'all'.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    StatisticsResults: The mean and/or standard deviation of the current metric.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif statistic_type not in ['mean', 'std', 'all']:
        raise ValueError('\'statistic_type\' can only take \'mean\', \'std\' or \'all\'.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        self._nlp.get_pipe('feature counter').counter_function = counter_function
        counter = []

        for doc in self._nlp.pipe(paragraphs,
                                  batch_size=threads,
                                  disable=disable_pipeline,
                                  n_process=threads):
            current_result = doc._.feature_count  # Find the values to add to the counter

            if not isinstance(current_result, list):  # Add any number directly
                counter.append(current_result)
            elif len(current_result) > 0:  # Only extend the counter with non-empty lists
                counter.extend(current_result)

        stat_results = StatisticsResults()

        if statistic_type in ['std', 'all']:
            stat_results.std = statistics.pstdev(counter)

        if statistic_type in ['mean', 'all']:
            stat_results.mean = statistics.mean(counter)

        return stat_results
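# A usage sketch for the helper above: computing the mean and standard deviation
# of sentence length in words. The method name and the enabled pipes are
# illustrative assumptions, not part of the original code.
def get_mean_std_of_sentence_length(self, text: str, workers: int = -1) -> StatisticsResults:
    length_counter = lambda doc: [sum(1 for token in s if token.is_alpha)
                                  for s in doc.sents]
    disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                        if pipe not in ['sentencizer', 'feature counter']]
    return self._get_mean_std_of_metric(text,
                                        disable_pipeline=disable_pipeline,
                                        counter_function=length_counter,
                                        statistic_type='all',
                                        workers=workers)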
def get_paragraph_count_from_text(self, text: str) -> int:
    """
    This method counts how many paragraphs there are in a text.

    Parameters:
    text(str): The text to be analyzed.

    Returns:
    int: The amount of paragraphs in a text.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')

    return len(split_text_into_paragraphs(text))
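# 'split_text_into_paragraphs' is imported from a utility module and is used
# throughout this section; a plausible sketch, assuming paragraphs are
# separated by one or more blank lines:
def split_text_into_paragraphs(text: str) -> List[str]:
    return [p.strip() for p in re.split(r'\n\s*\n', text) if len(p.strip()) > 0]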
def get_mean_number_of_words_before_main_verb(self, text: str, workers: int = -1) -> float:
    '''
    This method calculates the mean number of words before the main verb of each sentence.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The mean amount of words before the main verb of the sentences.
    '''
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Find all paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        words_before_main_verb = []
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'parser', 'tagger', 'feature counter']]
        words_before_main_verb_counter = lambda doc: [amount_of_words_before_main_verb(s)
                                                      for s in split_doc_into_sentences(doc)]
        self._nlp.get_pipe('feature counter').counter_function = words_before_main_verb_counter

        for doc in self._nlp.pipe(paragraphs,
                                  batch_size=threads,
                                  disable=disable_pipeline,
                                  n_process=threads):  # Calculate with multiprocessing
            words_before_main_verb.extend(doc._.feature_count)

        return statistics.mean(words_before_main_verb)
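# 'amount_of_words_before_main_verb' is defined elsewhere; a minimal sketch,
# assuming the main verb is the token the dependency parser labels as the
# sentence's ROOT:
def amount_of_words_before_main_verb(sentence) -> int:
    words_before = 0
    for token in sentence:
        if token.dep_ == 'ROOT':
            break  # The main verb was reached
        if token.is_alpha:
            words_before += 1
    return words_before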
def get_mean_number_of_modifiers_per_noun_phrase(self, text: str, workers: int = -1) -> float:
    '''
    This method calculates the mean number of modifiers per noun phrase in a text.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The mean number of modifiers per noun phrase.
    '''
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Find all paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        modifiers_per_noun_phrase = []
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                            if pipe not in ['parser', 'tagger', 'noun phrase tagger', 'feature counter']]
        modifiers_counter = lambda doc: [sum(1 for token in nph if token.pos_ == 'ADJ')
                                         for nph in doc._.noun_phrases]
        self._nlp.get_pipe('feature counter').counter_function = modifiers_counter

        for doc in self._nlp.pipe(paragraphs,
                                  batch_size=threads,
                                  disable=disable_pipeline,
                                  n_process=threads):
            modifiers_per_noun_phrase.extend(doc._.feature_count)

        return statistics.mean(modifiers_per_noun_phrase)
def _get_connectives_incidence(self, text: str, disable_pipeline: List, count_connectives_function: Callable, word_count: int = None, workers: int = -1) -> float:
    """
    This method returns the incidence per {self._incidence} words for a given type of connective.

    Parameters:
    text(str): The text to be analyzed.
    disable_pipeline(List): The elements of the pipeline to be disabled.
    count_connectives_function(Callable): The function that counts a type of connective. It takes a spaCy Doc and returns an integer.
    word_count(int): The amount of words in the text.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The incidence of the given connectives per {self._incidence} words.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        wc = word_count if word_count is not None else self._di.get_word_count_from_text(text)
        self._nlp.get_pipe('feature counter').counter_function = count_connectives_function
        connectives = sum(doc._.feature_count
                          for doc in self._nlp.pipe(paragraphs,
                                                    batch_size=threads,
                                                    disable=disable_pipeline,
                                                    n_process=threads))

        return (connectives / wc) * self._incidence
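# A usage sketch for the helper above: computing the incidence of causal
# connectives from a word list. The method name, the connective list, and the
# enabled pipes are illustrative assumptions, not part of the original code.
def get_causal_connectives_incidence(self, text: str, workers: int = -1) -> float:
    causal_connectives = {'porque', 'pues', 'entonces'}  # Assumed single-word list
    count_causal = lambda doc: sum(1 for token in doc
                                   if token.text.lower() in causal_connectives)
    disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                        if pipe != 'feature counter']
    return self._get_connectives_incidence(text,
                                           disable_pipeline=disable_pipeline,
                                           count_connectives_function=count_causal,
                                           workers=workers)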
def get_sentence_count_from_text(self, text: str, workers: int = -1) -> int:
    """
    This method counts how many sentences a text has.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    int: The amount of sentences.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'feature counter']]
        sentence_counter = lambda doc: sum(1 for _ in doc.sents)
        self._nlp.get_pipe('feature counter').counter_function = sentence_counter
        sentences = sum(doc._.feature_count
                        for doc in self._nlp.pipe(paragraphs,
                                                  batch_size=threads,
                                                  disable=disable_pipeline,
                                                  n_process=threads))

        return sentences