def _get_syntactic_pattern_density(self, text: str, disable_pipeline: List, sp_counter_function: Callable = None, word_count: int = None, workers: int = -1) -> float:
    '''
    This method obtains the incidence of a syntactic pattern that exists in a text per {self._incidence} words.

    Parameters:
    text(str): The text to be analyzed.
    disable_pipeline(List): The pipeline elements to be disabled.
    sp_counter_function(Callable): The function that counts a syntactic pattern for a spaCy document. It returns an integer.
    word_count(int): The amount of words in the text.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The incidence of a syntactic pattern per {self._incidence} words.
    '''
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Find all paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        wc = word_count if word_count is not None else self._di.get_word_count_from_text(text)
        self._nlp.get_pipe('feature counter').counter_function = sp_counter_function
        density = sum(doc._.feature_count
                      for doc in self._nlp.pipe(paragraphs,
                                                batch_size=threads,
                                                disable=disable_pipeline,
                                                n_process=threads))  # Calculate with multiprocessing

        return (density / wc) * self._incidence
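# A minimal usage sketch for the helper above: a hypothetical public metric that
# measures the density of negation words. The method name 'get_negation_density'
# and the word list are illustrative assumptions, not part of the original code.
def get_negation_density(self, text: str, workers: int = -1) -> float:
    negation_words = {'no', 'nunca', 'jamás', 'tampoco'}  # Assumed lexical list
    negation_counter = lambda doc: sum(1 for token in doc
                                       if token.text.lower() in negation_words)
    # Only the custom counter component is needed for a purely lexical count
    disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                        if pipe != 'feature counter']
    return self._get_syntactic_pattern_density(text,
                                               disable_pipeline=disable_pipeline,
                                               sp_counter_function=negation_counter,
                                               workers=workers)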
async def convert_pdf_to_txt(pdf_path: str, save_dir: str) -> None:
    """
    This function converts a pdf file to a txt file and cleans the text.

    Parameters:
    pdf_path(str): The path where the pdf to convert is located.
    save_dir(str): The path where to save the converted pdf.

    Returns:
    None
    """
    if not hasattr(convert_pdf_to_txt, 'nlp'):  # Load the spaCy model only once
        convert_pdf_to_txt.nlp = spacy.load(ACCEPTED_LANGUAGES['es'])
        convert_pdf_to_txt.nlp.add_pipe(convert_pdf_to_txt.nlp.create_pipe('sentencizer'))

    tika.initVM()
    pdf_file = parser.from_file(pdf_path)

    async with AIOFile(save_dir, 'w') as text_file:
        doc = convert_pdf_to_txt.nlp(pdf_file['content'])
        # Fix sentences that have more newlines than they should
        text = ''.join(re.sub(r'[,;]\n+\b', '\n', re.sub(r'\b\n+\b', '\n', s.text))
                       for s in doc.sents)
        paragraphs = split_text_into_paragraphs(text)  # Eliminate extra newlines between paragraphs
        new_text = '\n\n'.join(paragraphs)
        new_text = re.sub(r'-\s*\n+', '', new_text)  # Join words that were hyphenated across a line break
        await text_file.write(new_text)
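# A usage sketch for the coroutine above, run from a synchronous entry point.
# The file paths are hypothetical; Tika starts (or connects to) its Java server
# on the first call, so the initial conversion can take a few seconds.
import asyncio

asyncio.run(convert_pdf_to_txt('article.pdf', 'article.txt'))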
def get_type_token_ratio_of_content_words(self, text: str, workers: int = -1) -> float:
    """
    This method returns the type-token ratio of the content words of a text. Content words are nouns, verbs, adjectives and adverbs.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The type-token ratio of the content words of a text.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names if pipe != 'tagger']
        tokens = [token.text.lower()
                  for doc in self._nlp.pipe(paragraphs,
                                            batch_size=threads,
                                            disable=disable_pipeline,
                                            n_process=threads)
                  for token in doc
                  if is_content_word(token)]

        return 0 if len(tokens) == 0 else len(set(tokens)) / len(tokens)
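# 'is_content_word' is imported from a utility module and is not shown in this
# section; a minimal sketch consistent with the docstring above (content words
# are nouns, verbs, adjectives and adverbs), using spaCy's universal POS tags:
def is_content_word(token) -> bool:
    return token.is_alpha and token.pos_ in ('NOUN', 'VERB', 'ADJ', 'ADV')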
def _get_mean_std_of_metric(self, text: str, disable_pipeline: List, counter_function: Callable, statistic_type: str = 'all', workers: int = -1) -> StatisticsResults:
    """
    This method returns the mean and/or standard deviation of a descriptive metric.

    Parameters:
    text(str): The text to be analyzed.
    disable_pipeline(List): The pipeline elements to be disabled.
    counter_function(Callable): This callable calculates the values that are added to the counter array, which is used to compute the statistics. It receives a spaCy Doc and it should return a list or a number.
    statistic_type(str): Whether to calculate the mean and/or the standard deviation. It accepts 'mean', 'std' or 'all'.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    StatisticsResults: The mean and/or standard deviation of the current metric.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif statistic_type not in ['mean', 'std', 'all']:
        raise ValueError('\'statistic_type\' can only take \'mean\', \'std\' or \'all\'.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        self._nlp.get_pipe('feature counter').counter_function = counter_function
        counter = []

        for doc in self._nlp.pipe(paragraphs,
                                  batch_size=threads,
                                  disable=disable_pipeline,
                                  n_process=threads):
            current_result = doc._.feature_count  # Find the values to add to the counter

            if not isinstance(current_result, list):  # Add any number directly
                counter.append(current_result)
            elif len(current_result) > 0:  # Only extend the counter with non-empty lists
                counter.extend(current_result)

        stat_results = StatisticsResults()

        if statistic_type in ['std', 'all']:
            stat_results.std = statistics.pstdev(counter)

        if statistic_type in ['mean', 'all']:
            stat_results.mean = statistics.mean(counter)

        return stat_results
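# A usage sketch for the helper above: computing the mean and standard deviation
# of sentence length in words. The method name and the enabled pipes are
# illustrative assumptions, not part of the original code.
def get_mean_std_of_sentence_length(self, text: str, workers: int = -1) -> StatisticsResults:
    length_counter = lambda doc: [sum(1 for token in s if token.is_alpha)
                                  for s in doc.sents]
    disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                        if pipe not in ['sentencizer', 'feature counter']]
    return self._get_mean_std_of_metric(text,
                                        disable_pipeline=disable_pipeline,
                                        counter_function=length_counter,
                                        statistic_type='all',
                                        workers=workers)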
def get_paragraph_count_from_text(self, text: str) -> int:
    """
    This method counts how many paragraphs there are in a text.

    Parameters:
    text(str): The text to be analyzed.

    Returns:
    int: The amount of paragraphs in a text.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')

    return len(split_text_into_paragraphs(text))
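# 'split_text_into_paragraphs' is imported from a utility module and is used
# throughout this section; a plausible sketch, assuming paragraphs are
# separated by one or more blank lines:
def split_text_into_paragraphs(text: str) -> List[str]:
    return [p.strip() for p in re.split(r'\n\s*\n', text) if len(p.strip()) > 0]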
def get_mean_number_of_words_before_main_verb(self, text: str, workers: int = -1) -> float:
    '''
    This method calculates the mean number of words before the main verb of each sentence.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The mean amount of words before the main verb of the sentences.
    '''
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Find all paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        words_before_main_verb = []
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'parser', 'tagger', 'feature counter']]
        words_before_main_verb_counter = lambda doc: [amount_of_words_before_main_verb(s)
                                                      for s in split_doc_into_sentences(doc)]
        self._nlp.get_pipe('feature counter').counter_function = words_before_main_verb_counter

        for doc in self._nlp.pipe(paragraphs,
                                  batch_size=threads,
                                  disable=disable_pipeline,
                                  n_process=threads):  # Calculate with multiprocessing
            words_before_main_verb.extend(doc._.feature_count)

        return statistics.mean(words_before_main_verb)
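# 'amount_of_words_before_main_verb' is defined elsewhere; a minimal sketch,
# assuming the main verb is the token the dependency parser labels as the
# sentence's ROOT:
def amount_of_words_before_main_verb(sentence) -> int:
    words_before = 0
    for token in sentence:
        if token.dep_ == 'ROOT':
            break  # The main verb was reached
        if token.is_alpha:
            words_before += 1
    return words_before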
def get_mean_number_of_modifiers_per_noun_phrase(self, text: str, workers: int = -1) -> float:
    '''
    This method calculates the mean number of modifiers per noun phrase in a text.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The mean number of modifiers per noun phrase.
    '''
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Find all paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        modifiers_per_noun_phrase = []
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                            if pipe not in ['parser', 'tagger', 'noun phrase tagger', 'feature counter']]
        modifiers_counter = lambda doc: [sum(1 for token in nph if token.pos_ == 'ADJ')
                                         for nph in doc._.noun_phrases]
        self._nlp.get_pipe('feature counter').counter_function = modifiers_counter

        for doc in self._nlp.pipe(paragraphs,
                                  batch_size=threads,
                                  disable=disable_pipeline,
                                  n_process=threads):
            modifiers_per_noun_phrase.extend(doc._.feature_count)

        return statistics.mean(modifiers_per_noun_phrase)
def _get_connectives_incidence(self, text: str, disable_pipeline: List, count_connectives_function: Callable, word_count: int = None, workers: int = -1) -> float:
    """
    This method returns the incidence per {self._incidence} words for a given type of connective.

    Parameters:
    text(str): The text to be analyzed.
    disable_pipeline(List): The elements of the pipeline to be disabled.
    count_connectives_function(Callable): The function that counts a type of connective. It takes a spaCy Doc and returns an integer.
    word_count(int): The amount of words in the text.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    float: The incidence of the given connectives per {self._incidence} words.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        wc = word_count if word_count is not None else self._di.get_word_count_from_text(text)
        self._nlp.get_pipe('feature counter').counter_function = count_connectives_function
        connectives = sum(doc._.feature_count
                          for doc in self._nlp.pipe(paragraphs,
                                                    batch_size=threads,
                                                    disable=disable_pipeline,
                                                    n_process=threads))

        return (connectives / wc) * self._incidence
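# A usage sketch for the helper above: computing the incidence of causal
# connectives from a word list. The method name, the connective list, and the
# enabled pipes are illustrative assumptions, not part of the original code.
def get_causal_connectives_incidence(self, text: str, workers: int = -1) -> float:
    causal_connectives = {'porque', 'pues', 'entonces'}  # Assumed single-word list
    count_causal = lambda doc: sum(1 for token in doc
                                   if token.text.lower() in causal_connectives)
    disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                        if pipe != 'feature counter']
    return self._get_connectives_incidence(text,
                                           disable_pipeline=disable_pipeline,
                                           count_connectives_function=count_causal,
                                           workers=workers)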
def get_sentence_count_from_text(self, text: str, workers: int = -1) -> int:
    """
    This method counts how many sentences a text has.

    Parameters:
    text(str): The text to be analyzed.
    workers(int): Amount of threads that will complete this operation. If it's -1 then all cpu cores will be used.

    Returns:
    int: The amount of sentences.
    """
    if len(text) == 0:
        raise ValueError('The text is empty.')
    elif workers == 0 or workers < -1:
        raise ValueError('Workers must be -1 or a positive integer.')
    else:
        paragraphs = split_text_into_paragraphs(text)  # Obtain paragraphs
        threads = multiprocessing.cpu_count() if workers == -1 else workers
        disable_pipeline = [pipe for pipe in self._nlp.pipe_names
                            if pipe not in ['sentencizer', 'feature counter']]
        sentence_counter = lambda doc: sum(1 for _ in doc.sents)
        self._nlp.get_pipe('feature counter').counter_function = sentence_counter
        sentences = sum(doc._.feature_count
                        for doc in self._nlp.pipe(paragraphs,
                                                  batch_size=threads,
                                                  disable=disable_pipeline,
                                                  n_process=threads))

        return sentences