Ejemplo n.º 1
0
def scrub(text: str, gutenberg: bool, lower: bool, punct: bool, apos: bool,
          hyphen: bool, amper: bool, digits: bool, tags: bool,
          white_space: bool, spaces: bool, tabs: bool, new_lines: bool,
          opt_uploads: Dict[str, FileStorage], storage_options: List[str],
          storage_folder: str, previewing: bool = False) -> str:
    """Scrubs the text according to the specifications chosen by the user.

    Gutenberg handling, special-character replacement, tag handling and the
    punctuation pre-processing are applied to the whole text first. The
    lowercase, consolidation, lemma, stop/keep-word and character-removal
    steps are then passed as a list of functions to
    general_functions.apply_function_exclude_tags(). Finally every run of
    whitespace in the result is collapsed into a single space.

    The stop/keep-word mode is read from request.form['sw_option'].

    :param text: A unicode string representing the whole text that is being
        manipulated.
    :param gutenberg: A boolean indicating whether the text is a Project
        Gutenberg file.
    :param lower: A boolean indicating whether or not the text is converted to
        lowercase.
    :param punct: A boolean indicating whether to remove punctuation from the
        text.
    :param apos: A boolean indicating whether to keep apostrophes in the text.
    :param hyphen: A boolean indicating whether to keep hyphens in the text.
    :param amper: A boolean indicating whether to keep ampersands in the text.
    :param digits: A boolean indicating whether to remove digits from the text.
    :param tags: A boolean indicating whether Scrub Tags has been checked.
    :param white_space: A boolean indicating whether white spaces should be
        removed.
    :param spaces: A boolean indicating whether spaces should be removed.
    :param tabs: A boolean indicating whether tabs should be removed.
    :param new_lines: A boolean indicating whether newlines should be removed.
    :param opt_uploads: A dictionary (specifically ImmutableMultiDict)
        containing the additional scrubbing option files that have been
        uploaded.
    :param storage_options: A list of strings representing additional options
        that have been chosen by the user.
    :param storage_folder: A string representing the path of the storage
        folder.
    :param previewing: A boolean indicating whether the user is previewing.
    :return: A string representing the text after all the scrubbing.
    """

    # The filenames are sorted, and the storage_number values used below
    # (0 = consolidations, 1 = lemmas, 2 = special characters,
    # 3 = stop/keep words) are passed alongside this list.
    storage_filenames = sorted(
        [constants.STOPWORD_FILENAME, constants.LEMMA_FILENAME,
         constants.CONSOLIDATION_FILENAME, constants.SPECIAL_CHAR_FILENAME])
    option_strings = prepare_additional_options(
        opt_uploads, storage_options, storage_folder, storage_filenames)

    # handle uploaded FILES: consolidations, lemmas, special characters,
    # stop-keep words
    cons_file_string = option_strings[0]
    lem_file_string = option_strings[1]
    sc_file_string = option_strings[2]
    sw_kw_file_string = option_strings[3]

    # handle manual entries: consolidations, lemmas, special characters,
    # stop-keep words
    cons_manual = option_strings[4]
    lem_manual = option_strings[5]
    sc_manual = option_strings[6]
    sw_kw_manual = option_strings[7]

    # Scrubbing order:
    #
    # Note:  lemmas and consolidations do NOT work on tags; in short,
    #        these manipulations do not change inside any tags
    #
    # 0. Gutenberg
    # 1. lower
    #    (not applied in tags ever;
    #    lemmas/consolidations/specialChars/stopKeepWords changed;
    #    text not changed at this point)
    # 2. special characters
    # 3. tags - scrub tags
    # 4. punctuation
    #    (hyphens, apostrophes, ampersands);
    #    text not changed at this point, not applied in tags ever
    # 5. digits (text not changed at this point, not applied in tags ever)
    # 6. white space (text not changed at this point, not applied in tags ever,
    #    otherwise tag attributes will be messed up)
    # 7. consolidations
    #    (text not changed at this point, not applied in tags ever)
    # 8. lemmatize (text not changed at this point, not applied in tags ever)
    # 9. stop words/keep words
    #    (text not changed at this point, not applied in tags ever)
    #
    # apply:
    # 0. remove Gutenberg boiler plate (if any)
    # 1. lowercase
    # 2. consolidation
    # 3. lemmatize
    # 4. stop words
    # 5. remove punctuation, digits, and whitespace without changing all the
    # content in the tag
    #

    # -- 0. Gutenberg --------------------------------------------------------

    # gutenberg is True if LexosFile finds the (case-sensitive) string:
    #     "\*\*\* START OF THIS PROJECT GUTENBERG" + <Some title> + "\*\*\*"
    if gutenberg:
        text = handle_gutenberg(text)

    # -- 1. lower ------------------------------------------------------------
    if lower:  # user wants to ignore case
        def to_lower_function(orig_text: str) -> str:
            """Removes capital letters from a text.

            :param orig_text: A mixed-case string.
            :return: The text with all caps converted to lowercase.
            """

            return orig_text.lower()

        # since lower is ON, apply lowercase to other options
        # apply to contents of any uploaded files
        cons_file_string = cons_file_string.lower()
        lem_file_string = lem_file_string.lower()
        sc_file_string = sc_file_string.lower()
        sw_kw_file_string = sw_kw_file_string.lower()

        # apply to contents manually entered
        cons_manual = cons_manual.lower()
        lem_manual = lem_manual.lower()
        sc_manual = sc_manual.lower()
        sw_kw_manual = sw_kw_manual.lower()

    else:
        def to_lower_function(orig_text: str) -> str:
            """Returns the string it is passed.

            :param orig_text: A text string.
            :return: orig_text, unchanged.
            """

            return orig_text

    # -- 2. special characters -----------------------------------------------
    merged_string = handle_file_and_manual_strings(
        file_string=sc_file_string, manual_string=sc_manual,
        storage_folder=storage_folder, storage_filenames=storage_filenames,
        storage_number=2)

    # "\n" comes from "" + "\n" + "", i.e. neither a file nor a manual entry
    # supplied any special-character rules
    if merged_string == "\n":
        text = handle_special_characters(text)
    else:
        text = replacement_handler(
            text=text, replacer_string=merged_string, is_lemma=False)

    # -- 3. tags (if Remove Tags is checked)----------------------------------
    if tags:  # If remove tags is checked:
        text = handle_tags(text)

    # -- 4. punctuation (hyphens, apostrophes, ampersands) -------------------
    if punct:
        # get_remove_punctuation_map alters the text (both for apos and
        # hyphens), thus the updated text must be returned as well
        text, remove_punctuation_map = get_remove_punctuation_map(
            text, apos, hyphen, amper, previewing)
    else:
        remove_punctuation_map = {}

    # -- 5. digits -----------------------------------------------------------
    if digits:
        remove_digits_map = get_remove_digits_map()
    else:
        remove_digits_map = {}

    # -- 6. whitespace ------------------------------------------------------

    if white_space:
        remove_whitespace_map = get_remove_whitespace_map(
            spaces, tabs, new_lines)
    else:
        remove_whitespace_map = {}

    # -- create total removal function -----------------------------
    # merge all the removal maps into one translation table
    total_removal_map = remove_punctuation_map.copy()
    total_removal_map.update(remove_digits_map)
    total_removal_map.update(remove_whitespace_map)

    # create a remove function
    def total_removal_function(orig_text: str) -> str:
        """Removes the characters specified by total_removal_map.

        :param orig_text: A text string.
        :return: The text string, with removal characters deleted.
        """

        return orig_text.translate(total_removal_map)

    # -- 7. consolidations ---------------------------------------------------
    def consolidation_function(orig_text: str) -> str:
        """Replaces characters according to user input strings.

        :param orig_text: A text string.
        :return: The text with characters swapped according to cons_file_string
            and cons_manual.
        """

        replacer_string = handle_file_and_manual_strings(
            file_string=cons_file_string, manual_string=cons_manual,
            storage_folder=storage_folder, storage_filenames=storage_filenames,
            storage_number=0)
        return replacement_handler(
            text=orig_text, replacer_string=replacer_string, is_lemma=False)

    # -- 8. lemmatize --------------------------------------------------------
    def lemmatize_function(orig_text: str) -> str:
        """Replaces words according to user input strings.

        :param orig_text: A text string.
        :return: The text with words swapped according to lem_file_string and
            lem_manual.
        """

        replacer_string = handle_file_and_manual_strings(
            file_string=lem_file_string, manual_string=lem_manual,
            storage_folder=storage_folder, storage_filenames=storage_filenames,
            storage_number=1)
        return replacement_handler(
            text=orig_text, replacer_string=replacer_string, is_lemma=True)

    # -- 9. stop words/keep words --------------------------------------------
    def stop_keep_words_function(orig_text: str) -> str:
        """Deletes certain words according to user input strings.

        :param orig_text: A text string.
        :return: If "stop" was chosen, returns the text with all words in
            sw_kw_file_string and sw_kw_manual deleted. If "keep" was chosen,
            returns the text with all words not in sw_kw_file_string and
            sw_kw_manual deleted.
        """

        file_and_manual = handle_file_and_manual_strings(
            file_string=sw_kw_file_string, manual_string=sw_kw_manual,
            storage_folder=storage_folder, storage_filenames=storage_filenames,
            storage_number=3)

        # if file_and_manual does not contain words there is no issue calling
        # remove_stopwords()
        if request.form['sw_option'] == "stop":
            return remove_stopwords(
                text=orig_text, removal_string=file_and_manual)

        # but all the text would be deleted if we called keep_words()
        # "\n" comes from "" + "\n" + ""
        elif request.form['sw_option'] == "keep" and file_and_manual != "\n":
            return keep_words(
                text=orig_text, non_removal_string=file_and_manual)

        else:
            return orig_text

    # apply all the functions and exclude tag
    text = general_functions.apply_function_exclude_tags(
        input_string=text, functions=[to_lower_function,
                                      consolidation_function,
                                      lemmatize_function,
                                      stop_keep_words_function,
                                      total_removal_function])

    # Collapse every run of whitespace into one space. The flags must be
    # passed by keyword: the fourth positional parameter of re.sub() is
    # `count`, so passing them positionally would cap the number of
    # replacements instead of setting any flag.
    finished_text = re.sub(r"[\s]+", " ", text,
                           flags=re.UNICODE | re.MULTILINE)

    return finished_text
Ejemplo n.º 2
0
 def test_empty_string(self):
     """Checks that an empty input string yields an empty string."""
     # Same two single-function lists as before, checked in the same order.
     for func in (self.dummy_function, str.upper):
         assert apply_function_exclude_tags("", [func]) == ''
Ejemplo n.º 3
0
 def test_tags_only(self):
     """Checks that input made only of tags is returned unchanged."""
     only_tags = "<tag></tag>"
     for func in (self.dummy_function, str.upper):
         assert apply_function_exclude_tags(only_tags, [func]) == only_tags
Ejemplo n.º 4
0
 def test_two_functions(self):
     """Checks the result of passing str.upper and then dummy_function."""
     pipeline = [str.upper, self.dummy_function]
     result = apply_function_exclude_tags("<tag>asdf</tag>", pipeline)
     assert result == '<tag>ASDFASDF</tag>'
Ejemplo n.º 5
0
 def test_multiple_functions(self):
     """Checks the result of passing three functions in one list."""
     pipeline = [str.upper, str.lower, self.dummy_function]
     outcome = apply_function_exclude_tags('<tag>asdf</tag>', pipeline)
     assert outcome == '<tag>asdfasdf</tag>'
Ejemplo n.º 6
0
def scrub(text: str,
          gutenberg: bool,
          lower: bool,
          punct: bool,
          apos: bool,
          hyphen: bool,
          amper: bool,
          digits: bool,
          tags: bool,
          spaces: bool,
          tabs: bool,
          new_lines: bool,
          opt_uploads: Dict[str, FileStorage],
          storage_options: List[str],
          storage_folder: str,
          previewing: bool = False) -> str:
    """Scrubs the text according to the specifications chosen by the user.

    Gutenberg handling, special-character replacement, tag handling and the
    punctuation pre-processing are applied to the whole text first. The
    remaining steps (lowercase, consolidations, lemmas, digit removal,
    punctuation/whitespace removal, stop/keep words) are collected in a list
    of functions that is passed to
    general_functions.apply_function_exclude_tags() when ``tags`` is set and
    to general_functions.apply_function_no_tags() otherwise.

    Besides its arguments, the function reads these form fields from
    ``request.form``: 'special_characters_preset', 'special_characters' and
    'stop_words_method'.

    :param text: A unicode string representing the whole text that is being
        manipulated.
    :param gutenberg: A boolean indicating whether the text is a Project
        Gutenberg file.
    :param lower: A boolean indicating whether or not the text is converted to
        lowercase.
    :param punct: A boolean indicating whether to remove punctuation from the
        text.
    :param apos: A boolean indicating whether to keep apostrophes in the text.
    :param hyphen: A boolean indicating whether to keep hyphens in the text.
    :param amper: A boolean indicating whether to keep ampersands in the text.
    :param digits: A boolean indicating whether to remove digits from the text.
    :param tags: A boolean indicating whether Scrub Tags has been checked.
    :param spaces: A boolean indicating whether spaces should be removed.
    :param tabs: A boolean indicating whether tabs should be removed.
    :param new_lines: A boolean indicating whether newlines should be removed.
    :param opt_uploads: A dictionary (specifically ImmutableMultiDict)
        containing the additional scrubbing option files that have been
        uploaded.
    :param storage_options: A list of strings representing additional options
        that have been chosen by the user.
    :param storage_folder: A string representing the path of the storage
        folder.
    :param previewing: A boolean indicating whether the user is previewing.
    :return: A string representing the text after all the scrubbing.
    """

    # The filenames are sorted, and the storage_number values used below
    # (0 = consolidations, 1 = lemmas, 2 = special characters,
    # 3 = stop/keep words) are passed alongside this list.
    storage_filenames = sorted([
        constants.STOPWORD_FILENAME, constants.LEMMA_FILENAME,
        constants.CONSOLIDATION_FILENAME, constants.SPECIAL_CHAR_FILENAME
    ])
    option_strings = prepare_additional_options(opt_uploads, storage_options,
                                                storage_folder,
                                                storage_filenames)

    # handle uploaded FILES: consolidations, lemmas, special characters,
    # stop-keep words
    cons_file_string = option_strings[0]
    lem_file_string = option_strings[1]
    sc_file_string = option_strings[2]
    sw_kw_file_string = option_strings[3]

    # handle manual entries: consolidations, lemmas, special characters,
    # stop-keep words
    cons_manual = option_strings[4]
    lem_manual = option_strings[5]
    sc_manual = option_strings[6]
    sw_kw_manual = option_strings[7]

    # Scrubbing order:
    #
    # Note:  lemmas and consolidations do NOT work on tags; in short,
    #        these manipulations do not change inside any tags
    #
    # 0. Gutenberg
    # 1. lower
    #    (not applied in tags ever;
    #    lemmas/consolidations/specialChars/stopKeepWords changed;
    #    text not changed at this point)
    # 2. special characters
    # 3. tags - scrub tags
    # 4. punctuation
    #    (hyphens, apostrophes, ampersands);
    #    text not changed at this point, not applied in tags ever
    # 5. digits (text not changed at this point, not applied in tags ever)
    # 6. white space (text not changed at this point, not applied in tags ever,
    #    otherwise tag attributes will be messed up)
    # 7. consolidations
    #    (text not changed at this point, not applied in tags ever)
    # 8. lemmatize (text not changed at this point, not applied in tags ever)
    # 9. stop words/keep words
    #    (text not changed at this point, not applied in tags ever)
    #
    # apply:
    # 0. remove Gutenberg boiler plate (if any)
    # 1. lowercase
    # 2. consolidation
    # 3. lemmatize
    # 4. stop words
    # 5. remove punctuation, digits, and whitespace without changing all the
    # content in the tag
    #

    # -- 0. Gutenberg --------------------------------------------------------

    # gutenberg is True if LexosFile finds the (case-sensitive) string:
    #     "\*\*\* START OF THIS PROJECT GUTENBERG" + <Some title> + "\*\*\*"
    if gutenberg:
        text = handle_gutenberg(text)

    # -- 1. lower ------------------------------------------------------------
    if lower:  # user wants to ignore case

        def to_lower_function(orig_text: str) -> str:
            """Removes capital letters from a text.

            :param orig_text: A mixed-case string.
            :return: The text with all caps converted to lowercase.
            """

            return orig_text.lower()

        # since lower is ON, apply lowercase to other options
        # apply to contents of any uploaded files
        cons_file_string = cons_file_string.lower()
        lem_file_string = lem_file_string.lower()
        sc_file_string = sc_file_string.lower()
        sw_kw_file_string = sw_kw_file_string.lower()

        # apply to contents manually entered
        cons_manual = cons_manual.lower()
        lem_manual = lem_manual.lower()
        sc_manual = sc_manual.lower()
        sw_kw_manual = sw_kw_manual.lower()

    else:

        def to_lower_function(orig_text: str) -> str:
            """Returns the string it is passed.

            :param orig_text: A text string.
            :return: orig_text, unchanged.
            """

            return orig_text

    # -- 2. special characters -----------------------------------------------
    merged_string = handle_file_and_manual_strings(
        file_string=sc_file_string,
        manual_string=sc_manual,
        storage_folder=storage_folder,
        storage_filenames=storage_filenames,
        storage_number=2)

    # Get form values
    charset = request.form['special_characters_preset']
    special_characters = request.form['special_characters']

    # Determine if text is to be html escaped. The flag is always bound so
    # that the replacement_handler() call below cannot hit an unassigned
    # local when a charset other than 'HTML' is selected.
    escape_html = charset == 'HTML'

    # "\n" comes from "" + "\n" + "", i.e. neither a file nor a manual entry
    # supplied any special-character rules
    if merged_string == "\n":
        text = handle_special_characters(text, charset, special_characters)
    else:
        text = replacement_handler(text=text,
                                   replacer_string=merged_string,
                                   is_lemma=False,
                                   escape_html=escape_html)

    # -- 3. tags (if Remove Tags is checked)----------------------------------
    if tags:  # If remove tags is checked:
        text = handle_tags(text)

    # -- 4. punctuation (hyphens, apostrophes, ampersands) -------------------
    if punct:
        # get_remove_punctuation_map alters the text (both for apos and
        # hyphens), thus the updated text must be returned as well
        text, remove_punctuation_map = get_remove_punctuation_map(
            text, apos, hyphen, amper, previewing)
    else:
        remove_punctuation_map = {}

    # -- 5. digits -----------------------------------------------------------
    # Nothing to prepare here: when `digits` is set, get_remove_digits is
    # added to the list of functions applied at the end.

    # -- 6. whitespace ------------------------------------------------------

    if spaces or tabs or new_lines:
        remove_whitespace_map = get_remove_whitespace_map(
            spaces, tabs, new_lines)
    else:
        remove_whitespace_map = {}

    # -- create total removal function -----------------------------
    # merge all the removal maps into one translation table
    total_removal_map = remove_punctuation_map.copy()
    total_removal_map.update(remove_whitespace_map)

    # create a remove function
    def total_removal_function(orig_text: str) -> str:
        """Removes the characters specified by total_removal_map.

        :param orig_text: A text string.
        :return: The text string, with removal characters deleted.
        """
        return orig_text.translate(total_removal_map)

    # -- 7. consolidations ---------------------------------------------------
    def consolidation_function(orig_text: str) -> str:
        """Replaces characters according to user input strings.

        :param orig_text: A text string.
        :return: The text with characters swapped according to cons_file_string
            and cons_manual.
        """

        replacer_string = handle_file_and_manual_strings(
            file_string=cons_file_string,
            manual_string=cons_manual,
            storage_folder=storage_folder,
            storage_filenames=storage_filenames,
            storage_number=0)
        return replacement_handler(text=orig_text,
                                   replacer_string=replacer_string,
                                   is_lemma=False)

    # -- 8. lemmatize --------------------------------------------------------
    def lemmatize_function(orig_text: str) -> str:
        """Replaces words according to user input strings.

        :param orig_text: A text string.
        :return: The text with words swapped according to lem_file_string and
            lem_manual.
        """

        replacer_string = handle_file_and_manual_strings(
            file_string=lem_file_string,
            manual_string=lem_manual,
            storage_folder=storage_folder,
            storage_filenames=storage_filenames,
            storage_number=1)
        return replacement_handler(text=orig_text,
                                   replacer_string=replacer_string,
                                   is_lemma=True)

    # -- 9. stop words/keep words --------------------------------------------
    def stop_keep_words_function(orig_text: str) -> str:
        """Deletes certain words according to user input strings.

        :param orig_text: A text string.
        :return: If "Stop" was chosen, returns the text with all words in
            sw_kw_file_string and sw_kw_manual deleted. If "Keep" was chosen,
            returns the text with all words not in sw_kw_file_string and
            sw_kw_manual deleted.
        """

        file_and_manual = handle_file_and_manual_strings(
            file_string=sw_kw_file_string,
            manual_string=sw_kw_manual,
            storage_folder=storage_folder,
            storage_filenames=storage_filenames,
            storage_number=3)

        # if file_and_manual does not contain words there is no issue calling
        # remove_stopwords()
        if request.form['stop_words_method'] == "Stop":
            return remove_stopwords(text=orig_text,
                                    removal_string=file_and_manual)

        # but all the text would be deleted if we called keep_words()
        # "\n" comes from "" + "\n" + ""
        elif request.form['stop_words_method'] == "Keep" \
                and file_and_manual != "\n":
            return keep_words(text=orig_text,
                              non_removal_string=file_and_manual)

        else:
            return orig_text

    # apply all the functions and exclude tag
    functions = [
        to_lower_function, consolidation_function, lemmatize_function,
        total_removal_function, stop_keep_words_function
    ]
    # NOTE(review): to_lower_function is already the first entry, so this
    # insert adds it a second time when `lower` is set. That also shifts where
    # insert(3, ...) below places get_remove_digits: before
    # lemmatize_function when `lower` is set, after it otherwise. Left
    # unchanged to preserve existing behaviour - confirm the intended order.
    if lower:
        functions.insert(0, to_lower_function)

    if digits:
        functions.insert(3, get_remove_digits)

    if tags:
        text = general_functions.apply_function_exclude_tags(
            input_string=text, functions=functions)
    else:
        text = general_functions.apply_function_no_tags(input_string=text,
                                                        functions=functions)

    return text
Ejemplo n.º 7
0
 def test_one_function(self):
     """Checks the result of applying a single function to tagged text."""
     tagged = "<tag>asdf</tag>"

     doubled = apply_function_exclude_tags(tagged, [self.dummy_function])
     assert doubled == '<tag>asdfasdf</tag>'

     uppercased = apply_function_exclude_tags(tagged, [str.upper])
     assert uppercased == '<tag>ASDF</tag>'
Ejemplo n.º 8
0
 def test_one_function(self):
     """Checks each single-function list against its expected output."""
     source = "<tag>asdf</tag>"
     # Pairs of (function to apply, expected result), checked in order.
     cases = (
         (self.dummy_function, '<tag>asdfasdf</tag>'),
         (str.upper, '<tag>ASDF</tag>'),
     )
     for func, expected in cases:
         assert apply_function_exclude_tags(source, [func]) == expected
Ejemplo n.º 9
0
 def test_tags_only(self):
     """Checks that a string holding only tags comes back as it went in."""
     markup = "<tag></tag>"

     via_dummy = apply_function_exclude_tags(markup, [self.dummy_function])
     assert via_dummy == '<tag></tag>'

     via_upper = apply_function_exclude_tags(markup, [str.upper])
     assert via_upper == '<tag></tag>'
Ejemplo n.º 10
0
 def test_empty_string(self):
     """Checks that the empty string is returned for empty input."""
     blank = ""

     via_dummy = apply_function_exclude_tags(blank, [self.dummy_function])
     assert via_dummy == ''

     via_upper = apply_function_exclude_tags(blank, [str.upper])
     assert via_upper == ''
Ejemplo n.º 11
0
 def test_multiple_functions(self):
     """Checks the output when three functions are supplied together."""
     steps = [str.upper, str.lower, self.dummy_function]
     expected = '<tag>asdfasdf</tag>'
     assert apply_function_exclude_tags('<tag>asdf</tag>', steps) == expected
Ejemplo n.º 12
0
 def test_two_functions(self):
     """Checks the output when str.upper and dummy_function are supplied."""
     source = "<tag>asdf</tag>"
     expected = '<tag>ASDFASDF</tag>'
     steps = [str.upper, self.dummy_function]
     assert apply_function_exclude_tags(source, steps) == expected