Beispiel #1
0
    def add_upload_file(self, raw_file_string: bytes, file_name: str):
        """Decode an uploaded file, normalize its newlines and store it.

        The raw bytes are turned into text by
        ``general_functions.decode_bytes``. Every line ending in that text
        is then converted to a single line feed (LF) and the result is
        handed to ``self.add_file``.

        :param raw_file_string: the raw bytes of the uploaded file
        :param file_name: name of the file
        """

        decoded_file_string = general_functions.decode_bytes(
            raw_bytes=raw_file_string)

        # Line encodings:
        # \n      Unix, OS X
        # \r      Mac OS 9
        # \r\n    Win. CR+LF
        # The following statement converts everything to '\n'.
        #
        # CR+LF is collapsed as a pair first, then any remaining lone CR is
        # converted. Deleting every '\r' once a CR+LF was seen would also
        # delete lone-CR line breaks in files with mixed endings (merging
        # the neighbouring lines), and probing only a prefix of the text can
        # miss or mis-classify line endings, so the whole text is always
        # normalized.
        decoded_file_string = decoded_file_string.replace(
            '\r\n', '\n').replace('\r', '\n')

        # Add the file to the FileManager; file_name is passed for both of
        # add_file's first two arguments.
        self.add_file(file_name, file_name, decoded_file_string)
Beispiel #2
0
    def add_upload_file(self, raw_file_string: bytes, file_name: str):
        """Decode an uploaded file, normalize its newlines and store it.

        The raw bytes are turned into text by
        ``general_functions.decode_bytes``. Every line ending in that text
        is then converted to a single line feed (LF) and the result is
        handed to ``self.add_file``.

        :param raw_file_string: the raw bytes of the uploaded file
        :param file_name: name of the file
        """

        decoded_file_string = general_functions.decode_bytes(
            raw_bytes=raw_file_string)

        # Line encodings:
        # \n      Unix, OS X
        # \r      Mac OS 9
        # \r\n    Win. CR+LF
        # The following statement converts everything to '\n'.
        #
        # CR+LF is collapsed as a pair first, then any remaining lone CR is
        # converted. Deleting every '\r' once a CR+LF was seen would also
        # delete lone-CR line breaks in files with mixed endings (merging
        # the neighbouring lines), and probing only a prefix of the text can
        # miss or mis-classify line endings, so the whole text is always
        # normalized.
        decoded_file_string = decoded_file_string.replace(
            '\r\n', '\n').replace('\r', '\n')

        # Add the file to the FileManager; file_name is passed for both of
        # add_file's first two arguments.
        self.add_file(file_name, file_name, decoded_file_string)
Beispiel #3
0
def prepare_additional_options(opt_uploads: Dict[str, FileStorage],
                               storage_options: List[str], storage_folder: str,
                               storage_filenames: List[str]) -> List[str]:
    """Collect every string used by the "Additional Options" scrub section.

    :param opt_uploads: A dictionary (specifically ImmutableMultiDict)
        holding the additional scrubbing option files that were uploaded.
    :param storage_options: A list of strings naming the additional options
        chosen by the user.
    :param storage_folder: A string giving the path of the storage folder.
    :param storage_filenames: A list of filename strings, indexed by the
        position of each upload key in sorted order.
    :return: A list with the four option-file strings followed by the four
        manual option strings taken from the submitted form.
    """

    upload_fields = ('consolidations_file[]', 'lemmas_file[]',
                     'special_characters_file[]', 'stop_words_file[]')
    manual_fields = ('consolidations', 'lemmas', 'special_characters',
                     'stop_words')

    # Every known field starts out as an empty string.
    option_text = dict.fromkeys(upload_fields + manual_fields, '')

    for position, field in enumerate(sorted(opt_uploads)):
        upload = opt_uploads[field]

        if upload.filename:
            # A file came with this field: decode its bytes, then rewind the
            # stream to its start.
            raw_content = upload.read()
            option_text[field] = general_functions.decode_bytes(raw_content)
            upload.seek(0)
            continue

        if field.strip('[]') in storage_options:
            # No new file, but the option is among the stored ones: load its
            # text through load_scrub_optional_upload.
            option_text[field] = load_scrub_optional_upload(
                storage_folder, storage_filenames[position])
            continue

        # Neither uploaded nor stored: clear the session entry as well.
        session['scrubbingoptions']['file_uploads'][field] = ''
        option_text[field] = ""

    # Option strings in order: consolidations, lemmas, special characters and
    # stop words from files, then the same four from the form fields.
    from_files = [option_text.get(field) for field in upload_fields]
    from_form = [request.form[field] for field in manual_fields]

    return from_files + from_form
Beispiel #4
0
def prepare_additional_options(opt_uploads: Dict[str, FileStorage],
                               storage_options: List[str], storage_folder: str,
                               storage_filenames: List[str]) -> List[str]:
    """Build the list of strings for the "Additional Options" scrub section.

    :param opt_uploads: A dictionary (specifically ImmutableMultiDict)
        holding the additional scrubbing option files that were uploaded.
    :param storage_options: A list of strings naming the additional options
        chosen by the user.
    :param storage_folder: A string giving the path of the storage folder.
    :param storage_filenames: A list of filename strings, indexed by the
        position of each upload key in sorted order.
    :return: A list with the four option-file strings followed by the four
        manual option strings taken from the submitted form.
    """

    select_keys = ['consfileselect[]', 'lemfileselect[]',
                   'scfileselect[]', 'swfileselect[]']
    manual_keys = ['manualconsolidations', 'manuallemmas',
                   'manualspecialchars', 'manualstopwords']

    # Start with an empty string for each known key.
    strings_by_key = {name: '' for name in select_keys + manual_keys}

    for slot, upload_key in enumerate(sorted(opt_uploads)):
        storage = opt_uploads[upload_key]

        if not storage.filename:
            if upload_key.strip('[]') in storage_options:
                # Option is among the stored ones: load its text through
                # load_scrub_optional_upload.
                loaded = load_scrub_optional_upload(
                    storage_folder, storage_filenames[slot])
                strings_by_key[upload_key] = loaded
            else:
                # Nothing uploaded and nothing stored: blank the session
                # entry and the local string.
                session['scrubbingoptions']['optuploadnames'][upload_key] = ''
                strings_by_key[upload_key] = ""
        else:
            # A file was uploaded: decode its bytes, then rewind the stream.
            uploaded_bytes = storage.read()
            strings_by_key[upload_key] = general_functions.decode_bytes(
                uploaded_bytes)
            storage.seek(0)

    # Result order: cons, lem, sc and sw file strings, followed by the
    # cons, lem, sc and sw manual strings from the form.
    all_options = []
    for name in select_keys:
        all_options.append(strings_by_key.get(name))
    for name in manual_keys:
        all_options.append(request.form[name])

    return all_options
Beispiel #5
0
    def test_python_string_decoding(self):
        """An input that is already a str must come back unchanged."""
        already_text = "Hello, world!"
        decoded = decode_bytes(already_text)

        assert decoded == already_text
Beispiel #6
0
 def test_windows_1251_decoding(self):
     """Cyrillic text encoded as windows-1251 must round-trip."""
     expected = ('сегодняшнее домашнее задание.'
                 ' Настенные часы висят на стене. ')
     encoded = expected.encode('windows-1251')
     assert decode_bytes(encoded) == expected
Beispiel #7
0
 def test_iso8859_1_decoding(self):
     """Text encoded as iso-8859-1 must round-trip."""
     expected = 'Äpple'
     encoded = expected.encode('iso-8859-1')
     assert decode_bytes(encoded) == expected
Beispiel #8
0
 def test_utf8_decoding(self):
     """Text encoded as utf-8 must round-trip."""
     expected = 'España'
     encoded = expected.encode('utf-8')
     assert decode_bytes(encoded) == expected
Beispiel #9
0
 def test_utf16_decoding(self):
     """Text encoded as utf-16 must round-trip."""
     expected = 'абвгдежзийкл'
     encoded = expected.encode('utf-16')
     assert decode_bytes(encoded) == expected
Beispiel #10
0
 def test_gb2312_decoding(self):
     """Text encoded as gb2312 must round-trip."""
     expected = '做戏之说做戏之'
     encoded = expected.encode('gb2312')
     assert decode_bytes(encoded) == expected
    def test_python_string_decoding(self):
        """Passing a str to decode_bytes must return an equal str."""
        plain_text = "Hello, world!"
        result = decode_bytes(plain_text)

        assert result == plain_text
 def test_windows_1251_decoding(self):
     """windows-1251 bytes must decode back to the original text."""
     original_text = ('сегодняшнее домашнее задание.'
                      ' Настенные часы висят на стене. ')
     raw = original_text.encode('windows-1251')
     assert decode_bytes(raw) == original_text
 def test_iso8859_1_decoding(self):
     """iso-8859-1 bytes must decode back to the original text."""
     original_text = 'Äpple'
     raw = original_text.encode('iso-8859-1')
     assert decode_bytes(raw) == original_text
 def test_utf8_decoding(self):
     """utf-8 bytes must decode back to the original text."""
     original_text = 'España'
     raw = original_text.encode('utf-8')
     assert decode_bytes(raw) == original_text
 def test_utf16_decoding(self):
     """utf-16 bytes must decode back to the original text."""
     original_text = 'абвгдежзийкл'
     raw = original_text.encode('utf-16')
     assert decode_bytes(raw) == original_text
 def test_gb2312_decoding(self):
     """gb2312 bytes must decode back to the original text."""
     original_text = '做戏之说做戏之'
     raw = original_text.encode('gb2312')
     assert decode_bytes(raw) == original_text