Python unicode_to_bytes Examples, textacy.compat.unicode_to_bytes Python Examples

Example #1

0

Show file

File: write.py Project: GregBowyer/textacy

def write_json_lines(json_objects, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False, ensure_ascii=False,
                     separators=(',', ':'), sort_keys=False):
    """
    Iterate over a stream of JSON objects, writing each to a separate line in
    file ``filepath`` but without a top-level JSON object (e.g. array).

    Args:
        json_objects (iterable[json]): iterable of valid JSON objects to be written
        filepath (str): /path/to/file on disk to which JSON objects will be written,
            where each line in the file is its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str): passed to ``open_sesame``; if it does not contain ``'t'``,
            each serialized line is converted to bytes before it is written
        encoding (str): passed to ``open_sesame``
        auto_make_dirs (bool): passed to ``open_sesame``
        ensure_ascii (bool): passed to :func:`json.dumps`
        separators (tuple[str]): passed to :func:`json.dumps`
        sort_keys (bool): passed to :func:`json.dumps`

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    write_bytes = 't' not in mode
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for json_object in json_objects:
            line = json.dumps(json_object,
                              ensure_ascii=ensure_ascii,
                              separators=separators,
                              sort_keys=sort_keys) + '\n'
            # json.dumps produces text on Python 3, and text cannot be
            # concatenated with (or written as) bytes; encode the finished
            # line only when it is not already a byte string.
            if write_bytes and not isinstance(line, bytes):
                line = unicode_to_bytes(line)
            f.write(line)

Example #2

0

Show file

def detect_language(text):
    """
    Guess the language in which ``text`` is written and return the code of the
    top guess (a 2-letter code; see
    https://cloud.google.com/translate/v2/using_rest#language-params).

    Detection is delegated to the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_
    package, always with ``bestEffort=True``; call :func:`cld2.detect()`
    directly to use its other optional params.

    Args:
        text (str)

    Returns:
        str

    Raises:
        ImportError: if ``cld2_detect`` is not defined in this module
    """
    try:
        detector = cld2_detect
    except NameError:
        raise ImportError(
            '`cld2-cffi` must be installed to use textacy\'s automatic language detection; '
            'you may do so via `pip install cld2-cffi` or `pip install textacy[lang]`.'
        )

    # On Python 2 the detector is handed bytes instead of unicode text.
    payload = unicode_to_bytes(text) if is_python2 else text
    reliable, _, guesses = detector(payload, bestEffort=True)
    if reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            guesses)
    return guesses[0][1]

Example #3

0

Show file

File: test_fileio.py Project: GregBowyer/textacy

 def test_read_write_file_bytes(self):
     """
     Bytes written with ``fileio.write_file`` in 'wb' mode should be read
     back unchanged by ``fileio.read_file`` in 'rb' mode, for each extension.
     """
     payload = unicode_to_bytes(self.text)
     stem = 'test_read_write_file_bytes'
     for suffix in ('.txt', '.gz', '.bz2', '.xz'):
         path = os.path.join(self.tempdir, stem + suffix)
         fileio.write_file(payload, path, mode='wb', auto_make_dirs=True)
         roundtripped = fileio.read_file(path, mode='rb')
         self.assertEqual(roundtripped, payload)

Example #4

0

Show file

File: test_fileio.py Project: GregBowyer/textacy

 def test_read_write_file_lines_bytes(self):
     """
     Byte lines written with ``fileio.write_file_lines`` in 'wb' mode should
     equal the stripped lines read back in 'rb' mode, for each extension.
     """
     sentences = [unicode_to_bytes(sent.text) for sent in self.spacy_doc.sents]
     stem = 'test_read_write_file_lines_bytes'
     for suffix in ('.txt', '.gz', '.bz2', '.xz'):
         path = os.path.join(self.tempdir, stem + suffix)
         fileio.write_file_lines(sentences, path, mode='wb',
                                 auto_make_dirs=True)
         # Stripping removes the newline appended to every written line.
         roundtripped = []
         for raw_line in fileio.read_file_lines(path, mode='rb'):
             roundtripped.append(raw_line.strip())
         self.assertEqual(roundtripped, sentences)

Example #5

0

Show file

File: utils.py Project: winstonewert/textacy

def coerce_content_type(content, file_mode):
    """
    Make ``content`` writable to a file opened with ``file_mode``.

    Bytes destined for a text-mode ('t') file are converted to unicode, and
    unicode destined for a bytes-mode ('b') file is converted to bytes.
    Anything else is returned untouched.
    """
    text_mode = 't' in file_mode
    if text_mode and isinstance(content, compat.bytes_):
        return compat.bytes_to_unicode(content)

    bytes_mode = 'b' in file_mode
    if bytes_mode and isinstance(content, compat.unicode_):
        return compat.unicode_to_bytes(content)

    # Content already matches the mode, or the mode names neither type.
    return content

Example #6

0

Show file

File: utils.py Project: chartbeat-labs/textacy

def coerce_content_type(content, file_mode):
    """
    Make ``content`` writable to a file opened with ``file_mode``.

    Bytes destined for a text-mode ('t') file are converted to unicode, and
    unicode destined for a bytes-mode ('b') file is converted to bytes.
    Anything else is returned untouched.
    """
    text_mode = 't' in file_mode
    if text_mode and isinstance(content, compat.bytes_type):
        return compat.bytes_to_unicode(content)

    bytes_mode = 'b' in file_mode
    if bytes_mode and isinstance(content, compat.unicode_type):
        return compat.unicode_to_bytes(content)

    # Content already matches the mode, or the mode names neither type.
    return content

Example #7

0

Show file

def test_read_write_text_bytes(tmpdir):
    """
    Bytes written with ``io.write_text`` in 'wb' mode should equal the first
    item read back with ``io.read_text`` in 'rb' mode. On Python 2, opening
    a '.xz' file is expected to raise ``ValueError`` instead.
    """
    payload = compat.unicode_to_bytes(TEXT)
    for suffix in ('.txt', '.gz', '.bz2', '.xz'):
        path = str(tmpdir.join('test_read_write_file_bytes' + suffix))
        xz_on_py2 = compat.is_python2 is True and suffix == '.xz'
        if xz_on_py2:
            with pytest.raises(ValueError):
                io.open_sesame(
                    path, mode='wb', encoding='utf-8', make_dirs=True)
            continue
        io.write_text(payload, path, mode='wb', make_dirs=True)
        roundtripped = next(io.read_text(path, mode='rb'))
        assert roundtripped == payload

Example #8

0

Show file

File: write.py Project: GregBowyer/textacy

def write_file_lines(lines, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False):
    """
    Write each item of ``lines`` to the file at ``filepath``, followed by a
    newline. Files with appropriate extensions are compressed with gzip or
    bz2 automatically, and missing intermediate folders may be created
    automatically.

    Args:
        lines: iterable of items to write, one per line
        filepath (str): passed to ``open_sesame``
        mode (str): passed to ``open_sesame``; if it does not contain 't',
            the newline appended to each item is converted to bytes
        encoding (str): passed to ``open_sesame``
        auto_make_dirs (bool): passed to ``open_sesame``
    """
    if 't' in mode:
        line_end = '\n'
    else:
        line_end = unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as out:
        for entry in lines:
            out.write(entry + line_end)

Example #9

0

Show file

File: test_io.py Project: yashchoubey/textacy

def test_read_write_text_bytes(tmpdir):
    """
    Bytes written with ``io.write_text`` in "wb" mode should equal the first
    item read back with ``io.read_text`` in "rb" mode. On Python 2, opening
    a ".xz" file is expected to raise ``ValueError`` instead.
    """
    payload = compat.unicode_to_bytes(TEXT)
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        path = str(tmpdir.join("test_read_write_file_bytes" + suffix))
        xz_on_py2 = compat.is_python2 is True and suffix == ".xz"
        if xz_on_py2:
            with pytest.raises(ValueError):
                io.open_sesame(path,
                               mode="wb",
                               encoding="utf-8",
                               make_dirs=True)
            continue
        io.write_text(payload, path, mode="wb", make_dirs=True)
        roundtripped = next(io.read_text(path, mode="rb"))
        assert roundtripped == payload

Example #10

0

Show file

def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    """
    Byte lines written with ``io.write_text(..., lines=True)`` in 'wb' mode
    should equal the stripped lines read back in 'rb' mode. On Python 2,
    opening a '.xz' file is expected to raise ``ValueError`` instead.
    """
    sentences = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for suffix in ('.txt', '.gz', '.bz2', '.xz'):
        path = str(tmpdir.join('test_read_write_file_lines_bytes' + suffix))
        xz_on_py2 = compat.is_python2 is True and suffix == '.xz'
        if xz_on_py2:
            with pytest.raises(ValueError):
                io.open_sesame(
                    path, mode='wb', encoding='utf-8', make_dirs=True)
            continue
        io.write_text(sentences, path, mode='wb', make_dirs=True, lines=True)
        roundtripped = []
        for raw_line in io.read_text(path, mode='rb', lines=True):
            roundtripped.append(raw_line.strip())
        assert roundtripped == sentences

Example #11

0

Show file

 def test_read_write_file_bytes(self):
     """
     Bytes written with ``fileio.write_file`` in 'wb' mode should be read
     back unchanged in 'rb' mode. On Python 2, opening a '.xz' file is
     expected to raise ``ValueError`` instead.
     """
     payload = unicode_to_bytes(self.text)
     stem = 'test_read_write_file_bytes'
     for suffix in ('.txt', '.gz', '.bz2', '.xz'):
         path = os.path.join(self.tempdir, stem + suffix)
         xz_on_py2 = PY2 is True and suffix == '.xz'
         if xz_on_py2:
             with self.assertRaises(ValueError):
                 fileio.open_sesame(path, 'wb', 'utf-8', True)
             continue
         fileio.write_file(payload, path, mode='wb', auto_make_dirs=True)
         roundtripped = fileio.read_file(path, mode='rb')
         self.assertEqual(roundtripped, payload)

Example #12

0

Show file

 def test_read_write_file_lines_bytes(self):
     """
     Byte lines written with ``fileio.write_file_lines`` in 'wb' mode should
     equal the stripped lines read back in 'rb' mode. On Python 2, opening a
     '.xz' file is expected to raise ``ValueError`` instead.
     """
     sentences = [unicode_to_bytes(sent.text) for sent in self.spacy_doc.sents]
     stem = 'test_read_write_file_lines_bytes'
     for suffix in ('.txt', '.gz', '.bz2', '.xz'):
         path = os.path.join(self.tempdir, stem + suffix)
         xz_on_py2 = PY2 is True and suffix == '.xz'
         if xz_on_py2:
             with self.assertRaises(ValueError):
                 fileio.open_sesame(path, 'wb', 'utf-8', True)
             continue
         fileio.write_file_lines(sentences, path, mode='wb',
                                 auto_make_dirs=True)
         roundtripped = []
         for raw_line in fileio.read_file_lines(path, mode='rb'):
             roundtripped.append(raw_line.strip())
         self.assertEqual(roundtripped, sentences)

Example #13

0

Show file

File: write.py Project: winstonewert/textacy

def write_file_lines(lines,
                     filepath,
                     mode='wt',
                     encoding=None,
                     auto_make_dirs=False):
    """
    Write each item of ``lines`` to the file at ``filepath``, followed by a
    newline. Files with appropriate extensions are compressed with gzip or
    bz2 automatically, and missing intermediate folders may be created
    automatically.

    Args:
        lines: iterable of items to write, one per line
        filepath (str): passed to ``open_sesame``
        mode (str): passed to ``open_sesame``; if it does not contain 't',
            the newline appended to each item is converted to bytes
        encoding (str): passed to ``open_sesame``
        auto_make_dirs (bool): passed to ``open_sesame``
    """
    if 't' in mode:
        line_end = '\n'
    else:
        line_end = unicode_to_bytes('\n')
    with open_sesame(filepath,
                     mode=mode,
                     encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as out:
        for entry in lines:
            out.write(entry + line_end)

Example #14

0

Show file

def detect_language(text):
    """
    Guess the language in which ``text`` is written and return the code of the
    top guess (a 2-letter code; see
    https://cloud.google.com/translate/v2/using_rest#language-params).

    Detection is delegated to the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_
    package, always with ``bestEffort=True``; call :func:`cld2.detect()`
    directly to use its other optional params.

    Args:
        text (str)

    Returns:
        str
    """
    # On Python 2 the detector is handed bytes instead of unicode text.
    payload = unicode_to_bytes(text) if PY2 else text
    reliable, _, guesses = cld2_detect(payload, bestEffort=True)
    if reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            guesses)
    return guesses[0][1]

Example #15

0

Show file

File: test_io.py Project: yashchoubey/textacy

def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    """
    Byte lines written with ``io.write_text(..., lines=True)`` in "wb" mode
    should equal the stripped lines read back in "rb" mode. On Python 2,
    opening a ".xz" file is expected to raise ``ValueError`` instead.
    """
    sentences = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        path = str(tmpdir.join("test_read_write_file_lines_bytes" + suffix))
        xz_on_py2 = compat.is_python2 is True and suffix == ".xz"
        if xz_on_py2:
            with pytest.raises(ValueError):
                io.open_sesame(path,
                               mode="wb",
                               encoding="utf-8",
                               make_dirs=True)
            continue
        io.write_text(sentences,
                      path,
                      mode="wb",
                      make_dirs=True,
                      lines=True)
        roundtripped = []
        for raw_line in io.read_text(path, mode="rb", lines=True):
            roundtripped.append(raw_line.strip())
        assert roundtripped == sentences

Example #16

0

Show file

File: text_utils.py Project: GregBowyer/textacy

def detect_language(text):
    """
    Guess the language in which ``text`` is written and return the code of the
    top guess (a 2-letter code; see
    https://cloud.google.com/translate/v2/using_rest#language-params).

    Detection is delegated to the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_
    package, always with ``bestEffort=True``; call :func:`cld2.detect()`
    directly to use its other optional params.

    Args:
        text (str)

    Returns:
        str
    """
    # On Python 2 the detector is handed bytes instead of unicode text.
    payload = unicode_to_bytes(text) if PY2 else text
    reliable, _, guesses = cld2_detect(payload, bestEffort=True)
    if reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            guesses)
    return guesses[0][1]

Example #17

0

Show file

File: write.py Project: winstonewert/textacy

def write_json_lines(json_objects,
                     filepath,
                     mode='wt',
                     encoding=None,
                     auto_make_dirs=False,
                     ensure_ascii=False,
                     separators=(',', ':'),
                     sort_keys=False):
    """
    Iterate over a stream of JSON objects, writing each to a separate line in
    file ``filepath`` but without a top-level JSON object (e.g. array).

    Args:
        json_objects (iterable[json]): iterable of valid JSON objects to be written
        filepath (str): /path/to/file on disk to which JSON objects will be written,
            where each line in the file is its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str): passed to ``open_sesame``; if it does not contain ``'t'``,
            each serialized line is converted to bytes before it is written
        encoding (str): passed to ``open_sesame``
        auto_make_dirs (bool): passed to ``open_sesame``
        ensure_ascii (bool): passed to :func:`json.dumps`
        separators (tuple[str]): passed to :func:`json.dumps`
        sort_keys (bool): passed to :func:`json.dumps`

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    write_bytes = 't' not in mode
    with open_sesame(filepath,
                     mode=mode,
                     encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for json_object in json_objects:
            line = json.dumps(json_object,
                              ensure_ascii=ensure_ascii,
                              separators=separators,
                              sort_keys=sort_keys) + '\n'
            # json.dumps produces text on Python 3, and text cannot be
            # concatenated with (or written as) bytes; encode the finished
            # line only when it is not already a byte string.
            if write_bytes and not isinstance(line, bytes):
                line = unicode_to_bytes(line)
            f.write(line)