Example #1
0
def write_json_lines(json_objects, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False, ensure_ascii=False,
                     separators=(',', ':'), sort_keys=False):
    """
    Iterate over a stream of JSON objects, writing each to a separate line in
    file ``filepath`` but without a top-level JSON object (e.g. array).

    Args:
        json_objects (iterable[json]): iterable of valid JSON objects to be written
        filepath (str): /path/to/file on disk to which JSON objects will be written,
            where each line in the file is its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str): mode passed to ``open_sesame``; if it contains ``'t'``,
            lines are written as text, otherwise each line is written as bytes
        encoding (str): passed through to ``open_sesame``
        auto_make_dirs (bool): passed through to ``open_sesame``
        ensure_ascii (bool): passed through to :func:`json.dumps`
        separators (tuple[str]): passed through to :func:`json.dumps`
        sort_keys (bool): passed through to :func:`json.dumps`

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    text_mode = 't' in mode
    newline = '\n' if text_mode else unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for json_object in json_objects:
            serialized = json.dumps(json_object,
                                    ensure_ascii=ensure_ascii,
                                    separators=separators,
                                    sort_keys=sort_keys)
            # json.dumps produces text, but the newline is bytes outside of
            # text mode; concatenating the two raises TypeError on Python 3,
            # so encode the serialized object first (skipped when it is
            # already a byte string, as may happen on Python 2).
            if not text_mode and not isinstance(serialized, bytes):
                serialized = unicode_to_bytes(serialized)
            f.write(serialized + newline)
Example #2
0
def detect_language(text):
    """
    Guess the most likely language of ``text`` and return its 2-letter code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Detection is delegated to the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_
    package; for its optional params, call :func:`cld2.detect()` directly.

    Args:
        text (str)

    Returns:
        str

    Raises:
        ImportError: if the name ``cld2_detect`` is not available
    """
    # the detector is an optional dependency: a missing name means it was
    # never imported, which is reported as an ImportError with install hints
    try:
        detect = cld2_detect
    except NameError:
        raise ImportError(
            '`cld2-cffi` must be installed to use textacy\'s automatic language detection; '
            'you may do so via `pip install cld2-cffi` or `pip install textacy[lang]`.'
        )

    # on Python 2 the detector is handed bytes rather than unicode
    payload = unicode_to_bytes(text) if is_python2 else text
    reliable, _, guesses = detect(payload, bestEffort=True)
    if reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            guesses)
    top_guess = guesses[0]
    return top_guess[1]
Example #3
0
 def test_read_write_file_bytes(self):
     # Round-trip the fixture text as bytes through every file extension and
     # check that what is read back equals what was written.
     payload = unicode_to_bytes(self.text)
     basename = 'test_read_write_file_bytes'
     for suffix in ('.txt', '.gz', '.bz2', '.xz'):
         path = os.path.join(self.tempdir, basename + suffix)
         fileio.write_file(
             payload, path, mode='wb', auto_make_dirs=True)
         round_tripped = fileio.read_file(path, mode='rb')
         self.assertEqual(round_tripped, payload)
Example #4
0
 def test_read_write_file_lines_bytes(self):
     # Write each sentence of the fixture doc as a bytes line, then read the
     # lines back (stripped) and compare, once per file extension.
     sentences = [
         unicode_to_bytes(sent.text) for sent in self.spacy_doc.sents]
     basename = 'test_read_write_file_lines_bytes'
     for suffix in ('.txt', '.gz', '.bz2', '.xz'):
         path = os.path.join(self.tempdir, basename + suffix)
         fileio.write_file_lines(
             sentences, path, mode='wb', auto_make_dirs=True)
         round_tripped = []
         for raw_line in fileio.read_file_lines(path, mode='rb'):
             round_tripped.append(raw_line.strip())
         self.assertEqual(round_tripped, sentences)
Example #5
0
def coerce_content_type(content, file_mode):
    """
    Reconcile ``content`` with the ``file_mode`` of the file it will be
    written to: bytes headed for a text-mode file are converted to unicode,
    and unicode headed for a bytes-mode file is converted to bytes. Any other
    combination is returned untouched.
    """
    # text mode ('t') cannot take bytes
    if 't' in file_mode and isinstance(content, compat.bytes_):
        return compat.bytes_to_unicode(content)
    # binary mode ('b') cannot take unicode
    if 'b' in file_mode and isinstance(content, compat.unicode_):
        return compat.unicode_to_bytes(content)
    # already compatible, or the mode names neither 't' nor 'b'
    return content
Example #6
0
def coerce_content_type(content, file_mode):
    """
    Make ``content`` writable under ``file_mode`` when the two disagree:
    bytes are turned into unicode for a text mode, unicode is turned into
    bytes for a bytes mode, and everything else passes through unchanged.
    """
    is_text_mode = 't' in file_mode
    if is_text_mode and isinstance(content, compat.bytes_type):
        # bytes content, text-mode file
        coerced = compat.bytes_to_unicode(content)
    elif 'b' in file_mode and isinstance(content, compat.unicode_type):
        # unicode content, bytes-mode file
        coerced = compat.unicode_to_bytes(content)
    else:
        coerced = content
    return coerced
Example #7
0
def test_read_write_text_bytes(tmpdir):
    # Round-trip TEXT as bytes through each file extension; on Python 2 the
    # '.xz' case is instead expected to raise ValueError when opened.
    payload = compat.unicode_to_bytes(TEXT)
    for suffix in ('.txt', '.gz', '.bz2', '.xz'):
        path = str(tmpdir.join('test_read_write_file_bytes' + suffix))
        if compat.is_python2 is True and suffix == '.xz':
            with pytest.raises(ValueError):
                io.open_sesame(
                    path, mode='wb', encoding='utf-8', make_dirs=True)
            continue
        io.write_text(payload, path, mode='wb', make_dirs=True)
        round_tripped = next(io.read_text(path, mode='rb'))
        assert round_tripped == payload
Example #8
0
def write_file_lines(lines, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False):
    """
    Write every item of ``lines`` to the file at ``filepath``, each followed
    by a newline. The file is opened via ``open_sesame``, so that files with
    appropriate extensions are compressed with gzip or bz2 automatically and
    any intermediate folders not found on disk may automatically be created.
    """
    # the line terminator has to match the kind of data the file accepts
    if 't' in mode:
        terminator = '\n'
    else:
        terminator = unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as out:
        for entry in lines:
            out.write(entry + terminator)
Example #9
0
def test_read_write_text_bytes(tmpdir):
    # Bytes written with write_text must come back unchanged from read_text
    # for every extension; '.xz' on Python 2 must fail with ValueError on open.
    payload = compat.unicode_to_bytes(TEXT)
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        path = str(tmpdir.join("test_read_write_file_bytes" + suffix))
        xz_on_py2 = compat.is_python2 is True and suffix == ".xz"
        if not xz_on_py2:
            io.write_text(payload, path, mode="wb", make_dirs=True)
            round_tripped = next(io.read_text(path, mode="rb"))
            assert round_tripped == payload
        else:
            with pytest.raises(ValueError):
                io.open_sesame(
                    path, mode="wb", encoding="utf-8", make_dirs=True)
Example #10
0
def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    # Write the doc's sentences as bytes lines and read them back (stripped)
    # for each extension; '.xz' on Python 2 must raise ValueError when opened.
    sentences = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for suffix in ('.txt', '.gz', '.bz2', '.xz'):
        path = str(tmpdir.join('test_read_write_file_lines_bytes' + suffix))
        if compat.is_python2 is True and suffix == '.xz':
            with pytest.raises(ValueError):
                io.open_sesame(
                    path, mode='wb', encoding='utf-8', make_dirs=True)
            continue
        io.write_text(sentences, path, mode='wb', make_dirs=True, lines=True)
        round_tripped = []
        for raw_line in io.read_text(path, mode='rb', lines=True):
            round_tripped.append(raw_line.strip())
        assert round_tripped == sentences
Example #11
0
 def test_read_write_file_bytes(self):
     # Round-trip the fixture text as bytes for each extension; on Python 2
     # opening an '.xz' file is expected to raise ValueError instead.
     payload = unicode_to_bytes(self.text)
     basename = 'test_read_write_file_bytes'
     for suffix in ('.txt', '.gz', '.bz2', '.xz'):
         path = os.path.join(self.tempdir, basename + suffix)
         if PY2 is True and suffix == '.xz':
             with self.assertRaises(ValueError):
                 fileio.open_sesame(path, 'wb', 'utf-8', True)
             continue
         fileio.write_file(
             payload, path, mode='wb', auto_make_dirs=True)
         round_tripped = fileio.read_file(path, mode='rb')
         self.assertEqual(round_tripped, payload)
Example #12
0
 def test_read_write_file_lines_bytes(self):
     # Write the fixture doc's sentences as bytes lines and compare with the
     # stripped lines read back; '.xz' on Python 2 must raise ValueError.
     sentences = [
         unicode_to_bytes(sent.text) for sent in self.spacy_doc.sents]
     basename = 'test_read_write_file_lines_bytes'
     for suffix in ('.txt', '.gz', '.bz2', '.xz'):
         path = os.path.join(self.tempdir, basename + suffix)
         if PY2 is True and suffix == '.xz':
             with self.assertRaises(ValueError):
                 fileio.open_sesame(path, 'wb', 'utf-8', True)
             continue
         fileio.write_file_lines(
             sentences, path, mode='wb', auto_make_dirs=True)
         round_tripped = []
         for raw_line in fileio.read_file_lines(path, mode='rb'):
             round_tripped.append(raw_line.strip())
         self.assertEqual(round_tripped, sentences)
Example #13
0
def write_file_lines(lines,
                     filepath,
                     mode='wt',
                     encoding=None,
                     auto_make_dirs=False):
    """
    Write the items of ``lines`` one per line to ``filepath``. The file is
    opened through ``open_sesame``, so that files with appropriate extensions
    are compressed with gzip or bz2 automatically and any intermediate folders
    not found on disk may automatically be created.
    """
    # pick a line terminator of the same kind (text vs. bytes) as the mode
    line_end = unicode_to_bytes('\n') if 't' not in mode else '\n'
    open_kwargs = dict(mode=mode,
                       encoding=encoding,
                       auto_make_dirs=auto_make_dirs)
    with open_sesame(filepath, **open_kwargs) as sink:
        for item in lines:
            sink.write(item + line_end)
Example #14
0
def detect_language(text):
    """
    Guess the most likely language of ``text`` and return its 2-letter code
    (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Detection is delegated to the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_
    package; for its optional params, call :func:`cld2.detect()` directly.

    Args:
        text (str)

    Returns:
        str
    """
    # on Python 2 the detector is handed bytes rather than unicode
    payload = unicode_to_bytes(text) if PY2 else text
    reliable, _, guesses = cld2_detect(payload, bestEffort=True)
    if reliable is False:
        logger.warning(
            'Text language detected with low confidence; best guesses: %s',
            guesses)
    top_guess = guesses[0]
    return top_guess[1]
Example #15
0
def test_read_write_text_lines_bytes(tmpdir, spacy_doc):
    # Sentences written as bytes lines must be read back unchanged (after
    # stripping) for every extension; '.xz' on Python 2 must raise ValueError
    # when the file is opened.
    sentences = [compat.unicode_to_bytes(sent.text) for sent in spacy_doc.sents]
    for suffix in (".txt", ".gz", ".bz2", ".xz"):
        path = str(tmpdir.join("test_read_write_file_lines_bytes" + suffix))
        xz_on_py2 = compat.is_python2 is True and suffix == ".xz"
        if not xz_on_py2:
            io.write_text(
                sentences, path, mode="wb", make_dirs=True, lines=True)
            raw_lines = io.read_text(path, mode="rb", lines=True)
            round_tripped = [raw_line.strip() for raw_line in raw_lines]
            assert round_tripped == sentences
        else:
            with pytest.raises(ValueError):
                io.open_sesame(
                    path, mode="wb", encoding="utf-8", make_dirs=True)
Example #16
0
def detect_language(text):
    """
    Identify the most probable language of ``text``, returned as a 2-letter
    code (see https://cloud.google.com/translate/v2/using_rest#language-params).
    Relies on the `cld2-cffi <https://pypi.python.org/pypi/cld2-cffi>`_ package;
    call :func:`cld2.detect()` directly to use its optional params.

    Args:
        text (str)

    Returns:
        str
    """
    if PY2:
        # Python 2: pass the text to the detector as bytes
        detector_input = unicode_to_bytes(text)
    else:
        detector_input = text
    detection = cld2_detect(detector_input, bestEffort=True)
    confident, _, candidates = detection
    if confident is False:
        # still return the top guess, but let the caller know via the log
        warning = 'Text language detected with low confidence; best guesses: %s'
        logger.warning(warning, candidates)
    return candidates[0][1]
Example #17
0
def write_json_lines(json_objects,
                     filepath,
                     mode='wt',
                     encoding=None,
                     auto_make_dirs=False,
                     ensure_ascii=False,
                     separators=(',', ':'),
                     sort_keys=False):
    """
    Iterate over a stream of JSON objects, writing each to a separate line in
    file ``filepath`` but without a top-level JSON object (e.g. array).

    Args:
        json_objects (iterable[json]): iterable of valid JSON objects to be written
        filepath (str): /path/to/file on disk to which JSON objects will be written,
            where each line in the file is its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str): mode passed to ``open_sesame``; if it contains ``'t'``,
            lines are written as text, otherwise each line is written as bytes
        encoding (str): passed through to ``open_sesame``
        auto_make_dirs (bool): passed through to ``open_sesame``
        ensure_ascii (bool): passed through to :func:`json.dumps`
        separators (tuple[str]): passed through to :func:`json.dumps`
        sort_keys (bool): passed through to :func:`json.dumps`

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    text_mode = 't' in mode
    newline = '\n' if text_mode else unicode_to_bytes('\n')
    with open_sesame(filepath,
                     mode=mode,
                     encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for json_object in json_objects:
            serialized = json.dumps(json_object,
                                    ensure_ascii=ensure_ascii,
                                    separators=separators,
                                    sort_keys=sort_keys)
            # json.dumps produces text, but the newline is bytes outside of
            # text mode; concatenating the two raises TypeError on Python 3,
            # so encode the serialized object first (skipped when it is
            # already a byte string, as may happen on Python 2).
            if not text_mode and not isinstance(serialized, bytes):
                serialized = unicode_to_bytes(serialized)
            f.write(serialized + newline)