def test_init_default_valid2(self):
    """Check that a Unicode string is accepted as the default value.

    Building the mapper with an empty map and the default 'Hello' must not
    raise.
    """
    mapper = CharMapper({}, 'Hello')
    assert mapper
def test_init_default_not_valid1(self):
    """Check that a list passed as the default triggers a TypeError.
    """
    bad_default = []
    with pytest.raises(TypeError):
        CharMapper({}, bad_default)
def test_init_default_valid1(self):
    """Check that None is accepted as the default value.

    Building the mapper with an empty map and a None default must not raise.
    """
    mapper = CharMapper({}, None)
    assert mapper
def test_builtinmapper_xmlbw2hsb(self):
    """Check that the builtin 'xmlbw2hsb' scheme loads without errors.
    """
    mapper = CharMapper.builtin_mapper('xmlbw2hsb')
    assert mapper
def __init__(self) -> None:
    """Initialise the parent class and load the builtin character mappers.

    Sets `sec_cleaner` to None and loads the 'arclean', 'ar2bw' and 'bw2ar'
    builtin schemes into `clean_mapper`, `ar2bw_mapper` and `bw2ar_mapper`
    respectively.
    """
    super().__init__()

    self.sec_cleaner = None

    # Mappers are loaded in the same order as before: clean, ar2bw, bw2ar.
    load_builtin = CharMapper.builtin_mapper
    self.clean_mapper = load_builtin('arclean')
    self.ar2bw_mapper = load_builtin('ar2bw')
    self.bw2ar_mapper = load_builtin('bw2ar')
def test_mapstring_english(self):
    """Check that map_string maps an English unicode string as expected.
    """
    expected = 'Hu**o, wor*m!'
    actual = CharMapper(VALID_MAP).map_string('Hello, world!')
    assert actual == expected
def test_builtinmapper_bw2ar(self):
    """Check that the builtin 'bw2ar' scheme loads without errors.
    """
    mapper = CharMapper.builtin_mapper('bw2ar')
    assert mapper
def test_init_charmap_valid3(self):
    """Check that a valid charMap is accepted without raising.

    The map used here has the single key 'a-f' mapped to an empty string.
    """
    charmap = {u'a-f': u''}
    mapper = CharMapper(charmap)
    assert mapper
def test_builtinmapper_arclean(self):
    """Check that the builtin 'arclean' scheme loads without errors.
    """
    mapper = CharMapper.builtin_mapper('arclean')
    assert mapper
def main():  # pragma: no cover
    """Entry point of the `camel_transliterate` command-line tool.

    Parses the command line with docopt and then does one of:

    - With `--list`: prints every builtin scheme (name padded to 20 columns,
      followed by its second field) and exits with status 0.
    - With `--scheme`: validates the scheme name, opens the input and output
      streams through `_open_files`, transliterates the input line by line
      and writes each result to the output.

    The process always terminates through `sys.exit`: status 0 on success,
    status 1 on an invalid scheme, a scheme that fails to load, a failure
    during transliteration, a keyboard interrupt or any other unexpected
    exception. Error messages are written to stderr.
    """
    try:
        version = 'CAMeL Tools v{}'.format(__version__)
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for scheme in _BUILTIN_SCHEMES:
                print("{} {}".format(scheme[0].ljust(20), scheme[1]))
            sys.exit(0)

        if arguments['--scheme'] is not None:
            if arguments['--scheme'] not in [s[0] for s in _BUILTIN_SCHEMES]:
                sys.stderr.write('Error: {} is not a valid scheme.\n'
                                 'Run `camel_transliterate -l` to see the list'
                                 ' of available schemes.'
                                 '\n'.format(repr(arguments['--scheme'])))
                sys.exit(1)

            # Fall back to the default marker when none was supplied.
            if arguments['--marker'] is None:
                marker = '@@IGNORE@@'
            else:
                marker = arguments['--marker']

            ignore_markers = arguments['--ignore-markers']
            strip_markers = arguments['--strip-markers']

            # Open files (or just use stdin and stdout)
            fin, fout = _open_files(arguments['FILE'], arguments['--output'])

            # The finally clause guarantees that files opened above are
            # closed on the error paths too (sys.exit raises SystemExit,
            # which still runs the cleanup below).
            try:
                # Load the CharMapper and initialize a Transliterator with it
                try:
                    mapper = CharMapper.builtin_mapper(arguments['--scheme'])
                    trans = Transliterator(mapper, marker)
                except Exception:  # pylint: disable=W0703
                    sys.stderr.write('Error: Could not load builtin scheme'
                                     ' {}.\n'.format(
                                         repr(arguments['--scheme'])))
                    sys.exit(1)

                # Transliterate lines
                try:
                    for line in fin:
                        line = force_unicode(line)

                        if six.PY3:
                            fout.write(
                                trans.transliterate(line, strip_markers,
                                                    ignore_markers))
                        else:
                            # On Python 2 the result is re-encoded before
                            # being written out.
                            fout.write(
                                force_encoding(
                                    trans.transliterate(line, strip_markers,
                                                        ignore_markers)))
                    fout.flush()

                # If everything worked so far, this shouldn't happen
                except Exception:  # pylint: disable=W0703
                    sys.stderr.write('Error: An unknown error occurred during '
                                     'transliteration.\n')
                    sys.exit(1)
            finally:
                # Cleanup: only close streams that were opened from a path;
                # the standard streams are left alone.
                if arguments['FILE'] is not None:
                    fin.close()
                if arguments['--output'] is not None:
                    fout.close()

        sys.exit(0)
    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
def test_init_charmap_valid2(self):
    """Check that a valid charMap is accepted without raising.

    The map used here has the single key 'a' mapped to None.
    """
    charmap = {u'a': None}
    mapper = CharMapper(charmap)
    assert mapper
# Identify No Analysis marker _NOAN_RE = re.compile(u'NOAN') _COPY_FEATS = frozenset([ 'gloss', 'atbtok', 'atbseg', 'd1tok', 'd1seg', 'd2tok', 'd2seg', 'd3tok', 'd3seg' ]) _UNDEFINED_LEX_FEATS = frozenset(['root', 'pattern', 'caphi']) DEFAULT_NORMALIZE_MAP = CharMapper({ u'\u0625': u'\u0627', u'\u0623': u'\u0627', u'\u0622': u'\u0627', u'\u0671': u'\u0627', u'\u0649': u'\u064a', u'\u0629': u'\u0647', u'\u0640': u'' }) """:obj:`~camel_tools.utils.charmap.CharMapper`: The default character map used for normalization by :obj:`CalimaStarAnalyzer`. Removes the tatweel/kashida character and does the following conversions: - 'إ' to 'ا' - 'أ' to 'ا' - 'آ' to 'ا' - 'ٱ' to 'ا' - 'ى' to 'ي' - 'ة' to 'ه'
from __future__ import absolute_import import pytest from camel_tools.utils.charmap import CharMapper from camel_tools.utils.transliterate import Transliterator # A mapper that translates lower-case English characters to a lower-case x and # upper-case English characters to an upper-case X. This makes it easy to # predict what the transliteration should be. TEST_MAP = { u'A-Z': u'X', u'a-z': u'x', } TEST_MAPPER = CharMapper(TEST_MAP, None) class TestTransliteratorInit(object): """Test class for Transliterator.__init__. """ def test_init_none_mapper(self): """Test that init raises a TypeError when given a mapper that is None. """ with pytest.raises(TypeError): Transliterator(None) def test_init_invalid_type_mapper(self): """Test that init raises a TypeError when given a mapper that is not a CharMapper instance.
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import subprocess import sys import os import argparse import time from helpers.preprocess import preprocess from helpers.tag import tag from ai.tests.mle import train_mle, predict_mle from camel_tools.utils.charmap import CharMapper ar2bw = CharMapper.builtin_mapper('ar2bw') def is_bool(s): return str(s) != 'False' parser = argparse.ArgumentParser( description= 'This program rewrites (transliterates) from one language script to another' ) # --model_name can take values "mle", "word2word", "line2line", or "hybrid" parser.add_argument('--model_name', action="store", dest='model_name',
def test_init_charmap_valid4(self):
    """Check that a valid charMap with a string default does not raise.

    The map combines the key 'a-f' (mapped to an empty string) with the key
    'b' (mapped to None); the default is 'Hello'.
    """
    charmap = {'a-f': '', 'b': None}
    mapper = CharMapper(charmap, 'Hello')
    assert mapper
def test_init_none(self):
    """Check that passing None as the charMap triggers a TypeError.
    """
    charmap = None
    with pytest.raises(TypeError):
        CharMapper(charmap)
def test_init_charmap_valid5(self):
    """Check that a valid charMap is accepted without raising.

    The map used here has the single key '--a' mapped to an empty string.
    """
    charmap = {'--a': ''}
    mapper = CharMapper(charmap)
    assert mapper
def test_init_empty_dict(self):
    """Check that an empty dict is accepted as the charMap without raising.
    """
    mapper = CharMapper({})
    assert mapper
def test_mapstring_arabic(self):
    """Check that map_string maps an Arabic unicode string as expected.
    """
    expected = '012---++++'
    actual = CharMapper(VALID_MAP).map_string('٠١٢٣٤٥٦٧٨٩')
    assert actual == expected
def test_init_dictlike_object(self):
    """Check that a dict-like object is accepted as the charMap.

    An `AnotherMapping` instance is passed in place of a plain dict and
    construction must not raise.
    """
    dictlike = AnotherMapping()
    mapper = CharMapper(dictlike)
    assert mapper
def test_builtinmapper_safebw2bw(self):
    """Check that the builtin 'safebw2bw' scheme loads without errors.
    """
    mapper = CharMapper.builtin_mapper('safebw2bw')
    assert mapper
def test_init_not_dict(self):
    """Check that a non-dict charMap (a list) triggers a TypeError.
    """
    not_a_dict = []
    with pytest.raises(TypeError):
        CharMapper(not_a_dict)
def test_builtinmapper_hsb2xmlbw(self):
    """Check that the builtin 'hsb2xmlbw' scheme loads without errors.
    """
    mapper = CharMapper.builtin_mapper('hsb2xmlbw')
    assert mapper
"""This module provides functions for normalizing Arabic text. """ import re import unicodedata from camel_tools.utils.charmap import CharMapper _ALEF_NORMALIZE_BW_RE = re.compile(u'[<>{|]') _ALEF_NORMALIZE_SAFEBW_RE = re.compile(u'[IOLM]') _ALEF_NORMALIZE_XMLBW_RE = re.compile(u'[IO{|]') _ALEF_NORMALIZE_HSB_RE = re.compile(u'[\u0102\u00c2\u00c4\u0100]') _ALEF_NORMALIZE_AR_RE = re.compile(u'[\u0625\u0623\u0671\u0622]') _UNICODE_CHAR_FIX = CharMapper({ '\ufdfc': 'ريال', '\ufdfd': 'بسم الله الرحمن الرحيم', }) def normalize_unicode(s, compatibility=True): """Normalize Unicode strings into their canonically composed form or (i.e. characters that can be written as a combination of unicode characters are converted to their single character form). Note: This is essentially a call to :func:`unicodedata.normalize` with form 'NFC' if **compatibility** is False or 'NFKC' if it's True. Args: s (:obj:`str`): The string to be normalized. compatibility (:obj:`bool`, optional): Apply compatibility decomposition. Defaults to True.