Python ArabicReshaper Examples, arabic_reshaper.ArabicReshaper Python Examples

Example #1

0

Show file

File: generator.py Project: rahmaatallah/edraak-platform

def text_to_bidi(text):
    """Return ``text`` prepared for display.

    The input is passed through ``normalize_spaces``, reshaped by an
    ``ArabicReshaper`` created with ``use_unshaped_instead_of_isolated``
    enabled, and finally handed to ``get_display``.
    """
    normalized = normalize_spaces(text)
    reshaper = ArabicReshaper(
        configuration=dict(use_unshaped_instead_of_isolated=True))
    return get_display(reshaper.reshape(normalized))

Example #2

0

Show file

File: pdf.py Project: 17702296834/pretix

    def _draw_textarea(self, canvas: Canvas, op: OrderPosition, order: Order,
                       o: dict):
        """Draw the text element described by ``o`` onto ``canvas``.

        :param canvas: Canvas the paragraph is drawn on.
        :param op: Passed through to ``self._get_text_content``.
        :param order: Passed through to ``self._get_text_content``.
        :param o: Layout element. Keys read here: ``fontfamily``, ``bold``,
            ``italic``, ``fontsize``, ``color`` (the first three components
            are each divided by 255), ``align`` (``left``, ``center`` or
            ``right``), ``width``, ``left``, ``bottom`` and, optionally,
            ``downward`` and ``rotation``. ``width``, ``left`` and ``bottom``
            are multiplied by ``mm``.
        """
        font = o['fontfamily']
        if o['bold']:
            font += ' B'
        if o['italic']:
            font += ' I'

        align_map = {'left': TA_LEFT, 'center': TA_CENTER, 'right': TA_RIGHT}
        style = ParagraphStyle(name=uuid.uuid4().hex,
                               fontName=font,
                               fontSize=float(o['fontsize']),
                               leading=float(o['fontsize']),
                               autoLeading="max",
                               textColor=Color(o['color'][0] / 255,
                                               o['color'][1] / 255,
                                               o['color'][2] / 255),
                               alignment=align_map[o['align']])
        # Missing content becomes an empty string; newlines are turned into
        # <br/> tags after the content went through conditional_escape.
        text = conditional_escape(
            self._get_text_content(op, order, o)
            or "", ).replace("\n", "<br/>\n")

        # reportlab does not support RTL, ligature-heavy scripts like Arabic. Therefore, we use ArabicReshaper
        # to resolve all ligatures and python-bidi to switch RTL texts.
        configuration = {
            'delete_harakat': True,
            'support_ligatures': False,
        }
        reshaper = ArabicReshaper(configuration=configuration)
        try:
            text = "<br/>".join(
                get_display(reshaper.reshape(line))
                for line in text.split("<br/>"))
        except Exception:
            # Best effort: on failure ``text`` keeps its un-reshaped value and
            # rendering continues. Only ``Exception`` is caught so that
            # KeyboardInterrupt/SystemExit are not swallowed here.
            logger.exception('Reshaping/Bidi fixes failed on string %r', text)

        p = Paragraph(text, style=style)
        _, h = p.wrapOn(canvas, float(o['width']) * mm, 1000 * mm)
        # p_size = p.wrap(float(o['width']) * mm, 1000 * mm)
        ad = getAscentDescent(font, float(o['fontsize']))
        canvas.saveState()
        # The ascent/descent offsets here are not really proven to be correct, they're just empirical values to get
        # reportlab render similarly to browser canvas.
        if o.get('downward', False):
            canvas.translate(float(o['left']) * mm, float(o['bottom']) * mm)
            canvas.rotate(o.get('rotation', 0) * -1)
            p.drawOn(canvas, 0, -h - ad[1] / 2)
        else:
            canvas.translate(
                float(o['left']) * mm,
                float(o['bottom']) * mm + h)
            canvas.rotate(o.get('rotation', 0) * -1)
            p.drawOn(canvas, 0, -h - ad[1])
        canvas.restoreState()

Example #3

0

Show file

    def _draw_textarea(self, canvas: Canvas, op: OrderPosition, order: Order,
                       o: dict):
        """Draw the text element described by ``o`` onto ``canvas``.

        ``o`` supplies the font (``fontfamily``, ``bold``, ``italic``,
        ``fontsize``), the ``color`` components, the ``align`` keyword and
        the ``width``/``left``/``bottom`` values, which are multiplied by
        ``mm``. The text itself comes from ``self._get_text_content``.
        """
        font = o['fontfamily']
        font += ''.join(
            suffix
            for key, suffix in (('bold', ' B'), ('italic', ' I'))
            if o[key])

        align_map = {'left': TA_LEFT, 'center': TA_CENTER, 'right': TA_RIGHT}
        style_name = uuid.uuid4().hex
        font_size = float(o['fontsize'])
        rgb = o['color']
        text_color = Color(rgb[0] / 255, rgb[1] / 255, rgb[2] / 255)
        alignment = align_map[o['align']]
        style = ParagraphStyle(name=style_name,
                               fontName=font,
                               fontSize=font_size,
                               leading=font_size,
                               autoLeading="max",
                               textColor=text_color,
                               alignment=alignment)

        # Only <br> tags are kept by bleach; every variant of the tag is then
        # rewritten to the plain "<br/>" form.
        raw_content = self._get_text_content(op, order, o) or ""
        cleaned = bleach.clean(raw_content,
                               tags=["br"],
                               attributes={},
                               styles=[],
                               strip=True)
        text = re.sub("<br[^>]*>", "<br/>", cleaned)

        # reportlab does not support RTL, ligature-heavy scripts like Arabic,
        # so each line is reshaped with ArabicReshaper and reordered with
        # python-bidi before the paragraph is built.
        reshaper = ArabicReshaper(configuration={
            'delete_harakat': True,
            'support_ligatures': False,
        })
        display_lines = []
        for segment in text.split("<br/>"):
            display_lines.append(get_display(reshaper.reshape(segment)))
        text = "<br/>".join(display_lines)

        paragraph = Paragraph(text, style=style)
        paragraph.wrapOn(canvas, float(o['width']) * mm, 1000 * mm)
        ascent_descent = getAscentDescent(font, font_size)
        x_pos = float(o['left']) * mm
        y_pos = float(o['bottom']) * mm - ascent_descent[1]
        paragraph.drawOn(canvas, x_pos, y_pos)

Example #4

0

Show file

def main():
    """
        Description: Main function

        Parses the command line, builds the word and font lists, creates the
        strings to render, hands them to
        ``FakeTextDataGenerator.generate_from_tuple`` through a process pool
        and, when ``args.name_format`` is 2, writes ``labels.txt`` into the
        output directory.
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    try:
        os.makedirs(args.output_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Creating word list
    if args.dict:
        if not os.path.isfile(args.dict):
            sys.exit("Cannot open dict")
        with open(args.dict, "r", encoding="utf8", errors="ignore") as d:
            lang_dict = [line for line in d.read().splitlines() if line]
    else:
        lang_dict = load_dict(args.language)
    with open('./trdg/dicts/en.txt', 'r', encoding='utf8',
              errors='ignore') as f:
        en_dict = [line for line in f.read().splitlines() if line]

    # Create font (path) list
    if args.font_dir:
        fonts = [
            os.path.join(args.font_dir, p) for p in os.listdir(args.font_dir)
            if os.path.splitext(p)[1] == ".ttf"
        ]
    elif args.font:
        if os.path.isfile(args.font):
            fonts = [args.font]
        else:
            sys.exit("Cannot open font")
    else:
        fonts = load_fonts(args.language)

    # Creating synthetic sentences (or word)
    strings = []

    if args.use_wikipedia:
        print('use_wikipedia')
        strings = create_strings_from_wikipedia(args.length, args.count,
                                                args.language)
    elif args.input_file != "":
        print('input_file')
        strings = create_strings_from_file(args.input_file, args.count)
    elif args.random_sequences:
        print('random_sequences')
        strings = create_strings_randomly(
            args.length,
            args.random,
            args.count,
            args.include_letters,
            args.include_numbers,
            args.include_symbols,
            args.language,
        )
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (
                args.include_letters,
                args.include_numbers,
                args.include_symbols,
        ):
            args.name_format = 2
    else:
        # Default mode: strings are built by make_my_strings from both word
        # lists. Print the mode name, like the other branches do.
        print('make_my_strings')
        strings = make_my_strings(
            args.count,
            lang_dict,
            en_dict,
        )

    if args.language == "ar":
        from arabic_reshaper import ArabicReshaper

        arabic_reshaper = ArabicReshaper()
        # Word order is reversed and every word is reshaped on its own.
        strings = [
            " ".join([arabic_reshaper.reshape(w) for w in s.split(" ")[::-1]])
            for s in strings
        ]
    if args.case == "upper":
        strings = [x.upper() for x in strings]
    if args.case == "lower":
        strings = [x.lower() for x in strings]

    string_count = len(strings)

    # The ``with`` block terminates the pool on exit, including when a worker
    # or the iteration raises.
    with Pool(args.thread_count) as pool:
        for _ in tqdm(
                pool.imap_unordered(
                    FakeTextDataGenerator.generate_from_tuple,
                    zip(
                        range(string_count),
                        strings,
                        [
                            fonts[rnd.randrange(0, len(fonts))]
                            for _ in range(0, string_count)
                        ],
                        [args.output_dir] * string_count,
                        [args.format] * string_count,
                        [args.extension] * string_count,
                        [args.skew_angle] * string_count,
                        [args.random_skew] * string_count,
                        [args.blur] * string_count,
                        [args.random_blur] * string_count,
                        [args.background] * string_count,
                        [args.distorsion] * string_count,
                        [args.distorsion_orientation] * string_count,
                        [args.handwritten] * string_count,
                        [args.name_format] * string_count,
                        [args.width] * string_count,
                        [args.alignment] * string_count,
                        [args.text_color] * string_count,
                        [args.orientation] * string_count,
                        [args.space_width] * string_count,
                        [args.character_spacing] * string_count,
                        [args.margins] * string_count,
                        [args.fit] * string_count,
                        [args.output_mask] * string_count,
                        [args.word_split] * string_count,
                        [args.image_dir] * string_count,
                    ),
                ),
                # Number of items actually submitted, which is what the
                # progress bar iterates over.
                total=string_count,
        ):
            pass

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"),
                  "w",
                  encoding="utf8") as f:
            for i, label in enumerate(strings):
                file_name = str(i) + "." + args.extension
                f.write("{}\t{}\n".format(file_name, label))

Example #5

0

Show file

    def __init__(self,
                 win,
                 text="Hello World",
                 font="",
                 pos=(0.0, 0.0),
                 depth=0,
                 rgb=None,
                 color=(1.0, 1.0, 1.0),
                 colorSpace='rgb',
                 opacity=1.0,
                 contrast=1.0,
                 units="",
                 ori=0.0,
                 height=None,
                 antialias=True,
                 bold=False,
                 italic=False,
                 alignHoriz=None,
                 alignVert=None,
                 alignText='center',
                 anchorHoriz='center',
                 anchorVert='center',
                 fontFiles=(),
                 wrapWidth=None,
                 flipHoriz=False,
                 flipVert=False,
                 languageStyle='LTR',
                 name=None,
                 autoLog=None,
                 autoDraw=False):
        """
        **Performance OBS:** in general, TextStim is slower than many other
        visual stimuli, i.e. it takes longer to change some attributes.
        In general, it's the attributes that affect the shapes of the letters:
        ``text``, ``height``, ``font``, ``bold`` etc.
        These make the next .draw() slower because that sets the text again.
        You can make the draw() quick by re-setting the text
        (``myTextStim.text = myTextStim.text``) when you've changed the
        parameters.

        In general, other attributes which merely affect the presentation of
        unchanged shapes are as fast as usual. This includes ``pos``,
        ``opacity`` etc.

        The following attribute can only be set at initialization (see
        further down for a list of attributes which can be changed after
        initialization):

        **languageStyle**
            Apply settings to correctly display content from some languages
            that are written right-to-left. Currently there are three (case-
            insensitive) values for this parameter:

            - ``'LTR'`` is the default, for typical left-to-right, Latin-style
                languages.
            - ``'RTL'`` will correctly display text in right-to-left languages
                such as Hebrew. By applying the bidirectional algorithm, it
                allows mixing portions of left-to-right content (such as numbers
                or Latin script) within the string.
            - ``'Arabic'`` applies the bidirectional algorithm but additionally
                will _reshape_ Arabic characters so they appear in the cursive,
                linked form that depends on neighbouring characters, rather than
                in their isolated form. May also be applied in other scripts,
                such as Farsi or Urdu, that use Arabic-style alphabets.

        Deprecated arguments handled here: ``alignVert`` (its value is used
        for ``anchorVert``), ``alignHoriz`` (its value is used for both
        ``alignText`` and ``anchorHoriz``) and ``rgb`` (use ``color`` and
        ``colorSpace`` instead). Each one logs a warning when supplied.

        :Parameters:

        """

        # what local vars are defined (these are the init params) for use by
        # __repr__
        # NB this must stay the first statement: dir() lists every local
        # defined so far, which at this point is exactly the parameters.
        self._initParams = dir()
        self._initParams.remove('self')
        # October 2018:
        #     In place to remove the deprecation warning for pyglet.font.Text.
        #     Temporary fix until pyglet.text.Label use is identical to
        #     pyglet.font.Text.
        warnings.filterwarnings(message='.*text.Label*', action='ignore')

        super(TextStim, self).__init__(win,
                                       units=units,
                                       name=name,
                                       autoLog=False)

        if win.blendMode == 'add':
            logging.warning("Pyglet text does not honor the Window setting "
                            "`blendMode='add'` so 'avg' will be used for the "
                            "text (but objects drawn after can be added)")
        self._needUpdate = True
        self._needVertexUpdate = True
        # use shaders if available by default, this is a good thing
        self.__dict__['antialias'] = antialias
        self.__dict__['font'] = font
        self.__dict__['bold'] = bold
        self.__dict__['italic'] = italic
        # NB just a placeholder - real value set below
        self.__dict__['text'] = ''
        self.__dict__['depth'] = depth
        self.__dict__['ori'] = ori
        self.__dict__['flipHoriz'] = flipHoriz
        self.__dict__['flipVert'] = flipVert
        self.__dict__['languageStyle'] = languageStyle
        if languageStyle.lower() == 'arabic':
            arabic_config = {
                'delete_harakat': False,  # if present, retain any diacritics
                'shift_harakat_position': True
            }  # shift by 1 to be compatible with the bidi algorithm
            self.__dict__['arabic_reshaper'] = ArabicReshaper(
                configuration=arabic_config)
        self._pygletTextObj = None
        self.pos = pos
        # deprecated attributes
        if alignVert:
            self.__dict__['alignVert'] = alignVert
            logging.warning("TextStim.alignVert is deprecated. Use the "
                            "anchorVert attribute instead")
            # for compatibility, the deprecated vertical value drives the
            # vertical anchor (this previously copied alignHoriz by mistake)
            anchorVert = alignVert
        if alignHoriz:
            self.__dict__['alignHoriz'] = alignHoriz
            logging.warning("TextStim.alignHoriz is deprecated. Use alignText "
                            "and anchorHoriz attributes instead")
            # for compatibility, alignText was historically 'left'
            alignText, anchorHoriz = alignHoriz, alignHoriz
        # alignment and anchors
        self.alignText = alignText
        self.anchorHoriz = anchorHoriz
        self.anchorVert = anchorVert

        # generate the texture and list holders
        self._listID = GL.glGenLists(1)
        # pygame text needs a surface to render to:
        if self.win.winType not in ["pyglet", "glfw"]:
            self._texID = GL.GLuint()
            GL.glGenTextures(1, ctypes.byref(self._texID))

        # Color stuff
        self.colorSpace = colorSpace
        self.color = color
        # identity test: ``!= None`` would be evaluated element-wise if an
        # array-like value were passed
        if rgb is not None:
            logging.warning(
                "Use of rgb arguments to stimuli are deprecated. Please "
                "use color and colorSpace args instead")
            self.color = Color(rgb, 'rgb')
        self.__dict__['fontFiles'] = []
        self.fontFiles = list(fontFiles)  # calls attributeSetter
        self.setHeight(height, log=False)  # calls setFont() at some point
        # calls attributeSetter without log
        setAttribute(self, 'wrapWidth', wrapWidth, log=False)
        self.opacity = opacity
        self.contrast = contrast
        # self.width and self._fontHeightPix get set with text and
        # calcSizeRendered is called
        self.setText(text, log=False)
        self._needUpdate = True

        self.autoDraw = autoDraw

        # set autoLog now that params have been initialised
        wantLog = autoLog is None and self.win.autoLog
        self.__dict__['autoLog'] = autoLog or wantLog
        if self.autoLog:
            logging.exp("Created %s = %s" % (self.name, str(self)))

Example #6

0

Show file

File: buatPdf.py Project: candragati/bot-timer-telegram

def replace_with_emoji_pdf(text, size):
    """
    Turn emoji in ``text`` into image tags usable in a Reportlab Paragraph.

    Reportlab's Paragraph doesn't accept the usual html image attributes such
    as 'class' and 'alt', so the 'class="emojione"' attribute produced by
    ``Emoji.to_image`` is swapped for explicit height/width values built from
    ``size`` and the 'alt' attribute is removed.
    """
    with_images = Emoji.to_image(text)
    dimensions = 'height=%s width=%s' % (size, size)
    sized = with_images.replace('class="emojione"', dimensions)
    alt_pattern = 'alt="' + Emoji.shortcode_regexp + '"'
    return re.sub(alt_pattern, '', sized)

# Module-level reshaper created with both 'delete_harakat' and
# 'support_ligatures' switched off.
configuration = dict(delete_harakat=False, support_ligatures=False)
reshaper = ArabicReshaper(configuration=configuration)

class PageNumCanvas(canvas.Canvas):
    """Canvas that records the state of every page in ``self.pages``.

    NOTE(review): ``draw_page_number`` is not defined in the visible part of
    this class and ``save`` as shown never calls the base-class ``save`` --
    the definition presumably continues beyond this excerpt; confirm.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base canvas and start with no recorded pages."""
        canvas.Canvas.__init__(self, *args, **kwargs)
        self.pages = []
        
    def showPage(self):
        """Record the current page instead of emitting it right away."""
        # A shallow copy of the instance attributes is kept for this page.
        self.pages.append(dict(self.__dict__))
        self._startPage()
    
    def save(self):
        """Replay every recorded page and call ``draw_page_number`` on it."""
        page_count = len(self.pages)       
        for page in self.pages:
            # Load the recorded attributes back before drawing, passing the
            # total number of recorded pages.
            self.__dict__.update(page)
            self.draw_page_number(page_count)

Example #7

0

Show file

import spacy
import matplotlib.pyplot as plt
from urduhack import normalize
from arabic_reshaper import ArabicReshaper
from bidi.algorithm import get_display
from urduhack import stop_words,normalization

d = "F:\\Current Semester\\FYP\\OASRU_CLEN\\OASRU\\ResultScripts"

configuration = {
    'delete_harakat': False,
    'support_ligatures': True,
    'RIAL SIGN': True,  # Replace ر ي ا ل with ﷼
}

reshaper = ArabicReshaper(configuration=configuration)

# Run the sample word through the urduhack normalisation steps.
text_to_be_reshaped = "ترجمان"
text_to_be_reshaped = normalize(text_to_be_reshaped)
text_to_be_reshaped = normalization.normalize_characters(text_to_be_reshaped)
text_to_be_reshaped = normalization.normalize_combine_characters(text_to_be_reshaped)
text_to_be_reshaped = normalization.punctuations_space(text_to_be_reshaped)

# Tokenise with a blank Urdu pipeline.
nlp = spacy.blank("ur")
doc = nlp(text_to_be_reshaped)
text = []

for each in doc:
    # Membership test against the stop-word collection itself. The previous
    # check looked for the token as a substring of str(STOP_WORDS), which
    # also matches fragments of longer stop words and of the repr.
    # NOTE(review): assumes STOP_WORDS is a collection of strings -- confirm.
    if str(each) not in stop_words.STOP_WORDS:
        text.append(str(each))
# The earlier reshape of the full text was dropped: its result was replaced
# by this empty string before ever being read.
reshaped_text = ""

Example #8

0

Show file

import arabic_reshaper

# Reshape a sample sentence with the module-level helper and show the result.
text_to_be_reshaped = 'اللغة العربية رائعة'
reshaped_text = arabic_reshaper.reshape(text_to_be_reshaped)
print(reshaped_text)

from arabic_reshaper import ArabicReshaper

# Second run: an explicitly configured reshaper instance.
configuration = {
    'delete_harakat': False,
    'support_ligatures': True,
    'RIAL SIGN': True,  # Replace ر ي ا ل with ﷼
}
reshaper = ArabicReshaper(configuration=configuration)
l1, l2, l3, l4 = 'ل', 'ا', 'ر', 'ي'
text_to_be_reshaped = 'ب ﺭ ﻱ ﺕ' # had to split the string for display
# The spaces are removed again before the text is reshaped.
reshaped_text = reshaper.reshape(''.join(text_to_be_reshaped.split(' ')))
print(reshaped_text)

Example #9

0

Show file

File: pdf.py Project: Janfred/pretix

    return v


def get_first_scan(op: OrderPosition):
    """Return a formatted check-in timestamp for ``op``, or ``""`` if none.

    The last element of ``op.checkins.all()`` is used, converted to the
    timezone of the order's event and formatted with
    ``SHORT_DATETIME_FORMAT``.
    NOTE(review): presumably the check-ins are ordered newest first, which
    would make the last element the first scan -- confirm against the model.
    """
    scans = list(op.checkins.all())
    if not scans:
        return ""

    # Reuse the list fetched above instead of evaluating the check-ins again.
    return date_format(
        scans[-1].datetime.astimezone(op.order.event.timezone),
        "SHORT_DATETIME_FORMAT")


# Shared reshaper, wrapped in SimpleLazyObject with a factory lambda rather
# than being constructed directly at this point.
reshaper = SimpleLazyObject(
    lambda: ArabicReshaper(
        configuration=dict(delete_harakat=True, support_ligatures=False)))


class Renderer:
    """Holds the layout, variables, images and background for one event."""

    def __init__(self, event, layout, background_file):
        """Store the inputs and read the background file, if one is given.

        :param event: Passed to ``get_variables`` and ``get_images`` and kept
            as ``self.event``.
        :param layout: Kept as ``self.layout``.
        :param background_file: File-like object read into ``self.bg_bytes``
            and parsed into ``self.bg_pdf``; when falsy, both attributes are
            set to ``None``.
        """
        self.layout = layout
        self.background_file = background_file
        self.variables = get_variables(event)
        self.images = get_images(event)
        self.event = event
        if self.background_file:
            self.bg_bytes = self.background_file.read()
            self.bg_pdf = PdfFileReader(BytesIO(self.bg_bytes), strict=False)
        else:
            self.bg_bytes = None
            # Always define bg_pdf so later access cannot raise
            # AttributeError when there is no background.
            self.bg_pdf = None

Example #10

0

Show file

File: MyWordCloudGenerator.py Project: mh13159/tarjumaan

def MyWordCloudGen(imgpath, scriptpath, os):
    """Render a word cloud from a script file and return the image path.

    The entries of ``scriptpath`` are sorted by modification time and the
    entry at index 1 is read, normalised, tokenised, stripped of stop words,
    reshaped and reordered for display, then rendered with ``WordCloud`` and
    saved as ``image.png`` under ``imgpath``.

    NOTE(review): index 1 is the second entry after sorting, so an
    ``IndexError`` is raised when ``scriptpath`` holds fewer than two
    entries -- confirm that this index is intended.

    :param imgpath: Directory prefix for the output; joined with a
        backslash, as in the original code.
    :param scriptpath: Directory containing the script files.
    :param os: Object used for ``listdir``/``stat``/``path.join`` until the
        local ``import os`` below rebinds the name; presumably the ``os``
        module itself.
    :returns: ``os.path.relpath`` of the saved image.
    """
    # d = "F:\\Current Semester\\FYP\\OASRU_CLEN\\OASRU\\ResultScripts"
    configuration = {
        'delete_harakat': False,
        'support_ligatures': True,
        'RIAL SIGN': True,  # Replace ر ي ا ل with ﷼
    }
    reshaper = ArabicReshaper(configuration=configuration)
    scripts = os.listdir(scriptpath)
    scripts.sort(key=lambda x: os.stat(os.path.join(scriptpath, x)).st_mtime)
    print((scripts))

    # Read inside a ``with`` block so the file handle is closed.
    with open(path.join(scriptpath, scripts[1]),
              encoding="UTF-8") as script_file:
        text_to_be_reshaped = script_file.read()
    print(text_to_be_reshaped)
    text_to_be_reshaped = normalize(text_to_be_reshaped)
    text_to_be_reshaped = normalization.normalize_characters(
        text_to_be_reshaped)
    text_to_be_reshaped = normalization.normalize_combine_characters(
        text_to_be_reshaped)
    text_to_be_reshaped = normalization.punctuations_space(text_to_be_reshaped)
    nlp = spacy.blank("ur")
    doc = nlp(text_to_be_reshaped)

    # Membership test against the stop-word collection itself; the previous
    # check searched for the token as a substring of str(STOP_WORDS).
    text = [
        str(each) for each in doc
        if str(each) not in stop_words.STOP_WORDS
    ]

    # Every token is prefixed with a space, as before; only the joined text
    # is reshaped (an earlier reshape of the unfiltered text was discarded).
    reshaped_text = "".join(" " + each for each in text)
    reshaped_text = reshaper.reshape(reshaped_text)

    from bidi.algorithm import get_display
    bidi_text = get_display(reshaped_text)
    # From here on ``os`` is the real module, whatever was passed in.
    import os
    plt.figure(figsize=(20, 15), dpi=200)
    wordcloud = WordCloud(os.getcwd() + "\\Urdu_fonts\\" +
                          "DecoType Thuluth.ttf",
                          width=2000,
                          height=1500,
                          include_numbers=True,
                          stopwords=stop_words.STOP_WORDS,
                          min_font_size=30,
                          background_color="black",
                          margin=0,
                          max_words=200).generate(bidi_text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    img = imgpath + "\\" + "image.png"
    plt.savefig(img, format="png")
    plt.show()

    print(img)
    img = os.path.relpath(img)
    print("Relative Path", img)
    return img