Example #1
    def __init__(self, model_name, keep_emojis=False, remove_html_markup=True, replace_urls_emails_mentions=True):
        """
        model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.
            - :obj:`"bert-base-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj: `bool`): don't remove emojis while preprocessing. Defaults to False

        remove_html_markup(:obj: `bool`): Whether to remove html artifacts, should be set to False when preprocessing TyDi QA. Defaults to True

        replace_urls_emails_mentions(:obj: `bool`): Whether to replace emails, urls and mentions with special tokens. Defaults to True
        """
        model_name = model_name.replace("aubmindlab/","")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        if self.model_name in SEGMENTED_MODELS:
            logging.info("Selected Model requires pre-segmentation, Initializing FarasaSegmenter")
            try:
                from farasa.segmenter import FarasaSegmenter
                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except ModuleNotFoundError:
                logging.warning("farasapy is not installed, you won't be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy")
        else:
            logging.info("Selected Model doesn't require pre-segmentation, skipping FarasaSegmenter initialization")

        self.keep_emojis = keep_emojis
        if self.keep_emojis:
            import emoji
            self.emoji = emoji
            if self.model_name in SEGMENTED_MODELS:
                logging.warning("Keeping tweets with Farasa Segmentation is 10 times slower")

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
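A minimal usage sketch for the constructor above, assuming it belongs to the ArabertPreprocessor class shown in the later examples and that a preprocess() method is available:

# Hedged usage sketch; class name, import path and sample text follow the fuller examples below.
from preprocess import ArabertPreprocessor

arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2", keep_emojis=False)
print(arabert_prep.preprocess("ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"))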
Example #2
    def __init__(
        self,
        unk_token="<UNK>",
        pad_token="<PAD>",
        segment=False,
        vocab_size=10000,
        segm_token="+",
        clean=False,
        normalize=False,
    ):
        """Constructor

        Args:
            unk_token (str, optional): reserved token for unknowns. Defaults to "<UNK>".
            pad_token (str, optional): reserved token for padding. Defaults to "<PAD>".
            segment (bool, optional): segment using farasa. Defaults to False.
            vocab_size (int, optional): maximum vocabulary size. Defaults to 10000.
            segm_token (str, optional): reserved token for segmentation. Defaults to '+'.
            clean (bool, optional): remove tashkeel, english and special chars. Defaults to False.
            normalize (bool, optional): normalize chars. Defaults to False.
        """
        self.segm_token = segm_token
        self.vocab_size = vocab_size
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.segment = segment
        self.clean = clean
        self.normalize = normalize
        self.vocab = None  # to be filled by child classes

        # relative path
        self.rel_path = os.path.dirname(__file__)
        norm_dict_path = os.path.join(
            self.rel_path, "dictionaries/normalization_dictionary.pl")
        cach_dict_path = os.path.join(self.rel_path, "dictionaries/cached.pl")
        self.norm_dict = pickle.load(open(norm_dict_path, "rb"))
        self.cached = pickle.load(open(cach_dict_path, "rb"))

        if self.segment:
            print("Initializing Farasa")
            # suppress farasa stdout
            # WARNING: this is LINUX ONLY command!
            old_stdout = sys.stdout
            sys.stdout = open(os.devnull, "w")
            self.segmenter = FarasaSegmenter(interactive=True)
            # resume farasa stdout
            sys.stdout = old_stdout
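A cross-platform way to silence Farasa's startup output in the constructor above is contextlib.redirect_stdout; a short sketch, assuming FarasaSegmenter only writes to stdout:

# Hedged alternative to the manual sys.stdout swap used above; stdout is restored automatically.
import contextlib
import os
from farasa.segmenter import FarasaSegmenter

with open(os.devnull, "w") as devnull, contextlib.redirect_stdout(devnull):
    segmenter = FarasaSegmenter(interactive=True)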
Example #3
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.do_farasa_tokenization:
        if FLAGS.use_farasapy:
            from farasa.segmenter import FarasaSegmenter

            farasa_segmenter = FarasaSegmenter(interactive=True)
        else:
            from py4j.java_gateway import JavaGateway

            gateway = JavaGateway.launch_gateway(
                classpath=FLAGS.path_to_farasa)
            farasa_segmenter = gateway.jvm.com.qcri.farasa.segmenter.Farasa()
    else:
        farasa_segmenter = None

    with tf.gfile.Open(FLAGS.input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph["context"] = clean_preprocess(
                paragraph["context"],
                do_farasa_tokenization=FLAGS.do_farasa_tokenization,
                farasa=farasa_segmenter,
                use_farasapy=FLAGS.use_farasapy,
            )
            for qas in paragraph["qas"]:
                qas["question"] = clean_preprocess(
                    qas["question"],
                    do_farasa_tokenization=FLAGS.do_farasa_tokenization,
                    farasa=farasa_segmenter,
                    use_farasapy=FLAGS.use_farasapy,
                )
                qas["answers"][0]["text"] = clean_preprocess(
                    qas["answers"][0]["text"],
                    do_farasa_tokenization=FLAGS.do_farasa_tokenization,
                    farasa=farasa_segmenter,
                    use_farasapy=FLAGS.use_farasapy,
                )
                qas["answers"][0]["answer_start"] = paragraph["context"].find(
                    qas["answers"][0]["text"])
                if qas["answers"][0]["answer_start"] == -1:
                    tf.logging.warning(
                        "Could not find answer for question '%s' : '%s' vs. '%s'",
                        qas["id"],
                        paragraph["context"],
                        qas["answers"][0]["text"],
                    )

    input_data = {
        "data": input_data,
        "version": "1.1",
        "preprocess": "True",
    }
    with tf.gfile.Open(FLAGS.output_file, "w") as writer:
        json.dump(input_data, writer)
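For reference, a sketch of the minimal SQuAD/TyDi-style structure the script above expects in FLAGS.input_file; the field names are taken from how main() indexes the JSON, the values are placeholders:

# Hypothetical input document; only the keys accessed by main() are shown.
sample_input = {
    "data": [
        {
            "paragraphs": [
                {
                    "context": "نص الفقرة هنا",
                    "qas": [
                        {
                            "id": "q1",
                            "question": "ما هو السؤال؟",
                            "answers": [{"text": "نص الفقرة", "answer_start": 0}],
                        }
                    ],
                }
            ]
        }
    ]
}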
Example #4
    def __init__(self):
        self.langs = ["zh -> ar", "ar -> zh", "ar -> fr", "fr -> ar"]
        self.model_ar2zh = TransformerModel.from_pretrained(
            "checkpoints-ar2zh",
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path='data-bin',
            bpe='subword_nmt',
            bpe_codes='data-bin/code')

        self.model_zh2ar = TransformerModel.from_pretrained(
            "checkpoints-zh2ar",
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path='data-bin',
            bpe='subword_nmt',
            bpe_codes='data-bin/code')

        self.segmenter = FarasaSegmenter(interactive=True)
        self.models = {"ar2zh": self.model_ar2zh, "zh2ar": self.model_zh2ar}
Example #5
class Translator():
    def __init__(self):
        self.langs = ["zh -> ar", "ar -> zh", "ar -> fr", "fr -> ar"]
        self.model_ar2zh = TransformerModel.from_pretrained(
            "checkpoints-ar2zh",
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path='data-bin',
            bpe='subword_nmt',
            bpe_codes='data-bin/code')

        self.model_zh2ar = TransformerModel.from_pretrained(
            "checkpoints-zh2ar",
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path='data-bin',
            bpe='subword_nmt',
            bpe_codes='data-bin/code')

        self.segmenter = FarasaSegmenter(interactive=True)
        self.models = {"ar2zh": self.model_ar2zh, "zh2ar": self.model_zh2ar}

    def supported_languages(self):
        return self.langs

    def translate(self, src, tgt, text):
        # chinese is segmented arabic is not
        src2trg = src + "2" + tgt
        model = self.models[src2trg]
        model.cuda()
        if src == "ar":
            text = self.segment_ar(text)
            text = text.replace("+", "+ ")
        output = model.translate(text)
        if tgt == "ar":
            output = output.replace("+ ", "")
        return output

    def segment_ar(self, sent: str):
        segmented = self.segmenter.segment(sent)
        toks = segmented.split(" ")
        ret_sent = ""
        for tok in toks:
            segments = re.split("(?<=[+])", tok)
            for i in range(len(segments)):
                ret_sent = ret_sent + segments[i] + " "
        ret_sent = ret_sent.strip()

        return ret_sent
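Example #6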
    def transform(self, sentences_list, extract_and_paste_emojies=False):
        """
        transforming the data and applying all pre-processing steps over it. If 'fit' is required, it will yield an
        error in case the data has not been fitted yet

        Parameters
        ----------
        :param sentences_list: list (of arabic sentences)
            list of sentences to apply the function on. Each sentence is treated independently
        :param extract_and_paste_emojies: boolean. Default: False
            whether to handle emojis in a special way. Currently we only know how to handle emojis in a very specific
            way (extract them and then paste them at the end of the sentence - not ideal)
            TODO: handle emojis in a better way (do not convert them to ?? and "leave" them as is in the sentence)

        :return: list
            list of transformed arabic sentences. Same input list, but after the transform function has been applied
            over all of them

        """
        farasa_segmenter = FarasaSegmenter(interactive=True)
        new_sentences_list = list()
        # looping over each sentence
        for cur_text in sentences_list:
            # in case we decided to use the farasa preprocess
            if self.use_default_farsa_preprocess:
                preprocessed_text = preprocess(cur_text,
                                               do_farasa_tokenization=True,
                                               farasa=farasa_segmenter,
                                               use_farasapy=True)
                preprocessed_text_as_list = preprocessed_text.split(" ")
                # removal of punctuation (e.g., '?', '!?!')
                preprocessed_text_as_list = [
                    cur_word for cur_word in preprocessed_text_as_list
                    if not all(j in string.punctuation for j in cur_word)
                ]
                if extract_and_paste_emojies:
                    emojies_found = self.extract_emojis(text=cur_text)
                    preprocessed_text_as_list.extend(emojies_found)
                new_sentences_list.append(' '.join(preprocessed_text_as_list))
            # currently not doing anything in such case, only supports the default case
            else:
                new_sentences_list.append(cur_text)
        return new_sentences_list
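Example #7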
#%%
import pandas as pd
from sklearn.model_selection import train_test_split
from preprocess_arabert import preprocess
from tqdm import tqdm

tqdm.pandas()
import arabert
import sys

sys.path.append("arabert")
from arabert import modeling, optimization, tokenization
from arabert.run_classifier import input_fn_builder, model_fn_builder
from farasa.segmenter import FarasaSegmenter

farasa_segmenter = FarasaSegmenter(interactive=True)
# gateway = JavaGateway.launch_gateway(classpath='./PATH_TO_FARASA/FarasaSegmenterJar.jar')
# farasa = gateway.jvm.com.qcri.farasa.segmenter.Farasa()


class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
        train_InputExamples=None,
        test_InputExamples=None,
        train_features=None,
        test_features=None,
Example #8
# https://r12a.github.io/scripts/tutorial/summaries/arabic
sample =\
'''
يُشار إلى أن اللغة العربية يتحدثها أكثر من 422 مليون نسمة ويتوزع متحدثوها في المنطقة المعروفة باسم الوطن العربي بالإضافة إلى العديد من المناطق الأخرى المجاورة مثل الأهواز وتركيا وتشاد والسنغال وإريتريا وغيرها.     وهي اللغة الرابعة من لغات منظمة الأمم المتحدة الرسمية الست.
'''
'''
---------------------
non-interactive mode
---------------------
'''
print("original sample:", sample)
print('----------------------------------------')
print("Farasa features, noninteractive mode.")
print('----------------------------------------')
segmenter = FarasaSegmenter()
segmented = segmenter.segment(sample)
print("sample segmented:", segmented)
print("----------------------------------------------")

stemmer = FarasaStemmer()
stemmed = stemmer.stem(sample)
print("sample stemmed:", stemmed)
print("----------------------------------------------")

pos_tagger = FarasaPOSTagger()
pos_tagged = pos_tagger.tag(sample)
print("sample POS Tagged", pos_tagged)
print("----------------------------------------------")

named_entity_recognizer = FarasaNamedEntityRecognizer()
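The demo above runs Farasa in non-interactive mode; the same classes also accept interactive=True (as used in the other examples), which keeps the Farasa process alive between calls and is typically much faster for repeated small inputs. A short sketch on the same sample:

# Hedged sketch of interactive mode; same segment() API as above.
segmenter_interactive = FarasaSegmenter(interactive=True)
print("sample segmented (interactive):", segmenter_interactive.segment(sample))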
Example #9
class BaseTokenizer:
    """
    Base Tokenizer that implements the basic functionalities of a tokenizer
    """
    def __init__(
        self,
        unk_token="<UNK>",
        pad_token="<PAD>",
        segment=False,
        vocab_size=10000,
        segm_token="+",
        clean=False,
        normalize=False,
    ):
        """Constructor

        Args:
            unk_token (str, optional): reserved token for unknowns. Defaults to "<UNK>".
            pad_token (str, optional): reserved token for padding. Defaults to "<PAD>".
            segment (bool, optional): segment using farasa. Defaults to False.
            vocab_size (int, optional): maximum vocabulary size. Defaults to 10000.
            segm_token (str, optional): reserved token for segmentation. Defaults to '+'.
            clean (bool, optional): remove tashkeel, english and special chars. Defaults to False.
            normalize (bool, optional): normalize chars. Defaults to False.
        """
        self.segm_token = segm_token
        self.vocab_size = vocab_size
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.segment = segment
        self.clean = clean
        self.normalize = normalize

        # relative path
        self.rel_path = os.path.dirname(__file__)
        norm_dict_path = os.path.join(
            self.rel_path, "dictionaries/normalization_dictionary.pl")
        cach_dict_path = os.path.join(self.rel_path, "dictionaries/cached.pl")
        self.norm_dict = pickle.load(open(norm_dict_path, "rb"))
        self.cached = pickle.load(open(cach_dict_path, "rb"))

        if self.segment:
            print("Initializing Farasa")
            # suppress farasa stdout
            # WARNING: this is LINUX ONLY command!
            old_stdout = sys.stdout
            sys.stdout = open(os.devnull, "w")
            self.segmenter = FarasaSegmenter(interactive=True)
            # resume farasa stdout
            sys.stdout = old_stdout

    def process_data(self, file_path):
        """ 
        Read, segment, clean, normalize and split

        Args:
            file_path (str): the directory of the data to read
        
        """
        with open(file_path, "r") as f:
            print("Reading the data ...")
            self.corpus = f.read()

        if self.segment:
            print("Segmenting the data ...")
            self.corpus = self.segmenter.segment(self.corpus)
            self.corpus = re.sub(r"[+]", self.segm_token, self.corpus)

        if self.clean:
            print("Cleaning the data ...")
            self.corpus = clean_data(self.corpus)

        if self.normalize:
            print("Normalizing the data ...")
            self.corpus = normalize_data(self.corpus, self.norm_dict)

        Path("data/raw").mkdir(parents=True, exist_ok=True)
        # self.train_text, self.valid_text, self.test_text = self._split_corpus()
        self._write_data("data/raw/train.txt", self.corpus)
        # self._write_data("data/raw/valid.txt", self.valid_text)
        # self._write_data("data/raw/test.txt", self.test_text)
        # del self.train_text, self.valid_text, self.test_text
        del self.corpus

    def _get_tokens_frequency_quickly(self, file_path):
        """
        Get the tokens frequency quickly using memory mapping

        Args:
            file_path (str): the directory of the data to read
        
        Returns:
            Dict: frequency based dictionary   
        """
        encoding = "utf8"
        with open(file_path, "r", encoding=encoding, errors="ignore") as f:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
                m.read(0)
                i = 0
                size_to_read = int(1e9)
                freq = Counter([])
                pbar = tqdm(total=int(m.size() / size_to_read))
                while i < m.size():
                    cur_txt = ""
                    data = m.read(size_to_read)
                    i += size_to_read
                    try:
                        cur_txt = data.decode(encoding)
                    except UnicodeDecodeError:
                        cur_txt = (data + m.read(1)).decode(encoding)
                        i += 1
                    freq.update(cur_txt.split(" "))
                    pbar.update(1)
        return freq

    def _write_data(self, path, data):
        """
        Write the string data to a path

        Args:
            file_path (str): the directory of the data to read
        
        """
        # TOCHECK: I think this code will break if the path does not exist.
        open(path, "w").write(data)

    def _split_corpus(self):
        """
        Split the data into train, valid and test

        Returns:
            Tuple: train, valid, test
        """
        split_length = int(len(self.corpus) * 0.8)
        trainval_text, test_text = (
            self.corpus[:split_length],
            self.corpus[split_length:],
        )
        split_length = int(len(trainval_text) * 0.8)
        train_text, val_text = (
            trainval_text[:split_length],
            trainval_text[split_length:],
        )
        return train_text, val_text, test_text

    def _get_tokens_frequency(self, file_path):
        """
        Get tokens frequency using a dictionary

        Args:
            file_path (str): file path to read
        Returns:
            dict : dict containing frequency
        """
        text = open(file_path, "r").read()
        tokens_frequency = defaultdict(int)
        for word in text.split(" "):
            tokens_frequency[word] += 1
        return dict(tokens_frequency)

    def _split_word(self, word, number_of_subwords):
        """Split a word into a specific number of sub-words

        Args:
            word (str): word input
            number_of_subwords (int): number of subtokens to generate from the word 
        
        Returns:
            list: list of subwords 
        """
        assert number_of_subwords > 0

        def _split(_word, _number_of_subwords):
            groups = []
            if _number_of_subwords == 1:
                groups.append(["##" + _word])
            else:
                for i in range(1, len(_word), 1):
                    groups.extend(
                        ["##" + _word[:i], *group]
                        for group in _split(_word[i:], _number_of_subwords - 1)
                        if len(group) == _number_of_subwords - 1)
            return groups

        groups_of_subwords = _split(word, number_of_subwords)
        out_groups = []
        for group in groups_of_subwords:
            group[0] = group[0].replace("##", "")
            out_groups.append(group)
        return out_groups

    def _split_word_cached(self, word, number_of_subwords):
        """Faster version of word splitting

        Args:
            word (word): word to be split
            number_of_subwords (int): number of subwords to split the word to

        Returns:
            list: subwords
        """
        if number_of_subwords == 1:
            return [[word]]
        n = len(word) - 1
        all_binaries = self.cached[n, number_of_subwords - 1]
        return [split_on_binary(word, binary) for binary in all_binaries]

    def _tokenize_from_dict(self, text, freq_dict, cache=False):
        """Tokenize using the frequency dictionary 

        Args:
            text (str): input string

        Returns:
            list: generated tokens
        """
        assert freq_dict
        tokens = []
        output_tokens = []
        for word in text.split():
            if word in freq_dict:
                output_tokens.append(word)
            else:
                groups_of_valid_subwords = []
                for i in range(2, len(word) + 1, 1):
                    if cache:
                        groups_of_subwords = self._split_word_cached(word, i)
                    else:
                        groups_of_subwords = self._split_word(word, i)

                    # filter out groups
                    groups_of_valid_subwords = list(
                        filter(
                            lambda group: all(subword in freq_dict.keys()
                                              for subword in group),
                            groups_of_subwords,
                        ))
                    if groups_of_valid_subwords:
                        break
                if len(groups_of_valid_subwords) == 0:
                    output_tokens.append(self.unk_token)
                else:
                    sorted_groups_of_valid_subwords = sorted(
                        groups_of_valid_subwords,
                        key=lambda group: sum(freq_dict[subword]
                                              for subword in group),
                    )
                    tokens = sorted_groups_of_valid_subwords[-1]
                    for token in tokens:
                        output_tokens.append(str(token))
        return output_tokens

    def _truncate_dict(self, freq_dict):
        """Truncate a frequency dictionary and add reserved tokens

        Args:
            freq_dict (dict): frequency dictionary

        Returns:
            dict: truncated dictionary based on the vocab size
        """
        sorted_tokens_frequency = {
            k: v
            for k, v in sorted(
                freq_dict.items(), key=lambda x: x[1], reverse=True)
        }

        limited_tokens_frequency = dict()
        limited_tokens_frequency[self.unk_token] = -1
        limited_tokens_frequency[self.pad_token] = -1
        limited_tokens_frequency.update({
            k: v
            for k, v in list(sorted_tokens_frequency.items())[:self.vocab_size]
        })
        return limited_tokens_frequency

    def encode(self, text):
        """
        Convert text to ids 
        """
        raise NotImplementedError

    def decode(self, encoded):
        """
        Convert ids to string
        """
        raise NotImplementedError

    def tokenize(self, text):
        """
        Convert text to tokens
        """
        raise NotImplementedError

    def detokenize(self, tokens):
        """
        Convert tokens to text
        """
        raise NotImplementedError

    def encode_and_save(self):
        """
        Encode all the files then save as numpy
        """
        Path("data/encoded").mkdir(parents=True, exist_ok=True)
        for file_path in os.listdir("data/raw/"):
            ids = self.encode(open(f"data/raw/{file_path}", "r").read())
            np.save(f"data/encoded/{file_path[:-4]}.npy", ids)

    def encode_sentences(self, sentences, max_length=20):
        """
        Encode a list of sentences using the trained model

        Args:
            sentences (list): list of sentences
            max_length (int, optional): specify the max length of encodings. Defaults to 20.

        Returns:
            [np.array]: numpy array of encodings
        """
        encodings = []
        for sent in sentences:
            tokens = self.tokenize(sent)
            encoded = []
            for i in range(max_length):
                if i < len(tokens):
                    current_token = tokens[i]
                else:
                    current_token = self.pad_token
                encoded.append(self._tokens_list().index(current_token))
            encodings.append(encoded)
        return np.array(encodings)
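To make the subword search in _split_word above concrete, here is a standalone sketch of the same enumeration logic, extracted so it runs without the class's pickled dictionaries:

# Hedged re-implementation of BaseTokenizer._split_word, for illustration only.
def split_word(word, number_of_subwords):
    if number_of_subwords == 1:
        return [["##" + word]]
    groups = []
    for i in range(1, len(word)):
        for group in split_word(word[i:], number_of_subwords - 1):
            if len(group) == number_of_subwords - 1:
                groups.append(["##" + word[:i], *group])
    return groups

# split_word("abc", 2) -> [['##a', '##bc'], ['##ab', '##c']]
# BaseTokenizer then strips the leading '##' from the first piece of each group and, in
# _tokenize_from_dict, keeps the group whose subwords all exist in the frequency dictionary
# with the highest total frequency.
print(split_word("abc", 2))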
Example #10
    def __init__(
        self,
        model_name: str,
        keep_emojis: bool = False,
        remove_html_markup: bool = True,
        replace_urls_emails_mentions: bool = True,
        strip_tashkeel: bool = True,
        strip_tatweel: bool = True,
        insert_white_spaces: bool = True,
        remove_non_digit_repetition: bool = True,
        replace_slash_with_dash: bool = None,
        map_hindi_numbers_to_arabic: bool = None,
        apply_farasa_segmentation: bool = None,
    ):

        model_name = model_name.replace("Ebtihal/", "").replace("ebtihalaziz/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
            )
            self.model_name = "Ebtihal/AraDiaBERTo_V2"
        else:
            self.model_name = model_name

        if apply_farasa_segmentation is None:
            if self.model_name in SEGMENTED_MODELS:
                self.apply_farasa_segmentation = True
            else:
                self.apply_farasa_segmentation = False
        else:
            if apply_farasa_segmentation == False and self.model_name in SEGMENTED_MODELS:
                logging.warning(
                    "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
                )

            self.apply_farasa_segmentation = apply_farasa_segmentation

        if self.apply_farasa_segmentation:
            try:
                from farasa.segmenter import FarasaSegmenter

                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except ModuleNotFoundError:
                logging.error(
                    "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )

        self.keep_emojis = keep_emojis
        if self.keep_emojis:
            import emoji

            self.emoji = emoji
            if self.apply_farasa_segmentation:
                logging.warning(
                    "Keeping tweets with Farasa Segmentation is 10 times slower"
                )

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_non_digit_repetition = remove_non_digit_repetition

        if replace_slash_with_dash is None:
            if self.model_name in SECOND_GEN_MODELS:
                self.replace_slash_with_dash = True
            else:
                self.replace_slash_with_dash = False
        else:
            self.replace_slash_with_dash = replace_slash_with_dash

        if map_hindi_numbers_to_arabic is None:
            if self.model_name in SECOND_GEN_MODELS:
                self.map_hindi_numbers_to_arabic = True
            else:
                self.map_hindi_numbers_to_arabic = False
        else:
            self.map_hindi_numbers_to_arabic = map_hindi_numbers_to_arabic
Example #11
#%%
from farasa.segmenter import FarasaSegmenter
from tqdm import tqdm
import re
import editdistance
import pyarabic.araby as araby
from desegmentors import desegmentword
#%%
fs = FarasaSegmenter(interactive=True)

# %%
with open('data/100ksentences.csv', 'r', encoding='utf-8') as f:
    text = f.read()

# %%
all_non_arabic_characters = r"[^\u0621-\u063A\u0641-\u064A ]+"


def normalize_alef(s):
    s = s.replace(araby.ALEF_HAMZA_ABOVE, araby.ALEF)
    s = s.replace(araby.ALEF_HAMZA_BELOW, araby.ALEF)
    s = s.replace(araby.ALEF_MADDA, araby.ALEF)
    return s


# %%
#Clean and get original and segmented words
all_words = []
for line in tqdm(text.split('\n')):
    cleaned_line = normalize_alef(line)
    cleaned_line = re.sub(all_non_arabic_characters, "", cleaned_line)
Example #12
    def __init__(
        self,
        model_name: str,
        remove_html_markup: bool = True,
        replace_urls_emails_mentions: bool = True,
        strip_tashkeel: bool = True,
        strip_tatweel: bool = True,
        insert_white_spaces: bool = True,
        remove_non_digit_repetition: bool = True,
        keep_emojis: bool = None,
        replace_slash_with_dash: bool = None,
        map_hindi_numbers_to_arabic: bool = None,
        apply_farasa_segmentation: bool = None,
    ):

        model_name = model_name.replace("aubmindlab/",
                                        "").replace("wissamantoun/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        if apply_farasa_segmentation is None:
            if self.model_name in SEGMENTED_MODELS:
                self.apply_farasa_segmentation = True
            else:
                self.apply_farasa_segmentation = False
        else:
            if (apply_farasa_segmentation == False
                    and self.model_name in SEGMENTED_MODELS):
                logging.warning(
                    "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
                )

            self.apply_farasa_segmentation = apply_farasa_segmentation

        if self.apply_farasa_segmentation:
            try:
                from farasa.segmenter import FarasaSegmenter

                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except ModuleNotFoundError:
                logging.error(
                    "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )

        if keep_emojis is None:
            if self.model_name in TWEET_MODELS:
                self.keep_emojis = True
            else:
                self.keep_emojis = False
        else:
            if keep_emojis == False and self.model_name in TWEET_MODELS:
                logging.warning(
                    "The selected model_name is trained on emojis, but keep_emojis was set to False!"
                )
            self.keep_emojis = keep_emojis

        if self.keep_emojis:
            import emoji

            self.emoji = emoji
            if self.apply_farasa_segmentation:
                logging.warning(
                    "Keeping tweets with Farasa Segmentation is 10 times slower"
                )
            emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
            self.REJECTED_CHARS_REGEX = "[^%s%s]" % (
                CHARS_REGEX
                if self.model_name in SECOND_GEN_MODELS else CHARS_REGEXV2,
                emoji_regex,
            )
        else:
            self.REJECTED_CHARS_REGEX = (REJECTED_CHARS_REGEX if
                                         self.model_name in SECOND_GEN_MODELS
                                         else REJECTED_CHARS_REGEXV2)

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_non_digit_repetition = remove_non_digit_repetition

        if replace_slash_with_dash is None:
            if self.model_name in SECOND_GEN_MODELS:
                self.replace_slash_with_dash = True
            else:
                self.replace_slash_with_dash = False
        else:
            self.replace_slash_with_dash = replace_slash_with_dash

        if map_hindi_numbers_to_arabic is None:
            if self.model_name in SECOND_GEN_MODELS:
                self.map_hindi_numbers_to_arabic = True
            else:
                self.map_hindi_numbers_to_arabic = False
        else:
            self.map_hindi_numbers_to_arabic = map_hindi_numbers_to_arabic
Example #13
class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
    It can also unprocess the text output of the generated text

    Args:

        model_name (:obj:`str`): model name from the HuggingFace Models page without
        the aubmindlab tag. Will default to a base Arabic preprocessor if model name was not found.
        Current accepted models are:

            - "bert-base-arabertv01"
            - "bert-base-arabert"
            - "bert-base-arabertv02"
            - "bert-base-arabertv2"
            - "bert-base-arabertv02-twitter"
            - "bert-large-arabertv02"
            - "bert-large-arabertv2"
            - "bert-large-arabertv02-twitter"
            - "araelectra-base"
            - "araelectra-base-discriminator"
            - "araelectra-base-generator"
            - "araelectra-base-artydiqa"
            - "aragpt2-base"
            - "aragpt2-medium"
            - "aragpt2-large"
            - "aragpt2-mega"


        remove_html_markup(:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to remove html artifacts,
        should be set to False when preprocessing TyDi QA.

        replace_urls_emails_mentions(:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to replace email urls
        and mentions by special tokens.

        strip_tashkeel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA,
        KASRA, SUKUN, SHADDA).

        strip_tatweel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove tatweel '\\u0640'.

        insert_white_spaces(:obj:`bool`, `optional`, defaults to :obj:`True`): insert whitespace before and after all non Arabic digits
        or English digits or Arabic and English Alphabet or the 2 brackets, then inserts whitespace
        between words and numbers or numbers and words.

        remove_non_digit_repetition(:obj:`bool`, `optional`, defaults to :obj:`True`): replace repetitions of more than 2 non-digit characters with
        2 of that character.

        replace_slash_with_dash(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in AraBERTv02,
        AraELECTRA and AraGPT2.
        Set to False to force disable, and True to force enable. Replaces the "/"  with "-",
        since "/" is missing from AraBERTv2, AraELECTRA and ARAGPT2 vocabulary.

        map_hindi_numbers_to_arabic(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in
        AraBERTv02, AraELECTRA and AraGPT2. Set to False to force disable, and True to force enable.
        Replaces hindi numbers with the corresponding Arabic one. ex: "١٩٩٥" --> "1995".
        This behavior is present by default in AraBERTv1 and v2 (with pre-segmentation),
        and fixes an issue caused by a bug when inserting white spaces.

        apply_farasa_segmentation(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in
        AraBERTv2, and AraBERTv1. Set to False to force disable, and True to force enable.

        keep_emojis(:obj:`bool`, `optional`, defaults to :obj:`None`): don't remove emojis while preprocessing.
        Will be automatically set to True in AraBERT trained on tweets.



    Returns:

        ArabertPreprocessor: A preprocessor instance

    Example:

        from preprocess import ArabertPreprocessor

        arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")

        arabert_prep.preprocess("SOME ARABIC TEXT")
    """
    def __init__(
        self,
        model_name: str,
        remove_html_markup: bool = True,
        replace_urls_emails_mentions: bool = True,
        strip_tashkeel: bool = True,
        strip_tatweel: bool = True,
        insert_white_spaces: bool = True,
        remove_non_digit_repetition: bool = True,
        keep_emojis: bool = None,
        replace_slash_with_dash: bool = None,
        map_hindi_numbers_to_arabic: bool = None,
        apply_farasa_segmentation: bool = None,
    ):

        model_name = model_name.replace("aubmindlab/",
                                        "").replace("wissamantoun/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        if apply_farasa_segmentation is None:
            if self.model_name in SEGMENTED_MODELS:
                self.apply_farasa_segmentation = True
            else:
                self.apply_farasa_segmentation = False
        else:
            if (apply_farasa_segmentation == False
                    and self.model_name in SEGMENTED_MODELS):
                logging.warning(
                    "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
                )

            self.apply_farasa_segmentation = apply_farasa_segmentation

        if self.apply_farasa_segmentation:
            try:
                from farasa.segmenter import FarasaSegmenter

                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except ModuleNotFoundError:
                logging.error(
                    "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )

        if keep_emojis is None:
            if self.model_name in TWEET_MODELS:
                self.keep_emojis = True
            else:
                self.keep_emojis = False
        else:
            if keep_emojis == False and self.model_name in TWEET_MODELS:
                logging.warning(
                    "The selected model_name is trained on emojis, but keep_emojis was set to False!"
                )
            self.keep_emojis = keep_emojis

        if self.keep_emojis:
            import emoji

            self.emoji = emoji
            if self.apply_farasa_segmentation:
                logging.warning(
                    "Keeping tweets with Farasa Segmentation is 10 times slower"
                )
            emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
            self.REJECTED_CHARS_REGEX = "[^%s%s]" % (
                CHARS_REGEX
                if self.model_name in SECOND_GEN_MODELS else CHARS_REGEXV2,
                emoji_regex,
            )
        else:
            self.REJECTED_CHARS_REGEX = (REJECTED_CHARS_REGEX if
                                         self.model_name in SECOND_GEN_MODELS
                                         else REJECTED_CHARS_REGEXV2)

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_non_digit_repetition = remove_non_digit_repetition

        if replace_slash_with_dash is None:
            if self.model_name in SECOND_GEN_MODELS:
                self.replace_slash_with_dash = True
            else:
                self.replace_slash_with_dash = False
        else:
            self.replace_slash_with_dash = replace_slash_with_dash

        if map_hindi_numbers_to_arabic is None:
            if self.model_name in SECOND_GEN_MODELS:
                self.map_hindi_numbers_to_arabic = True
            else:
                self.map_hindi_numbers_to_arabic = False
        else:
            self.map_hindi_numbers_to_arabic = map_hindi_numbers_to_arabic

    def preprocess(self, text: str) -> str:
        """
        Preprocess takes an input text line and applies the same preprocessing used in AraBERT
        pretraining, or according to the given settings

        Args:

            text (:obj:`str`): input text string

        Returns:

            string: A preprocessed string depending on which model was selected
        """
        if (self.model_name == "bert-base-arabert"
                or self.model_name == "bert-base-arabertv01"):
            return self._preprocess_v1(
                text,
                do_farasa_tokenization=self.apply_farasa_segmentation,
            )

        if self.model_name in SECOND_GEN_MODELS:
            return self._preprocess_v2(text)

        return self._preprocess_v3(text)

    def unpreprocess(self, text: str, desegment: bool = True) -> str:
        """Re-formats the text to a classic format where punctuations, brackets, parenthesis are not seperated by whitespaces.
        The objective is to make the generated text of any model appear natural and not preprocessed.

        Args:
            text (:obj:`str`): input text to be un-preprocessed
            desegment (:obj:`bool`, optional): whether to remove Farasa pre-segmentation before re-formatting. Defaults to True.

        Returns:
            str: The unpreprocessed (and possibly Farasa-desegmented) text.
        """

        if self.apply_farasa_segmentation and desegment:
            text = self.desegment(text)

        # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
        # https://stackoverflow.com/a/53436792/5381220
        text = re.sub(WHITE_SPACED_DOUBLE_QUOTATION_REGEX, '"' + r"\1" + '"',
                      text)
        text = re.sub(WHITE_SPACED_SINGLE_QUOTATION_REGEX, "'" + r"\1" + "'",
                      text)
        text = re.sub(WHITE_SPACED_BACK_QUOTATION_REGEX, "`" + r"\1" + "`",
                      text)
        text = re.sub(WHITE_SPACED_EM_DASH, "—" + r"\1" + "—", text)

        # during generation, sometimes the models don't put a space after the dot, this handles it
        text = text.replace(".", " . ")
        text = " ".join(text.split())

        # handle decimals
        text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
        text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

        text = re.sub(LEFT_AND_RIGHT_SPACED_CHARS, r"\1", text)
        text = re.sub(LEFT_SPACED_CHARS, r"\1", text)
        text = re.sub(RIGHT_SPACED_CHARS, r"\1", text)

        return text

    def desegment(self, text: str) -> str:
        """
        Use this function if sentence tokenization was done using
        `from arabert.preprocess_arabert import preprocess` with Farasa enabled.
        AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
        and a space before the '+' for suffixes

        Example:
        >>> desegment('ال+ دراس +ات')
        الدراسات
        """
        text = text.replace("+ ", "+")
        text = text.replace(" +", "+")
        text = " ".join(
            [self._desegmentword(word) for word in text.split(" ")])
        return text

    def _desegmentword(self, orig_word: str) -> str:
        """
        Word segmentor that takes a Farasa Segmented Word and removes the '+' signs

        Example:
        >>> _desegmentword("ال+يومي+ة")
        اليومية
        """
        word = orig_word.replace("ل+ال+", "لل")
        if "ال+ال" not in orig_word:
            word = word.replace("ل+ال", "لل")
        word = word.replace("+", "")
        word = word.replace("للل", "لل")
        return word

    def _preprocess_v3(self, text: str) -> str:
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in URL_REGEXES:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in EMAIL_REGEXES:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(USER_MENTION_REGEX, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(HINDI_TO_ARABIC_MAP)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z ])",
                r" \1 ",
                text,
            )

            # re-fix brackets
            text = text.replace("[ رابط ]", "[رابط]")
            text = text.replace("[ بريد ]", "[بريد]")
            text = text.replace("[ مستخدم ]", "[مستخدم]")

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                "(\d+)([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)",
                r" \1 \2 ",
                text,
            )
            text = re.sub(
                "([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)(\d+)",
                r" \1 \2 ",
                text,
            )

        # remove unwanted characters
        text = re.sub(self.REJECTED_CHARS_REGEX, " ", text)

        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if self.apply_farasa_segmentation:
            if self.keep_emojis:
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa segmentation
        return text

    def _preprocess_v2(self, text: str) -> str:
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in URL_REGEXES:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in EMAIL_REGEXES:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(USER_MENTION_REGEX, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(HINDI_TO_ARABIC_MAP)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub("(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)",
                          r" \1 \2 ", text)
            text = re.sub("([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)",
                          r" \1 \2 ", text)

        if self.replace_slash_with_dash:
            text = text.replace("/", "-")

        # remove unwanted characters
        text = re.sub(self.REJECTED_CHARS_REGEX, " ", text)

        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if (self.model_name == "bert-base-arabertv2"
                or self.model_name == "bert-large-arabertv2"):
            if self.keep_emojis:
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa segmentation
        return text

    def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str:
        """
        AraBERTv1 preprocessing Function
        """
        text = str(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)

        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(REGEX_URL_STEP1, "[رابط]", text)
            text = re.sub(REGEX_URL_STEP2, "[رابط]", text)
            text = re.sub(REGEX_URL, "[رابط]", text)
            text = re.sub(REGEX_EMAIL, "[بريد]", text)
            text = re.sub(REGEX_MENTION, "[مستخدم]", text)
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]",
                          " [مستخدم] ", text)

        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
                r" \1 ",
                text,
            )
        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        text = " ".join(text.split())

        return text

    def _farasa_segment(self, text: str) -> str:
        line_farasa = text.split()
        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if word in ["رابط", "بريد", "مستخدم"
                        ] and line_farasa[index - 1] in [
                            "[",
                            "]",
                        ]:
                segmented_line.append("[" + word + "]")
                continue
            if "+" not in word:
                segmented_line.append(word)
                continue
            segmented_word = self._split_farasa_output(word)
            segmented_line.extend(segmented_word)

        return " ".join(segmented_line)

    def _split_farasa_output(self, word: str) -> str:
        segmented_word = []
        temp_token = ""
        for i, c in enumerate(word):
            if c == "+":
                # if the token is KAF, it could be a suffix or prefix
                if temp_token == "ك":
                    # if we are at the second token, then KAF is surely a prefix
                    if i == 1:
                        segmented_word.append(temp_token + "+")
                        temp_token = ""
                    # If the KAF token is between 2 tokens
                    elif word[i - 2] == "+":
                        # if the previous token is prefix, then this KAF must be a prefix
                        if segmented_word[-1][-1] == "+":
                            segmented_word.append(temp_token + "+")
                            temp_token = ""
                        # else it is a suffix, this KAF could not be a second suffix
                        else:
                            segmented_word.append("+" + temp_token)
                            temp_token = ""
                    # if Kaf is at the end, this is handled with the statement after the loop
                elif temp_token in PREFIX_LIST:
                    segmented_word.append(temp_token + "+")
                    temp_token = ""
                elif temp_token in SUFFIX_LIST:
                    segmented_word.append("+" + temp_token)
                    temp_token = ""
                else:
                    segmented_word.append(temp_token)
                    temp_token = ""
                continue
            temp_token += c
        if temp_token != "":
            if temp_token in SUFFIX_LIST:
                segmented_word.append("+" + temp_token)
            else:
                segmented_word.append(temp_token)
        return segmented_word

    def _tokenize_arabic_words_farasa(self, line_input: str) -> str:

        if self.keep_emojis:
            # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
            line_farasa = []
            for word in line_input.split():
                if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                    line_farasa.append(word)
                else:
                    line_farasa.append(self.farasa_segmenter.segment(word))
        else:
            line_farasa = self.farasa_segmenter.segment(line_input).split()

        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if word in ["رابط", "بريد", "مستخدم"
                        ] and line_farasa[index - 1] in [
                            "[",
                            "]",
                        ]:
                segmented_line.append("[" + word + "]")
                continue
            segmented_word = []
            for token in word.split("+"):
                if token in PREFIX_LIST:
                    segmented_word.append(token + "+")
                elif token in SUFFIX_LIST:
                    segmented_word.append("+" + token)
                else:
                    segmented_word.append(token)
            segmented_line.extend(segmented_word)
        return " ".join(segmented_line)

    def _remove_non_digit_repetition(self, text: str) -> str:
        """
        :param text: the input text to remove character elongation from
        :return: text with the elongation removed
        """
        # loop over the number of times the regex matched the text
        # OLD
        # for index_ in range(len(re.findall(REGEX_TATWEEL, text))):
        #     elongation = re.search(REGEX_TATWEEL, text)
        #     if elongation:
        #         elongation_pattern = elongation.group()
        #         elongation_replacement = elongation_pattern[0]
        #         elongation_pattern = re.escape(elongation_pattern)
        #         text = re.sub(
        #             elongation_pattern, elongation_replacement, text, flags=re.MULTILINE
        #         )
        #     else:
        #         break

        # New
        text = MULTIPLE_CHAR_PATTERN.sub(r"\1\1", text)
        return text

    def _remove_redundant_punct(self, text: str) -> str:
        text_ = text
        result = re.search(REDUNDANT_PUNCT_PATTERN, text)
        dif = 0
        while result:
            sub = result.group()
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(list(sub)) + " "
            text = "".join((text[:result.span()[0] + dif], sub,
                            text[result.span()[1] + dif:]))
            text_ = "".join(
                (text_[:result.span()[0]], text_[result.span()[1]:])).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(REDUNDANT_PUNCT_PATTERN, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()
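
The helper _remove_non_digit_repetition above relies on a precompiled MULTIPLE_CHAR_PATTERN that is defined outside this excerpt. A minimal, self-contained sketch of an equivalent pattern (the name and exact definition here are assumptions, not taken from the source) behaves like this:

import re

# Hypothetical stand-in for MULTIPLE_CHAR_PATTERN: any non-digit character
# repeated three or more times in a row.
MULTIPLE_CHAR_PATTERN = re.compile(r"(\D)\1{2,}", re.DOTALL)

def remove_non_digit_repetition(text: str) -> str:
    # keep exactly two occurrences of the repeated character
    return MULTIPLE_CHAR_PATTERN.sub(r"\1\1", text)

print(remove_non_digit_repetition("راااااائع"))  # -> "راائع"
print(remove_non_digit_repetition("2000000"))    # digits are left untouched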
Beispiel #14
0
from clean_vi_text import fix_contents
import re

import tnkeeh as tn
from farasa.segmenter import FarasaSegmenter

# Moses tools come from sacremoses, the VnCoreNLP wrapper from the vncorenlp package
from sacremoses import MosesTokenizer, MosesPunctNormalizer
from vncorenlp import VnCoreNLP

# for bulgarian and turkish
from cube.api import Cube

en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()
# TODO: change hardcoding of the jar file to an arg from the cli
rdrsegmenter = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar",
                         annotators="wseg",
                         max_heap_size="-Xmx500m")
ar_segmenter = FarasaSegmenter()

bg_cube = Cube(verbose=False)
bg_cube.load("bg")

tr_cube = Cube(verbose=False)
tr_cube.load("tr")


def clean_ar_text(
    text,
    segment=False,
    remove_special_chars=False,
    remove_english=False,
    normalize=False,
    remove_diacritics=False,
Beispiel #15
0
class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
    It can also un-process the output of text-generation models.

    Args:

        model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Defaults to "bert-base-arabertv02". Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.
            - :obj:`"bert-base-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"araelectra-base-discriminator"`: No farasa segmentation.
            - :obj:`"araelectra-base-generator"`: No farasa segmentation.
            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj: `bool`): don't remove emojis while preprocessing. Defaults to False

        remove_html_markup(:obj: `bool`): Whether to remove html artifacts; should be set to False when preprocessing TyDi QA. Defaults to True

        replace_urls_emails_mentions(:obj: `bool`): Whether to replace emails, urls and mentions with special tokens. Defaults to True

        strip_tashkeel(:obj: `bool`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA)

        strip_tatweel(:obj: `bool`): remove tatweel '\\u0640'

        insert_white_spaces(:obj: `bool`): insert whitespace before and after any character that is not an Arabic or English letter, a digit, or one of the two square brackets, then insert whitespace between words and numbers or numbers and words

        remove_elongation(:obj: `bool`): replace any repetition of more than 2 of the same non-digit character with 2 of that character


    Returns:

        ArabertPreprocessor: the preprocessor class

    Example:

        from preprocess import ArabertPreprocessor

        arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")

        arabert_prep.preprocess("SOME ARABIC TEXT")
    """
    def __init__(
        self,
        model_name,
        keep_emojis=False,
        remove_html_markup=True,
        replace_urls_emails_mentions=True,
        strip_tashkeel=True,
        strip_tatweel=True,
        insert_white_spaces=True,
        remove_elongation=True,
    ):
        """
        model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Defaults to "bert-base-arabertv02". Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.
            - :obj:`"bert-base-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"araelectra-base-discriminator"`: No farasa segmentation.
            - :obj:`"araelectra-base-generator"`: No farasa segmentation.
            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj: `bool`): don't remove emojis while preprocessing. Defaults to False

        remove_html_markup(:obj: `bool`): Whether to remove html artifacts; should be set to False when preprocessing TyDi QA. Defaults to True

        replace_urls_emails_mentions(:obj: `bool`): Whether to replace emails, urls and mentions with special tokens. Defaults to True

        strip_tashkeel(:obj: `bool`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA)

        strip_tatweel(:obj: `bool`): remove tatweel '\\u0640'

        insert_white_spaces(:obj: `bool`): insert whitespace before and after any character that is not an Arabic or English letter, a digit, or one of the two square brackets, then insert whitespace between words and numbers or numbers and words

        remove_elongation(:obj: `bool`): replace any repetition of more than 2 of the same non-digit character with 2 of that character

        """
        model_name = model_name.replace("aubmindlab/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        if self.model_name in SEGMENTED_MODELS:
            logging.info(
                "Selected Model requires pre-segmentation, Initializing FarasaSegmenter"
            )
            try:
                from farasa.segmenter import FarasaSegmenter

                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except ModuleNotFoundError:
                logging.warning(
                    "farasapy is not installed, you won't be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )
        else:
            logging.info(
                "Selected Model doesn't require pre-segmentation, skipping FarasaSegmenter initialization"
            )

        self.keep_emojis = keep_emojis
        if self.keep_emojis:
            import emoji

            self.emoji = emoji
            if self.model_name in SEGMENTED_MODELS:
                logging.warning(
                    "Keeping tweets with Farasa Segmentation is 10 times slower"
                )

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_elongation = remove_elongation

    def preprocess(self, text):
        """
        Preprocess takes an input text line and applies the same preprocessing used in AraBERT
        pretraining

        Args:

            text (:obj:`str`): input text string

        Returns:

            string: A preprocessed string depending on which model was selected
        """
        if self.model_name == "bert-base-arabert":
            return self._old_preprocess(
                text,
                do_farasa_tokenization=True,
            )

        if self.model_name == "bert-base-arabertv01":
            return self._old_preprocess(text, do_farasa_tokenization=False)

        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # Replace Emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        # remove repeated characters >2
        if self.remove_elongation:
            text = self._remove_elongation(text)

        # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub("(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)",
                          r" \1 \2 ", text)
            text = re.sub("([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)",
                          r" \1 \2 ", text)

        # remove unwanted characters
        if self.keep_emojis:
            emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
            rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regex, " ", text)

        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if (self.model_name == "bert-base-arabertv2"
                or self.model_name == "bert-large-arabertv2"):
            if self.keep_emojis:
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa Segmentation
        return text

    def unpreprocess(self, text, desegment=True):
        """Re-formats the text to a classic format where punctuations, brackets, parenthesis are not seperated by whitespaces.
        The objective is to make the generated text of any model appear natural and not preprocessed.

        Args:
            text (str): input text to be un-preprocessed
            desegment (bool, optional): whether to remove the Farasa pre-segmentation first. Defaults to True.

        Returns:
            str: The unpreprocessed (and possibly Farasa-desegmented) text.
        """

        if self.model_name in SEGMENTED_MODELS and desegment:
            text = self.desegment(text)

        # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
        # https://stackoverflow.com/a/53436792/5381220
        text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"',
                      text)
        text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'",
                      text)
        text = re.sub(white_spaced_back_quotation_regex, "\`" + r"\1" + "\`",
                      text)
        text = re.sub(white_spaced_back_quotation_regex, "\—" + r"\1" + "\—",
                      text)

        # during generation, sometimes the models don't put a space after the dot, this handles it
        text = text.replace(".", " . ")
        text = " ".join(text.split())

        # handle decimals
        text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
        text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

        text = re.sub(left_and_right_spaced_chars, r"\1", text)
        text = re.sub(left_spaced_chars, r"\1", text)
        text = re.sub(right_spaced_chars, r"\1", text)

        return text

    def desegment(self, text):
        """
        Use this function if sentence tokenization was done using
        `from arabert.preprocess_arabert import preprocess` with Farasa enabled
        AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
        and before the '+' for suffixes

        Example:
        >>> desegment('ال+ دراس +ات')
        الدراسات
        """
        text = text.replace("+ ", "+")
        text = text.replace(" +", "+")
        text = " ".join(
            [self._desegmentword(word) for word in text.split(" ")])
        return text

    def _desegmentword(self, orig_word: str) -> str:
        """
        Word segmentor that takes a Farasa Segmented Word and removes the '+' signs

        Example:
        >>> _desegmentword("ال+يومي+ة")
        اليومية
        """
        word = orig_word.replace("ل+ال+", "لل")
        if "ال+ال" not in orig_word:
            word = word.replace("ل+ال", "لل")
        word = word.replace("+", "")
        word = word.replace("للل", "لل")
        return word

    def _old_preprocess(self, text, do_farasa_tokenization):
        """
        AraBERTv1 preprocessing Function
        """
        text = str(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)

        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(regex_url_step1, "[رابط]", text)
            text = re.sub(regex_url_step2, "[رابط]", text)
            text = re.sub(regex_url, "[رابط]", text)
            text = re.sub(regex_email, "[بريد]", text)
            text = re.sub(regex_mention, "[مستخدم]", text)
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]",
                          " [مستخدم] ", text)

        if self.remove_elongation:
            text = self._remove_elongation(text)

        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
                r" \1 ",
                text,
            )
        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        return text.strip()

    def _farasa_segment(self, text):
        line_farasa = text.split()
        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if word in ["رابط", "بريد", "مستخدم"
                        ] and line_farasa[index - 1] in [
                            "[",
                            "]",
                        ]:
                segmented_line.append("[" + word + "]")
                continue
            if "+" not in word:
                segmented_line.append(word)
                continue
            segmented_word = self._split_farasa_output(word)
            segmented_line.extend(segmented_word)

        return " ".join(segmented_line)

    def _split_farasa_output(self, word):
        segmented_word = []
        temp_token = ""
        for i, c in enumerate(word):
            if c == "+":
                # if the token is KAF, it could be a suffix or prefix
                if temp_token == "ك":
                    # if we are at the second token, then KAF is surely a prefix
                    if i == 1:
                        segmented_word.append(temp_token + "+")
                        temp_token = ""
                    # If the KAF token is between 2 tokens
                    elif word[i - 2] == "+":
                        # if the previous token is prefix, then this KAF must be a prefix
                        if segmented_word[-1][-1] == "+":
                            segmented_word.append(temp_token + "+")
                            temp_token = ""
                        # else it is a suffix, this KAF could not be a second suffix
                        else:
                            segmented_word.append("+" + temp_token)
                            temp_token = ""
                    # if Kaf is at the end, this is handled with the statement after the loop
                elif temp_token in prefix_list:
                    segmented_word.append(temp_token + "+")
                    temp_token = ""
                elif temp_token in suffix_list:
                    segmented_word.append("+" + temp_token)
                    temp_token = ""
                else:
                    segmented_word.append(temp_token)
                    temp_token = ""
                continue
            temp_token += c
        if temp_token != "":
            if temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
            else:
                segmented_word.append(temp_token)
        return segmented_word
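
    # Note on the KAF branch above: Farasa can emit the letter "ك" either as the
    # prefix "ك+" (as/like) or as the possessive suffix "+ك" (your); the position
    # checks resolve the ambiguity. For example (illustrative, assuming "ك" is in
    # suffix_list), "ك+كتاب" splits to ["ك+", "كتاب"] while "كتاب+ك" splits to
    # ["كتاب", "+ك"].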

    def _tokenize_arabic_words_farasa(self, line_input):

        if self.keep_emojis:
            # segment word by word so that emoji tokens are passed through unchanged
            line_farasa = []
            for word in line_input.split():
                if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                    line_farasa.append(word)
                else:
                    line_farasa.append(self.farasa_segmenter.segment(word))
        else:
            line_farasa = self.farasa_segmenter.segment(line_input).split()

        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if word in ["رابط", "بريد", "مستخدم"
                        ] and line_farasa[index - 1] in [
                            "[",
                            "]",
                        ]:
                segmented_line.append("[" + word + "]")
                continue
            segmented_word = []
            for token in word.split("+"):
                if token in prefix_list:
                    segmented_word.append(token + "+")
                elif token in suffix_list:
                    segmented_word.append("+" + token)
                else:
                    segmented_word.append(token)
            segmented_line.extend(segmented_word)
        return " ".join(segmented_line)

    def _remove_elongation(self, text):
        """
        :param text: the input text to remove character elongation from
        :return: the de-elongated text
        """
        # loop over the number of times the regex matched the text
        for index_ in range(len(re.findall(regex_tatweel, text))):
            elongation = re.search(regex_tatweel, text)
            if elongation:
                elongation_pattern = elongation.group()
                elongation_replacement = elongation_pattern[0]
                elongation_pattern = re.escape(elongation_pattern)
                text = re.sub(elongation_pattern,
                              elongation_replacement,
                              text,
                              flags=re.MULTILINE)
            else:
                break
        return text

    def _remove_redundant_punct(self, text):
        text_ = text
        result = re.search(redundant_punct_pattern, text)
        dif = 0
        while result:
            sub = result.group()
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(list(sub)) + " "
            text = "".join((text[:result.span()[0] + dif], sub,
                            text[result.span()[1] + dif:]))
            text_ = "".join(
                (text_[:result.span()[0]], text_[result.span()[1]:])).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(redundant_punct_pattern, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()
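
A minimal usage sketch for the class above (the import path follows the class docstring; the sample sentence is illustrative and the exact segmented output depends on Farasa):

from preprocess import ArabertPreprocessor

arabert_prep = ArabertPreprocessor("bert-base-arabertv2")

# For the *v2 models the text comes back cleaned and Farasa-segmented,
# with prefixes marked as "xx+" and suffixes as "+xx".
segmented = arabert_prep.preprocess("ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري")

# unpreprocess undoes the segmentation and re-attaches punctuation, which is
# mainly useful for cleaning up text generated by the AraGPT2 models.
restored = arabert_prep.unpreprocess(segmented)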
Beispiel #16
0
class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.

    Args:

        model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.
            - :obj:`"bert-base-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj: `bool`): don't remove emojis while preprocessing. Defaults to False

    Returns:

        ArabertPreprocessor: the preprocessor class

    Example:

        from preprocess import ArabertPreprocessor

        arabert_prep = ArabertPreprocessor("bert-base-arabertv2",keep_emojis=False)
        arabert_prep.preprocess("SOME ARABIC TEXT")
    """
    def __init__(self,
                 model_name,
                 keep_emojis=False,
                 remove_html_markup=True,
                 replace_urls_emails_mentions=True):
        """
        model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.
            - :obj:`"bert-base-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj: `bool`): don't remove emojis while preprocessing. Defaults to False

        remove_html_markup(:obj: `bool`): Whether to remove html artifacts; should be set to False when preprocessing TyDi QA. Defaults to True

        replace_urls_emails_mentions(:obj: `bool`): Whether to replace emails, urls and mentions with special tokens. Defaults to True
        """
        model_name = model_name.replace("aubmindlab/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        if self.model_name in SEGMENTED_MODELS:
            logging.info(
                "Selected Model requires pre-segmentation, Initializing FarasaSegmenter"
            )
            try:
                from farasa.segmenter import FarasaSegmenter
                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except ModuleNotFoundError:
                logging.warning(
                    "farasapy is not installed, you won't be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )
        else:
            logging.info(
                "Selected Model doesn't require pre-segmentation, skipping FarasaSegmenter initialization"
            )

        self.keep_emojis = keep_emojis
        if self.keep_emojis:
            import emoji
            self.emoji = emoji
            if self.model_name in SEGMENTED_MODELS:
                logging.warning(
                    "Keeping tweets with Farasa Segmentation is 10 times slower"
                )

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions

    def preprocess(self, text):
        """
        Preprocess takes an input text line and applies the same preprocessing used in AraBERT
        pretraining

        Args:

            text (:obj:`str`): input text string

        Returns:

            string: A preprocessed string depending on which model was selected
        """
        if self.model_name == "bert-base-arabert":
            return self._old_preprocess(
                text,
                do_farasa_tokenization=True,
            )

        if self.model_name == "bert-base-arabertv01":
            return self._old_preprocess(text, do_farasa_tokenization=False)

        text = str(text)
        text = html.unescape(text)
        text = araby.strip_tashkeel(text)
        text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # Replace Emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)
        # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
        text = re.sub(
            "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
            r" \1 ", text)

        # insert whitespace between words and numbers or numbers and words
        text = re.sub("(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)",
                      r" \1 \2 ", text)
        text = re.sub("([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)",
                      r" \1 \2 ", text)

        # remove unwanted characters
        if self.keep_emojis:
            emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI.keys()))
            rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regex, " ", text)

        # remove repeated characters >2
        #text = self._remove_elongation(text)
        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if self.model_name == "bert-base-arabertv2" or self.model_name == "bert-large-arabertv2":
            if self.keep_emojis:
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI.keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa Segmentation
        return text

    def _old_preprocess(self, text, do_farasa_tokenization):
        """
        AraBERTv1 preprocessing Function
        """
        text = str(text)
        text = araby.strip_tashkeel(text)
        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(regex_url_step1, "[رابط]", text)
            text = re.sub(regex_url_step2, "[رابط]", text)
            text = re.sub(regex_url, "[رابط]", text)
            text = re.sub(regex_email, "[بريد]", text)
            text = re.sub(regex_mention, "[مستخدم]", text)
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]",
                          " [مستخدم] ", text)

        #text = self._remove_elongation(text)
        text = re.sub(
            "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
            r" \1 ", text)
        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        return text.strip()

    def _farasa_segment(self, text):
        line_farasa = text.split()
        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if word in ["رابط", "بريد", "مستخدم"
                        ] and line_farasa[index - 1] in ["[", "]"]:
                segmented_line.append("[" + word + "]")
                continue
            if "+" not in word:
                segmented_line.append(word)
                continue
            segmented_word = self._split_farasa_output(word)
            segmented_line.extend(segmented_word)

        return " ".join(segmented_line)

    def _split_farasa_output(self, word):
        segmented_word = []
        temp_token = ""
        for i, c in enumerate(word):
            if c == "+":
                # if the token is KAF, it could be a suffix or prefix
                if temp_token == "ك":
                    # if we are at the second token, then KAF is surely a prefix
                    if i == 1:
                        segmented_word.append(temp_token + "+")
                        temp_token = ""
                    # If the KAF token is between 2 tokens
                    elif word[i - 2] == "+":
                        # if the previous token is prefix, then this KAF must be a prefix
                        if segmented_word[-1][-1] == "+":
                            segmented_word.append(temp_token + "+")
                            temp_token = ""
                        # else it is a suffix, this KAF could not be a second suffix
                        else:
                            segmented_word.append("+" + temp_token)
                            temp_token = ""
                    # if Kaf is at the end, this is handled with the statement after the loop
                elif temp_token in prefix_list:
                    segmented_word.append(temp_token + "+")
                    temp_token = ""
                elif temp_token in suffix_list:
                    segmented_word.append("+" + temp_token)
                    temp_token = ""
                else:
                    segmented_word.append(temp_token)
                    temp_token = ""
                continue
            temp_token += c
        if temp_token != "":
            if temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
            else:
                segmented_word.append(temp_token)
        return segmented_word

    def _tokenize_arabic_words_farasa(self, line_input):

        if self.keep_emojis:
            # segment word by word so that emoji tokens are passed through unchanged
            line_farasa = []
            for word in line_input.split():
                if word in list(self.emoji.UNICODE_EMOJI.keys()):
                    line_farasa.append(word)
                else:
                    line_farasa.append(self.farasa_segmenter.segment(word))
        else:
            line_farasa = self.farasa_segmenter.segment(line_input).split()

        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if word in ["رابط", "بريد", "مستخدم"
                        ] and line_farasa[index - 1] in ["[", "]"]:
                segmented_line.append("[" + word + "]")
                continue
            segmented_word = []
            for token in word.split("+"):
                if token in prefix_list:
                    segmented_word.append(token + "+")
                elif token in suffix_list:
                    segmented_word.append("+" + token)
                else:
                    segmented_word.append(token)
            segmented_line.extend(segmented_word)
        return " ".join(segmented_line)

    def _remove_elongation(self, word):
        """
        :param word: the input word to remove character elongation from
        :return: the de-elongated word
        """
        # loop over the number of times the regex matched the word
        for index_ in range(len(re.findall(regex_tatweel, word))):
            elongation_found = re.search(regex_tatweel, word)
            if elongation_found:
                elongation_replacement = elongation_found.group()[0]
                # escape the matched run so regex metacharacters are treated literally
                elongation_pattern = re.escape(elongation_found.group())
                word = re.sub(elongation_pattern,
                              elongation_replacement,
                              word,
                              flags=re.MULTILINE)
            else:
                break
        return word

    def _remove_redundant_punct(self, text):
        text_ = text
        result = re.search(redundant_punct_pattern, text)
        dif = 0
        while result:
            sub = result.group()
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(list(sub)) + " "
            text = "".join((text[:result.span()[0] + dif], sub,
                            text[result.span()[1] + dif:]))
            text_ = "".join(
                (text_[:result.span()[0]], text_[result.span()[1]:])).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(redundant_punct_pattern, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()
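
The whitespace-insertion step in preprocess above pads every character that is not an Arabic or English letter, a digit, or a square bracket, and then splits runs of digits glued to words. A small, self-contained sketch of those substitutions (the standalone function name is an assumption; the character ranges are the ones used in the code above):

import re

def insert_white_spaces(text: str) -> str:
    # pad anything that is not an Arabic/Latin letter, a digit, or a square bracket
    text = re.sub(r"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])", r" \1 ", text)
    # split number+word and word+number runs
    text = re.sub(r"(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text)
    text = re.sub(r"([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text)
    return " ".join(text.split())

print(insert_white_spaces("عام2021!"))  # -> "عام 2021 !"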
Beispiel #17
0
    def __init__(
        self,
        model_name,
        keep_emojis=False,
        remove_html_markup=True,
        replace_urls_emails_mentions=True,
        strip_tashkeel=True,
        strip_tatweel=True,
        insert_white_spaces=True,
        remove_elongation=True,
    ):
        """
        model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Defaults to "bert-base-arabertv02". Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.
            - :obj:`"bert-base-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasas egmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"araelectra-base-discriminator"`: No farasa segmentation.
            - :obj:`"araelectra-base-generator"`: No farasa segmentation.
            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj: `bool`): don't remove emojis while preprocessing. Defaults to False

        remove_html_markup(:obj: `bool`): Whether to remove html artifacts; should be set to False when preprocessing TyDi QA. Defaults to True

        replace_urls_emails_mentions(:obj: `bool`): Whether to replace emails, urls and mentions with special tokens. Defaults to True

        strip_tashkeel(:obj: `bool`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA)

        strip_tatweel(:obj: `bool`): remove tatweel '\\u0640'

        insert_white_spaces(:obj: `bool`): insert whitespace before and after any character that is not an Arabic or English letter, a digit, or one of the two square brackets, then insert whitespace between words and numbers or numbers and words

        remove_elongation(:obj: `bool`): replace any repetition of more than 2 of the same non-digit character with 2 of that character

        """
        model_name = model_name.replace("aubmindlab/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        if self.model_name in SEGMENTED_MODELS:
            logging.info(
                "Selected Model requires pre-segmentation, Initializing FarasaSegmenter"
            )
            try:
                from farasa.segmenter import FarasaSegmenter

                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except ModuleNotFoundError:
                logging.warning(
                    "farasapy is not installed, you won't be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )
        else:
            logging.info(
                "Selected Model doesn't require pre-segmentation, skipping FarasaSegmenter initialization"
            )

        self.keep_emojis = keep_emojis
        if self.keep_emojis:
            import emoji

            self.emoji = emoji
            if self.model_name in SEGMENTED_MODELS:
                logging.warning(
                    "Keeping tweets with Farasa Segmentation is 10 times slower"
                )

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_elongation = remove_elongation
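
In the full class (see Beispiel #15 and #18), the strip_tashkeel and strip_tatweel flags delegate to pyarabic's araby module. A minimal sketch of what those two options do, assuming pyarabic is installed (pip install pyarabic):

from pyarabic import araby

text = "الذَّهَبُ مَعْدِنٌ نَفِيسٌ ـــ جدا"
print(araby.strip_tashkeel(text))  # removes the short-vowel diacritics (harakat)
print(araby.strip_tatweel(text))   # removes the tatweel/kashida character '\u0640'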
Beispiel #18
0
class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
    It can also un-process the output of text-generation models.

    Args:

        model_name (:obj:`str`): model name from the HuggingFace Models page without
        the aubmindlab tag. Will default to a base Arabic preprocessor if the model name is not found.
        Current accepted models are:
  
    Returns:

        ArabertPreprocessor: A preprocessor instance

    Example:

        from preprocess import ArabertPreprocessor

        arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")

        arabert_prep.preprocess("SOME ARABIC TEXT")
    """

    def __init__(
        self,
        model_name: str,
        keep_emojis: bool = False,
        remove_html_markup: bool = True,
        replace_urls_emails_mentions: bool = True,
        strip_tashkeel: bool = True,
        strip_tatweel: bool = True,
        insert_white_spaces: bool = True,
        remove_non_digit_repetition: bool = True,
        replace_slash_with_dash: bool = None,
        map_hindi_numbers_to_arabic: bool = None,
        apply_farasa_segmentation: bool = None,
    ):

        model_name = model_name.replace("Ebtihal/", "").replace("ebtihalaziz/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
            )
            self.model_name = "Ebtihal/AraDiaBERTo_V2"
        else:
            self.model_name = model_name

        if apply_farasa_segmentation is None:
            if self.model_name in SEGMENTED_MODELS:
                self.apply_farasa_segmentation = True
            else:
                self.apply_farasa_segmentation = False
        else:
            if apply_farasa_segmentation == False and self.model_name in SEGMENTED_MODELS:
                logging.warning(
                    "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
                )

            self.apply_farasa_segmentation = apply_farasa_segmentation

        if self.apply_farasa_segmentation:
            try:
                from farasa.segmenter import FarasaSegmenter

                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except ModuleNotFoundError:
                logging.error(
                    "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )

        self.keep_emojis = keep_emojis
        if self.keep_emojis:
            import emoji

            self.emoji = emoji
            if self.apply_farasa_segmentation:
                logging.warning(
                    "Keeping tweets with Farasa Segmentation is 10 times slower"
                )

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_non_digit_repetition = remove_non_digit_repetition

        if replace_slash_with_dash is None:
            if self.model_name in SECOND_GEN_MODELS:
                self.replace_slash_with_dash = True
            else:
                self.replace_slash_with_dash = False
        else:
            self.replace_slash_with_dash = replace_slash_with_dash

        if map_hindi_numbers_to_arabic is None:
            if self.model_name in SECOND_GEN_MODELS:
                self.map_hindi_numbers_to_arabic = True
            else:
                self.map_hindi_numbers_to_arabic = False
        else:
            self.map_hindi_numbers_to_arabic = map_hindi_numbers_to_arabic

    def preprocess(self, text: str) -> str:
        """
        Preprocess takes an input text line and applies the same preprocessing used in AraBERT
        pretraining, or according to the selected settings

        Args:

            text (:obj:`str`): input text string

        Returns:

            string: A preprocessed string depending on which model was selected
        """
        if (
            self.model_name == "bert-base-arabert"
            or self.model_name == "bert-base-arabertv01"
        ):
            return self._preprocess_v1(
                text,
                do_farasa_tokenization=self.apply_farasa_segmentation,
            )

        if self.model_name in SECOND_GEN_MODELS:
            return self._preprocess_v2(text)

        return self._preprocess_v3(text)

    def unpreprocess(self, text: str, desegment: bool = True) -> str:
        """Re-formats the text to a classic format where punctuations, brackets, parenthesis are not seperated by whitespaces.
        The objective is to make the generated text of any model appear natural and not preprocessed.

        Args:
            text (:obj:`str`): input text to be un-preprocessed
            desegment (:obj:`bool`, optional): whether to remove the Farasa pre-segmentation first. Defaults to True.

        Returns:
            str: The unpreprocessed (and possibly Farasa-desegmented) text.
        """

        if self.apply_farasa_segmentation and desegment:
            text = self.desegment(text)

        # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
        # https://stackoverflow.com/a/53436792/5381220
        text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"', text)
        text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'", text)
        text = re.sub(white_spaced_back_quotation_regex, "\`" + r"\1" + "\`", text)
        text = re.sub(white_spaced_back_quotation_regex, "\—" + r"\1" + "\—", text)

        # during generation, sometimes the models don't put a space after the dot, this handles it
        text = text.replace(".", " . ")
        text = " ".join(text.split())

        # handle decimals
        text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
        text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

        text = re.sub(left_and_right_spaced_chars, r"\1", text)
        text = re.sub(left_spaced_chars, r"\1", text)
        text = re.sub(right_spaced_chars, r"\1", text)

        return text

    def desegment(self, text: str) -> str:
        """
        Use this function if sentence tokenization was done using
        `from arabert.preprocess_arabert import preprocess` with Farasa enabled
        AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
        and before the '+' for suffixes

        Example:
        >>> desegment('ال+ دراس +ات')
        الدراسات
        """
        text = text.replace("+ ", "+")
        text = text.replace(" +", "+")
        text = " ".join([self._desegmentword(word) for word in text.split(" ")])
        return text

    def _desegmentword(self, orig_word: str) -> str:
        """
        Word segmentor that takes a Farasa Segmented Word and removes the '+' signs

        Example:
        >>> _desegmentword("ال+يومي+ة")
        اليومية
        """
        word = orig_word.replace("ل+ال+", "لل")
        if "ال+ال" not in orig_word:
            word = word.replace("ل+ال", "لل")
        word = word.replace("+", "")
        word = word.replace("للل", "لل")
        return word

    def _preprocess_v3(self, text: str) -> str:
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # Replace Emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(hindi_to_arabic_map)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z ])",
                r" \1 ",
                text,
            )

            # re-fix brackets
            text = text.replace("[ رابط ]", "[رابط]")
            text = text.replace("[ بريد ]", "[بريد]")
            text = text.replace("[ مستخدم ]", "[مستخدم]")

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                "(\d+)([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)",
                r" \1 \2 ",
                text,
            )
            text = re.sub(
                "([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)(\d+)",
                r" \1 \2 ",
                text,
            )

        # remove unwanted characters
        if self.keep_emojis:
            emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
            rejected_chars_regex2 = "[^%s%s]" % (chars_regexv2, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regexv2, " ", text)

        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if self.apply_farasa_segmentation:
            if self.keep_emojis:
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa Segmentation
        return text

    def _preprocess_v2(self, text: str) -> str:
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # Replace Emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(hindi_to_arabic_map)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                "(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
            )
            text = re.sub(
                "([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
            )

        if self.replace_slash_with_dash:
            text = text.replace("/", "-")

        # remove unwanted characters
        if self.keep_emojis:
            emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys()))
            rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regex, " ", text)

        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if (
            self.model_name == "bert-base-arabertv2"
            or self.model_name == "bert-large-arabertv2"
        ):
            if self.keep_emojis:
                new_text = []
                for word in text.split():
                    if word in list(self.emoji.UNICODE_EMOJI["en"].keys()):
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa Segmentation
        return text

    def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str:
        """
        AraBERTv1 preprocessing Function
        """
        text = str(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)

        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(regex_url_step1, "[رابط]", text)
            text = re.sub(regex_url_step2, "[رابط]", text)
            text = re.sub(regex_url, "[رابط]", text)
            text = re.sub(regex_email, "[بريد]", text)
            text = re.sub(regex_mention, "[مستخدم]", text)
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", text)

        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
                r" \1 ",
                text,
            )
        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        text = " ".join(text.split())

        return text

    def _farasa_segment(self, text: str) -> str:
        line_farasa = text.split()
        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if word in ["رابط", "بريد", "مستخدم"] and line_farasa[index - 1] in [
                "[",
                "]",
            ]:
                segmented_line.append("[" + word + "]")
                continue
            if "+" not in word:
                segmented_line.append(word)
                continue
            segmented_word = self._split_farasa_output(word)
            segmented_line.extend(segmented_word)

        return " ".join(segmented_line)

    def _split_farasa_output(self, word: str) -> str:
        segmented_word = []
        temp_token = ""
        for i, c in enumerate(word):
            if c == "+":
                # if the token is KAF, it could be a suffix or prefix
                if temp_token == "ك":
                    # if we are at the second token, then KAF is surely a prefix
                    if i == 1:
                        segmented_word.append(temp_token + "+")
                        temp_token = ""
                    # If the KAF token is between 2 tokens
                    elif word[i - 2] == "+":
                        # if the previous token is prefix, then this KAF must be a prefix
                        if segmented_word[-1][-1] == "+":
                            segmented_word.append(temp_token + "+")
                            temp_token = ""
                        # else it is a suffix, this KAF could not be a second suffix
                        else:
                            segmented_word.append("+" + temp_token)
                            temp_token = ""
                    # if Kaf is at the end, this is handled with the statement after the loop
                elif temp_token in prefix_list:
                    segmented_word.append(temp_token + "+")
                    temp_token = ""
                elif temp_token in suffix_list:
                    segmented_word.append("+" + temp_token)
                    temp_token = ""
                else:
                    segmented_word.append(temp_token)
                    temp_token = ""
                continue
            temp_token += c
        if temp_token != "":
            if temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
            else:
                segmented_word.append(temp_token)
        return segmented_word
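
    # Illustrative example (assumes "و" is in prefix_list and "ها" in suffix_list;
    # neither list is shown in this excerpt): a Farasa output such as "و+قال+ها"
    # would be split into ["و+", "قال", "+ها"], with the trailing "ها" appended
    # as a suffix by the check after the loop.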

    def _tokenize_arabic_words_farasa(self, line_input: str) -> str:
        if self.keep_emojis:
            # Farasa cannot segment emojis, so segment word by word and pass
            # emoji tokens through unchanged
            line_farasa = []
            for word in line_input.split():
                if word in self.emoji.UNICODE_EMOJI["en"]:
                    line_farasa.append(word)
                else:
                    line_farasa.append(self.farasa_segmenter.segment(word))
        else:
            line_farasa = self.farasa_segmenter.segment(line_input).split()

        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            if word in ["رابط", "بريد", "مستخدم"] and line_farasa[index - 1] in [
                "[",
                "]",
            ]:
                segmented_line.append("[" + word + "]")
                continue
            segmented_word = []
            for token in word.split("+"):
                if token in prefix_list:
                    segmented_word.append(token + "+")
                elif token in suffix_list:
                    segmented_word.append("+" + token)
                else:
                    segmented_word.append(token)
            segmented_line.extend(segmented_word)
        return " ".join(segmented_line)

    def _remove_non_digit_repetition(self, text: str) -> str:
        """
        :param text:  the input text to remove elongation
        :return: delongated text
        """
        # loop over the number of times the regex matched the text
        # OLD
        # for index_ in range(len(re.findall(regex_tatweel, text))):
        #     elongation = re.search(regex_tatweel, text)
        #     if elongation:
        #         elongation_pattern = elongation.group()
        #         elongation_replacement = elongation_pattern[0]
        #         elongation_pattern = re.escape(elongation_pattern)
        #         text = re.sub(
        #             elongation_pattern, elongation_replacement, text, flags=re.MULTILINE
        #         )
        #     else:
        #         break

        # New
        text = multiple_char_pattern.sub(r"\1\1", text)
        return text
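
    # Illustrative example (multiple_char_pattern is defined elsewhere in the
    # module; behaviour inferred from the \1\1 replacement): "جمييييل" would
    # become "جمييل", while digit runs such as "2000" are expected to stay unchanged.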

    def _remove_redundant_punct(self, text: str) -> str:
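        """
        Collapse each run of redundant punctuation (as matched by
        redundant_punct_pattern) into its unique characters, kept in order of
        first appearance and padded with spaces. text_ is a working copy with the
        matched runs removed; later searches run on text_ while dif maps their
        spans back onto the rewritten text.
        """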
        text_ = text
        result = re.search(redundant_punct_pattern, text)
        dif = 0
        while result:
            sub = result.group()
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(list(sub)) + " "
            text = "".join(
                (text[: result.span()[0] + dif], sub, text[result.span()[1] + dif :])
            )
            text_ = "".join(
                (text_[: result.span()[0]], text_[result.span()[1] :])
            ).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(redundant_punct_pattern, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()
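
    # Illustrative example (redundant_punct_pattern is defined elsewhere in the
    # module; assuming it matches runs of repeated punctuation): "رائع!!!؟؟" would
    # be rewritten as "رائع !؟": the run "!!!؟؟" is de-duplicated to "!؟",
    # space-padded, and the surrounding whitespace collapsed.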