def __init__(self, model_name, keep_emojis=False, remove_html_markup=True, replace_urls_emails_mentions=True):
    """
    Configure the preprocessor for a given model.

    model_name (:obj:`str`): model name from the HuggingFace Models page without
        the aubmindlab tag (an ``"aubmindlab/"`` prefix is stripped if present).
        Current accepted models are:

        - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
        - :obj:`"bert-base-arabert"`: with farasa segmentation.
        - :obj:`"bert-base-arabertv02"`: No farasa segmentation.
        - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
        - :obj:`"bert-large-arabertv02"`: No farasa segmentation.
        - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
        - :obj:`"araelectra-base"`: No farasa segmentation.
        - :obj:`"aragpt2-base"`: No farasa segmentation.
        - :obj:`"aragpt2-medium"`: No farasa segmentation.
        - :obj:`"aragpt2-large"`: No farasa segmentation.
        - :obj:`"aragpt2-mega"`: No farasa segmentation.

        A name that is not in ``ACCEPTED_MODELS`` falls back to
        ``"bert-base-arabertv02"`` after logging a warning.

    keep_emojis(:obj: `bool`): don't remove emojis while preprocessing. Defaults to False

    remove_html_markup(:obj: `bool`): Whether to remove html artifacts,
        should be set to False when preprocessing TyDi QA. Defaults to True

    replace_urls_emails_mentions(:obj: `bool`): Whether to replace email urls
        and mentions by special tokens. Defaults to True
    """
    model_name = model_name.replace("aubmindlab/","")

    if model_name not in ACCEPTED_MODELS:
        logging.warning(
            "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
        )
        self.model_name = "bert-base-arabertv02"
    else:
        self.model_name = model_name

    if self.model_name in SEGMENTED_MODELS:
        logging.info("Selected Model requires pre-segmentation, Initializing FarasaSegmenter")
        try:
            from farasa.segmenter import FarasaSegmenter

            self.farasa_segmenter = FarasaSegmenter(interactive=True)
        except ImportError:
            # Only a failed import means "farasapy is not installed". Any
            # other failure (e.g. the segmenter itself failing to start) is
            # allowed to propagate instead of being mislabelled by the
            # message below. Note that self.farasa_segmenter stays unset here.
            logging.warning("farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy")
    else:
        logging.info("Selected Model doesn't require pre-segmentation, skipping FarasaSegmenter initialization")

    self.keep_emojis = keep_emojis
    if self.keep_emojis:
        # Imported lazily so the emoji package is only needed when requested.
        import emoji

        self.emoji = emoji
        if self.model_name in SEGMENTED_MODELS:
            logging.warning("Keeping tweets with Farasa Segmentation is 10 times slower")

    self.remove_html_markup = remove_html_markup
    self.replace_urls_emails_mentions = replace_urls_emails_mentions
def __init__(
    self,
    unk_token="<UNK>",
    pad_token="<PAD>",
    segment=False,
    vocab_size=10000,
    segm_token="+",
    clean=False,
    normalize=False,
):
    """Constructor

    Args:
        unk_token (str, optional): reserved token for unknowns. Defaults to "<UNK>".
        pad_token (str, optional): reserved token for padding. Defaults to "<PAD>".
        segment (bool, optional): segment using farasa. Defaults to False.
        vocab_size (int, optional): max number of vocabulary. Defaults to 10000.
        segm_token (str, optional): reserved token for segmentation. Defaults to '+'.
        clean (bool, optional): remove tashkeel, english and special chars. Defaults to False.
        normalize (bool, optional): normalize chars. Defaults to False.
    """
    self.segm_token = segm_token
    self.vocab_size = vocab_size
    self.unk_token = unk_token
    self.pad_token = pad_token
    self.segment = segment
    self.clean = clean
    self.normalize = normalize
    self.vocab = None  # to be filled by child classes

    # The dictionaries are looked up relative to this source file.
    self.rel_path = os.path.dirname(__file__)
    norm_dict_path = os.path.join(
        self.rel_path, "dictionaries/normalization_dictionary.pl")
    cach_dict_path = os.path.join(self.rel_path, "dictionaries/cached.pl")

    # NOTE(review): pickle.load can execute arbitrary code; this is only safe
    # as long as both files are trusted files shipped with the package.
    with open(norm_dict_path, "rb") as norm_file:
        self.norm_dict = pickle.load(norm_file)
    with open(cach_dict_path, "rb") as cache_file:
        self.cached = pickle.load(cache_file)

    if self.segment:
        print("Initializing Farasa")
        # Silence whatever is printed while the segmenter starts up. The
        # try/finally guarantees stdout is restored (and the null-device
        # handle closed) even when the segmenter fails to initialise.
        old_stdout = sys.stdout
        with open(os.devnull, "w") as devnull:
            sys.stdout = devnull
            try:
                self.segmenter = FarasaSegmenter(interactive=True)
            finally:
                sys.stdout = old_stdout
def main(_):
    """Preprocess a question-answering JSON file and write the result.

    Reads ``FLAGS.input_file``, runs ``clean_preprocess`` over every paragraph
    context, every question and the first answer of every question, recomputes
    that answer's ``answer_start`` inside the preprocessed context, and writes
    the result to ``FLAGS.output_file`` as
    ``{"data": ..., "version": "1.1", "preprocess": "True"}``.

    Args:
        _: unused positional argument (presumably supplied by the app runner).
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.do_farasa_tokenization:
        if FLAGS.use_farasapy:
            from farasa.segmenter import FarasaSegmenter

            farasa_segmenter = FarasaSegmenter(interactive=True)
        else:
            from py4j.java_gateway import JavaGateway

            gateway = JavaGateway.launch_gateway(
                classpath=FLAGS.path_to_farasa)
            farasa_segmenter = gateway.jvm.com.qcri.farasa.segmenter.Farasa()
    else:
        # Bind the name that is actually used below; the previous code bound
        # `farasa` here, so every run without Farasa raised NameError.
        farasa_segmenter = None

    def _clean(text):
        # Single place for the preprocessing options shared by all fields.
        return clean_preprocess(
            text,
            do_farasa_tokenization=FLAGS.do_farasa_tokenization,
            farasa=farasa_segmenter,
            use_farasapy=FLAGS.use_farasapy,
        )

    with tf.gfile.Open(FLAGS.input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph["context"] = _clean(paragraph["context"])
            for qas in paragraph["qas"]:
                qas["question"] = _clean(qas["question"])
                # Only the first answer of each question is processed.
                answer = qas["answers"][0]
                answer["text"] = _clean(answer["text"])
                # Preprocessing changes character offsets, so the answer is
                # located again inside the preprocessed context.
                answer["answer_start"] = paragraph["context"].find(
                    answer["text"])
                if answer["answer_start"] == -1:
                    tf.logging.warning(
                        "Could not find answer for question '%s' : '%s' vs. '%s'",
                        qas["id"],
                        paragraph["context"],
                        answer["text"],
                    )

    input_data = {
        "data": input_data,
        "version": "1.1",
        "preprocess": "True",
    }
    with tf.gfile.Open(FLAGS.output_file, "w") as writer:
        json.dump(input_data, writer)
def __init__(self):
    """Load both translation checkpoints and start the Farasa segmenter.

    The two models share every loading option except the checkpoint
    directory; they are exposed both as attributes and through
    ``self.models`` keyed by ``"<src>2<tgt>"``.
    """
    # Loading options common to both directions.
    shared_options = {
        "checkpoint_file": 'checkpoint_best.pt',
        "data_name_or_path": 'data-bin',
        "bpe": 'subword_nmt',
        "bpe_codes": 'data-bin/code',
    }

    self.langs = ["zh -> ar", "ar -> zh", "ar -> fr", "fr -> ar"]

    self.model_ar2zh = TransformerModel.from_pretrained(
        "checkpoints-ar2zh", **shared_options)
    self.model_zh2ar = TransformerModel.from_pretrained(
        "checkpoints-zh2ar", **shared_options)

    self.segmenter = FarasaSegmenter(interactive=True)

    self.models = {}
    self.models["ar2zh"] = self.model_ar2zh
    self.models["zh2ar"] = self.model_zh2ar
class Translator():
    """Arabic <-> Chinese translator backed by two pretrained models.

    Arabic input is segmented with Farasa before translation, and the
    segmentation markers are removed again from Arabic output.
    """

    def __init__(self):
        """Load both translation checkpoints and start the Farasa segmenter."""
        # NOTE(review): "ar -> fr" / "fr -> ar" are advertised here but only
        # the ar2zh and zh2ar models are registered in self.models below.
        self.langs = ["zh -> ar", "ar -> zh", "ar -> fr", "fr -> ar"]
        self.model_ar2zh = TransformerModel.from_pretrained(
            "checkpoints-ar2zh",
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path='data-bin',
            bpe='subword_nmt',
            bpe_codes='data-bin/code')
        self.model_zh2ar = TransformerModel.from_pretrained(
            "checkpoints-zh2ar",
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path='data-bin',
            bpe='subword_nmt',
            bpe_codes='data-bin/code')
        self.segmenter = FarasaSegmenter(interactive=True)
        self.models = {"ar2zh": self.model_ar2zh, "zh2ar": self.model_zh2ar}

    def supported_languages(self):
        """Return the list of language-pair labels stored in ``self.langs``."""
        return self.langs

    def translate(self, src, tgt, text):
        """Translate ``text`` from language ``src`` to language ``tgt``.

        Args:
            src (str): source language code, e.g. ``"ar"`` or ``"zh"``.
            tgt (str): target language code.
            text (str): text to translate.

        Returns:
            The model output, with ``"+ "`` markers removed when the target
            language is Arabic.

        Raises:
            KeyError: if no model is registered under ``"<src>2<tgt>"``.
        """
        # chinese is segmented arabic is not
        model = self.models[src + "2" + tgt]
        model.cuda()
        if src == "ar":
            text = self.segment_ar(text)
            text = text.replace("+", "+ ")
        output = model.translate(text)
        if tgt == "ar":
            output = output.replace("+ ", "")
        return output

    def segment_ar(self, sent: str):
        """Segment an Arabic sentence and put a space after every ``+``.

        The segmenter output is split on single spaces; each token is then
        split right after every ``+`` and all pieces are re-joined with
        single spaces.

        Args:
            sent (str): sentence to segment.

        Returns:
            str: the space-separated pieces, stripped of outer whitespace.
        """
        segmented = self.segmenter.segment(sent)
        pieces = []
        for tok in segmented.split(" "):
            # The look-behind splits *after* each '+', keeping it attached to
            # the preceding piece.
            pieces.extend(re.split(r"(?<=[+])", tok))
        # One join instead of repeated string concatenation.
        return " ".join(pieces).strip()
def transform(self, sentences_list, extract_and_paste_emojies=False):
    """
    Transform the data by applying all pre-processing steps to it.

    Parameters
    ----------
    :param sentences_list: list (of arabic sentences)
        list of sentences to apply the function on. Each sentence is treated independently
    :param extract_and_paste_emojies: boolean. Default: False
        whether to handle emojis in a special way. Currently emojis are only handled in one
        specific way: they are extracted from the original sentence and appended at the end
        of the processed sentence (not ideal)
        TODO: handle emojis in a better way (leave them in place in the sentence)

    :return: list
        list of transformed arabic sentences, in input order. When
        ``self.use_default_farsa_preprocess`` is false the sentences are returned
        unchanged (as a new list); only the default Farasa case is supported.
    """
    # The flag does not change between sentences, so decide once. This also
    # avoids starting an interactive Farasa segmenter that would never be used.
    if not self.use_default_farsa_preprocess:
        return list(sentences_list)

    farasa_segmenter = FarasaSegmenter(interactive=True)
    punctuation_chars = set(string.punctuation)
    new_sentences_list = []
    for cur_text in sentences_list:
        preprocessed_text = preprocess(cur_text, do_farasa_tokenization=True,
                                       farasa=farasa_segmenter, use_farasapy=True)
        # removal of punctuation-only words (e.g., '?', '!?!')
        kept_words = [
            cur_word for cur_word in preprocessed_text.split(" ")
            if not all(char in punctuation_chars for char in cur_word)
        ]
        if extract_and_paste_emojies:
            # Emojis are taken from the *original* text, not the processed one.
            kept_words.extend(self.extract_emojis(text=cur_text))
        new_sentences_list.append(' '.join(kept_words))
    return new_sentences_list
#%% import pandas as pd from sklearn.model_selection import train_test_split from preprocess_arabert import preprocess from tqdm import tqdm tqdm.pandas() import arabert import sys sys.path.append("arabert") from arabert import modeling, optimization, tokenization from arabert.run_classifier import input_fn_builder, model_fn_builder from farasa.segmenter import FarasaSegmenter farasa_segmenter = FarasaSegmenter(interactive=True) # gateway = JavaGateway.launch_gateway(classpath='./PATH_TO_FARASA/FarasaSegmenterJar.jar') # farasa = gateway.jvm.com.qcri.farasa.segmenter.Farasa() class Dataset: def __init__( self, name, train, test, label_list, train_InputExamples=None, test_InputExamples=None, train_features=None, test_features=None,
# https://r12a.github.io/scripts/tutorial/summaries/arabic sample =\ ''' يُشار إلى أن اللغة العربية يتحدثها أكثر من 422 مليون نسمة ويتوزع متحدثوها في المنطقة المعروفة باسم الوطن العربي بالإضافة إلى العديد من المناطق الأخرى المجاورة مثل الأهواز وتركيا وتشاد والسنغال وإريتريا وغيرها. وهي اللغة الرابعة من لغات منظمة الأمم المتحدة الرسمية الست. ''' ''' --------------------- non interactive mode --------------------- ''' print("original sample:", sample) print('----------------------------------------') print("Farasa features, noninteractive mode.") print('----------------------------------------') segmenter = FarasaSegmenter() segmented = segmenter.segment(sample) print("sample segmented:", segmented) print("----------------------------------------------") stemmer = FarasaStemmer() stemmed = stemmer.stem(sample) print("sample stemmed:", stemmed) print("----------------------------------------------") pos_tagger = FarasaPOSTagger() pos_tagged = pos_tagger.tag(sample) print("sample POS Tagged", pos_tagged) print("----------------------------------------------") named_entity_recognizer = FarasaNamedEntityRecognizer()
class BaseTokenizer:
    """
    Base Tokenizer that implements the basic functionalities of a tokenizer.

    ``encode``, ``decode``, ``tokenize`` and ``detokenize`` raise
    ``NotImplementedError`` here. ``encode_sentences`` additionally calls
    ``self._tokens_list()``, which is not defined in this class (presumably
    provided by subclasses).
    """

    def __init__(
        self,
        unk_token="<UNK>",
        pad_token="<PAD>",
        segment=False,
        vocab_size=10000,
        segm_token="+",
        clean=False,
        normalize=False,
    ):
        """Constructor

        Args:
            unk_token (str, optional): reserved token for unknowns. Defaults to "<UNK>".
            pad_token (str, optional): reserved token for padding. Defaults to "<PAD>".
            segment (bool, optional): segment using farasa. Defaults to False.
            vocab_size (int, optional): max number of vocabulary. Defaults to 10000.
            segm_token (str, optional): reserved token for segmentation. Defaults to '+'.
            clean (bool, optional): remove tashkeel, english and special chars. Defaults to False.
            normalize (bool, optional): normalize chars. Defaults to False.
        """
        self.segm_token = segm_token
        self.vocab_size = vocab_size
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.segment = segment
        self.clean = clean
        self.normalize = normalize

        # The dictionaries are looked up relative to this source file.
        self.rel_path = os.path.dirname(__file__)
        norm_dict_path = os.path.join(
            self.rel_path, "dictionaries/normalization_dictionary.pl")
        cach_dict_path = os.path.join(self.rel_path, "dictionaries/cached.pl")

        # NOTE(review): pickle.load can execute arbitrary code; this is only
        # safe as long as both files are trusted files shipped with the package.
        with open(norm_dict_path, "rb") as norm_file:
            self.norm_dict = pickle.load(norm_file)
        with open(cach_dict_path, "rb") as cache_file:
            self.cached = pickle.load(cache_file)

        if self.segment:
            print("Initializing Farasa")
            # Silence whatever is printed while the segmenter starts up. The
            # try/finally guarantees stdout is restored (and the null-device
            # handle closed) even when the segmenter fails to initialise.
            old_stdout = sys.stdout
            with open(os.devnull, "w") as devnull:
                sys.stdout = devnull
                try:
                    self.segmenter = FarasaSegmenter(interactive=True)
                finally:
                    sys.stdout = old_stdout

    def process_data(self, file_path):
        """
        Read, segment, clean and normalize the data, then write it to
        ``data/raw/train.txt``.

        Args:
            file_path (str): the directory of the data to read
        """
        with open(file_path, "r") as f:
            print("Reading the data ...")
            self.corpus = f.read()

        if self.segment:
            print("Segmenting the data ...")
            self.corpus = self.segmenter.segment(self.corpus)
            # Plain replacement: segm_token is literal text, not a regex
            # replacement template (a backslash in it must not be interpreted).
            self.corpus = self.corpus.replace("+", self.segm_token)

        if self.clean:
            print("Cleaning the data ...")
            self.corpus = clean_data(self.corpus)

        if self.normalize:
            print("Normalizing the data ...")
            self.corpus = normalize_data(self.corpus, self.norm_dict)

        Path("data/raw").mkdir(parents=True, exist_ok=True)

        # The whole corpus is written as the training file; splitting into
        # train/valid/test via _split_corpus() is currently not used here.
        self._write_data("data/raw/train.txt", self.corpus)
        del self.corpus

    def _get_tokens_frequency_quickly(self, file_path):
        """
        Get the tokens frequency quickly using memory mapping

        The file is read in chunks of ``1e9`` bytes and each chunk is split
        on single spaces, so a token that straddles a chunk boundary is
        counted as two separate pieces.

        Args:
            file_path (str): the directory of the data to read

        Returns:
            Dict: frequency based dictionary
        """
        encoding = "utf8"
        size_to_read = int(1e9)
        freq = Counter()
        with open(file_path, "r", encoding=encoding, errors="ignore") as f:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
                total_size = m.size()
                with tqdm(total=int(total_size / size_to_read)) as pbar:
                    while m.tell() < total_size:
                        data = m.read(size_to_read)
                        # A chunk may end inside a multi-byte UTF-8 sequence
                        # (up to 4 bytes long), so pull in up to 3 more bytes
                        # until the chunk decodes cleanly.
                        for _ in range(3):
                            try:
                                cur_txt = data.decode(encoding)
                                break
                            except UnicodeDecodeError:
                                data += m.read(1)
                        else:
                            # Still undecodable: the bytes are invalid rather
                            # than merely split, so drop them (matching the
                            # errors="ignore" the file is opened with).
                            cur_txt = data.decode(encoding, errors="ignore")
                        freq.update(cur_txt.split(" "))
                        pbar.update(1)
        return freq

    def _write_data(self, path, data):
        """
        Write the string data to a path

        Args:
            path (str): the file to write to
            data (str): the text to write

        Raises:
            FileNotFoundError: if the parent directory of ``path`` does not exist.
        """
        with open(path, "w") as out_file:
            out_file.write(data)

    def _split_corpus(self):
        """
        Split the data into train, valid and test

        The last 20% of ``self.corpus`` becomes the test split; the remaining
        80% is split 80/20 again into train and valid.

        Returns:
            Tuple: train, valid, test
        """
        split_length = int(len(self.corpus) * 0.8)
        trainval_text, test_text = (
            self.corpus[:split_length],
            self.corpus[split_length:],
        )
        split_length = int(len(trainval_text) * 0.8)
        train_text, val_text = (
            trainval_text[:split_length],
            trainval_text[split_length:],
        )
        return train_text, val_text, test_text

    def _get_tokens_frequency(self, file_path):
        """
        Get tokens frequency using a dictionary

        Args:
            file_path (str): file path to read

        Returns:
            dict : dict containing frequency
        """
        with open(file_path, "r") as in_file:
            text = in_file.read()
        tokens_frequency = defaultdict(int)
        for word in text.split(" "):
            tokens_frequency[word] += 1
        return dict(tokens_frequency)

    def _split_word(self, word, number_of_subwords):
        """Split a word into a specific number of sub-words

        Every piece except the first is prefixed with ``"##"``.

        Args:
            word (str): word input
            number_of_subwords (int): number of subtokens to generate from the word

        Returns:
            list: list of subwords
        """
        assert number_of_subwords > 0

        def _split(_word, _number_of_subwords):
            groups = []
            if _number_of_subwords == 1:
                groups.append(["##" + _word])
            else:
                for i in range(1, len(_word), 1):
                    groups.extend(
                        ["##" + _word[:i], *group]
                        for group in _split(_word[i:], _number_of_subwords - 1)
                        if len(group) == _number_of_subwords - 1)
            return groups

        groups_of_subwords = _split(word, number_of_subwords)
        out_groups = []
        for group in groups_of_subwords:
            # The first piece starts the word, so it carries no "##" marker.
            group[0] = group[0].replace("##", "")
            out_groups.append(group)
        return out_groups

    def _split_word_cached(self, word, number_of_subwords):
        """Faster version of word splitting

        Looks up precomputed split patterns in ``self.cached`` instead of
        enumerating them recursively.

        Args:
            word (word): word to be split
            number_of_subwords (int): number of subwords to split the word to

        Returns:
            list: subwords
        """
        if number_of_subwords == 1:
            return [[word]]
        n = len(word) - 1
        all_binaries = self.cached[n, number_of_subwords - 1]
        return [split_on_binary(word, binary) for binary in all_binaries]

    def _tokenize_from_dict(self, text, freq_dict, cache=False):
        """Tokenize using the frequency dictionary

        Words found in ``freq_dict`` are kept whole. Other words are split
        into the smallest number of pieces that are all in ``freq_dict``;
        among such splits the one with the highest summed frequency wins.
        Words with no valid split become ``self.unk_token``.

        Args:
            text (str): input string
            freq_dict (dict): token -> frequency mapping; must be non-empty
            cache (bool, optional): use the precomputed split table. Defaults to False.

        Returns:
            list: generated tokens
        """
        assert freq_dict
        output_tokens = []
        for word in text.split():
            if word in freq_dict:
                output_tokens.append(word)
                continue

            # Reset per word: a one-character word never enters the loop
            # below, and must not reuse the previous word's result.
            groups_of_valid_subwords = []
            for i in range(2, len(word) + 1, 1):
                if cache:
                    groups_of_subwords = self._split_word_cached(word, i)
                else:
                    groups_of_subwords = self._split_word(word, i)

                # keep only the groups whose pieces are all known
                groups_of_valid_subwords = [
                    group for group in groups_of_subwords
                    if all(subword in freq_dict for subword in group)
                ]
                if groups_of_valid_subwords:
                    break

            if not groups_of_valid_subwords:
                output_tokens.append(self.unk_token)
            else:
                sorted_groups_of_valid_subwords = sorted(
                    groups_of_valid_subwords,
                    key=lambda group: sum(freq_dict[subword]
                                          for subword in group),
                )
                # Highest total frequency; ties resolve to the last group.
                output_tokens.extend(
                    str(token)
                    for token in sorted_groups_of_valid_subwords[-1])
        return output_tokens

    def _truncate_dict(self, freq_dict):
        """Truncate a frequency dictionary and add reserved tokens

        Args:
            freq_dict (dict): frequency dictionary

        Returns:
            dict: truncated dictionary based on the vocab size
        """
        most_frequent = sorted(
            freq_dict.items(), key=lambda x: x[1], reverse=True
        )[:self.vocab_size]

        # Reserved tokens come first, with a frequency of -1.
        limited_tokens_frequency = dict()
        limited_tokens_frequency[self.unk_token] = -1
        limited_tokens_frequency[self.pad_token] = -1
        limited_tokens_frequency.update(most_frequent)
        return limited_tokens_frequency

    def encode(self, text):
        """
        Convert text to ids
        """
        raise NotImplementedError

    def decode(self, encoded):
        """
        Convert ids to string
        """
        # Raise (not return) so an unimplemented decode fails loudly, like
        # the other abstract methods of this class.
        raise NotImplementedError

    def tokenize(self, text):
        """
        Convert text to tokens
        """
        raise NotImplementedError

    def detokenize(self, tokens):
        """
        Convert tokens to text
        """
        raise NotImplementedError

    def encode_and_save(self):
        """
        Encode all the files then save as numpy

        Every file in ``data/raw/`` is encoded and saved under
        ``data/encoded/`` with its last four characters (presumably the
        ``.txt`` extension) replaced by ``.npy``.
        """
        Path("data/encoded").mkdir(parents=True, exist_ok=True)
        for file_path in os.listdir("data/raw/"):
            with open(f"data/raw/{file_path}", "r") as raw_file:
                ids = self.encode(raw_file.read())
            np.save(f"data/encoded/{file_path[:-4]}.npy", ids)

    def encode_sentences(self, sentences, max_length=20):
        """
        Encode a list of sentences using the trained model

        Each sentence is truncated or padded with ``self.pad_token`` to
        exactly ``max_length`` tokens.

        Args:
            sentences (list): list of sentences
            max_length (int, optional): specify the max length of encodings. Defaults to 20.

        Returns:
            [np.array]: numpy array of encodings

        Raises:
            ValueError: if a token is missing from the tokens list.
        """
        # Fetch the token list once instead of once per encoded token.
        tokens_list = self._tokens_list()
        encodings = []
        for sent in sentences:
            tokens = self.tokenize(sent)
            encoded = [
                tokens_list.index(
                    tokens[i] if i < len(tokens) else self.pad_token)
                for i in range(max_length)
            ]
            encodings.append(encoded)
        return np.array(encodings)
def __init__(
    self,
    model_name: str,
    keep_emojis: bool = False,
    remove_html_markup: bool = True,
    replace_urls_emails_mentions: bool = True,
    strip_tashkeel: bool = True,
    strip_tatweel: bool = True,
    insert_white_spaces: bool = True,
    remove_non_digit_repetition: bool = True,
    replace_slash_with_dash: bool = None,
    map_hindi_numbers_to_arabic: bool = None,
    apply_farasa_segmentation: bool = None,
):
    """Store the preprocessing options for ``model_name``.

    The ``"Ebtihal/"`` and ``"ebtihalaziz/"`` prefixes are removed from
    ``model_name``; a name missing from ``ACCEPTED_MODELS`` is replaced by
    ``"Ebtihal/AraDiaBERTo_V2"`` after a warning.

    The three options defaulting to ``None`` are derived from the model when
    left unset: ``apply_farasa_segmentation`` is true for models in
    ``SEGMENTED_MODELS``; ``replace_slash_with_dash`` and
    ``map_hindi_numbers_to_arabic`` are true for models in
    ``SECOND_GEN_MODELS``. An explicit value always wins.
    """
    bare_name = model_name.replace("Ebtihal/", "").replace("ebtihalaziz/", "")
    if bare_name in ACCEPTED_MODELS:
        self.model_name = bare_name
    else:
        logging.warning(
            """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
        )
        self.model_name = "Ebtihal/AraDiaBERTo_V2"

    # Farasa segmentation: derive from the model unless explicitly requested.
    if apply_farasa_segmentation is not None:
        if apply_farasa_segmentation == False and self.model_name in SEGMENTED_MODELS:
            logging.warning(
                "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
            )
        self.apply_farasa_segmentation = apply_farasa_segmentation
    else:
        self.apply_farasa_segmentation = self.model_name in SEGMENTED_MODELS

    if self.apply_farasa_segmentation:
        try:
            from farasa.segmenter import FarasaSegmenter

            self.farasa_segmenter = FarasaSegmenter(interactive=True)
        except ModuleNotFoundError:
            # The segmenter attribute is left unset in this case.
            logging.error(
                "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
            )

    self.keep_emojis = keep_emojis
    if self.keep_emojis:
        import emoji

        self.emoji = emoji
        if self.apply_farasa_segmentation:
            logging.warning(
                "Keeping tweets with Farasa Segmentation is 10 times slower"
            )

    # Options stored exactly as given.
    self.remove_html_markup = remove_html_markup
    self.replace_urls_emails_mentions = replace_urls_emails_mentions
    self.strip_tashkeel = strip_tashkeel
    self.strip_tatweel = strip_tatweel
    self.insert_white_spaces = insert_white_spaces
    self.remove_non_digit_repetition = remove_non_digit_repetition

    # Options that default to "on" for second-generation models.
    self.replace_slash_with_dash = (
        self.model_name in SECOND_GEN_MODELS
        if replace_slash_with_dash is None
        else replace_slash_with_dash
    )
    self.map_hindi_numbers_to_arabic = (
        self.model_name in SECOND_GEN_MODELS
        if map_hindi_numbers_to_arabic is None
        else map_hindi_numbers_to_arabic
    )
#%% from farasa.segmenter import FarasaSegmenter from tqdm import tqdm import re import editdistance import pyarabic.araby as araby from desegmentors import desegmentword #%% fs = FarasaSegmenter(interactive=True) # %% with open('data/100ksentences.csv', 'r', encoding='utf-8') as f: text = f.read() # %% all_non_arabic_characters = r"[^\u0621-\u063A\u0641-\u064A ]+" def normalize_alef(s): s = s.replace(araby.ALEF_HAMZA_ABOVE, araby.ALEF) s = s.replace(araby.ALEF_HAMZA_BELOW, araby.ALEF) s = s.replace(araby.ALEF_MADDA, araby.ALEF) return s # %% #Clean and get original and segmented words all_words = [] for line in tqdm(text.split('\n')): cleaned_line = normalize_alef(line) cleaned_line = re.sub(all_non_arabic_characters, "", cleaned_line)
def __init__(
    self,
    model_name: str,
    remove_html_markup: bool = True,
    replace_urls_emails_mentions: bool = True,
    strip_tashkeel: bool = True,
    strip_tatweel: bool = True,
    insert_white_spaces: bool = True,
    remove_non_digit_repetition: bool = True,
    keep_emojis: bool = None,
    replace_slash_with_dash: bool = None,
    map_hindi_numbers_to_arabic: bool = None,
    apply_farasa_segmentation: bool = None,
):
    """Store the preprocessing options for ``model_name``.

    The ``"aubmindlab/"`` and ``"wissamantoun/"`` prefixes are removed from
    ``model_name``; a name missing from ``ACCEPTED_MODELS`` is replaced by
    ``"bert-base-arabertv02"`` after a warning.

    The four options defaulting to ``None`` are derived from the model when
    left unset: ``apply_farasa_segmentation`` from ``SEGMENTED_MODELS``,
    ``keep_emojis`` from ``TWEET_MODELS``, and ``replace_slash_with_dash`` /
    ``map_hindi_numbers_to_arabic`` from ``SECOND_GEN_MODELS``. An explicit
    value always wins. ``self.REJECTED_CHARS_REGEX`` is chosen according to
    ``keep_emojis`` and the model generation.
    """
    bare_name = model_name.replace("aubmindlab/", "").replace("wissamantoun/", "")
    if bare_name in ACCEPTED_MODELS:
        self.model_name = bare_name
    else:
        logging.warning(
            """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
        )
        self.model_name = "bert-base-arabertv02"

    # Farasa segmentation: derive from the model unless explicitly requested.
    if apply_farasa_segmentation is not None:
        if (apply_farasa_segmentation == False
                and self.model_name in SEGMENTED_MODELS):
            logging.warning(
                "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
            )
        self.apply_farasa_segmentation = apply_farasa_segmentation
    else:
        self.apply_farasa_segmentation = self.model_name in SEGMENTED_MODELS

    if self.apply_farasa_segmentation:
        try:
            from farasa.segmenter import FarasaSegmenter

            self.farasa_segmenter = FarasaSegmenter(interactive=True)
        except ModuleNotFoundError:
            # The segmenter attribute is left unset in this case.
            logging.error(
                "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
            )

    # Emoji handling: derive from the model unless explicitly requested.
    if keep_emojis is not None:
        if keep_emojis == False and self.model_name in TWEET_MODELS:
            logging.warning(
                "The selected model_name is trained on emojis, but keep_emojis was set to False!"
            )
        self.keep_emojis = keep_emojis
    else:
        self.keep_emojis = self.model_name in TWEET_MODELS

    if not self.keep_emojis:
        self.REJECTED_CHARS_REGEX = (REJECTED_CHARS_REGEX
                                     if self.model_name in SECOND_GEN_MODELS
                                     else REJECTED_CHARS_REGEXV2)
    else:
        import emoji

        self.emoji = emoji
        if self.apply_farasa_segmentation:
            logging.warning(
                "Keeping tweets with Farasa Segmentation is 10 times slower"
            )
        # Every known emoji is added to the set of characters that survive
        # the "remove unwanted characters" substitution.
        emoji_regex = "".join(self.emoji.UNICODE_EMOJI["en"])
        allowed_chars = (CHARS_REGEX
                         if self.model_name in SECOND_GEN_MODELS
                         else CHARS_REGEXV2)
        self.REJECTED_CHARS_REGEX = "[^%s%s]" % (allowed_chars, emoji_regex)

    # Options stored exactly as given.
    self.remove_html_markup = remove_html_markup
    self.replace_urls_emails_mentions = replace_urls_emails_mentions
    self.strip_tashkeel = strip_tashkeel
    self.strip_tatweel = strip_tatweel
    self.insert_white_spaces = insert_white_spaces
    self.remove_non_digit_repetition = remove_non_digit_repetition

    # Options that default to "on" for second-generation models.
    self.replace_slash_with_dash = (
        self.model_name in SECOND_GEN_MODELS
        if replace_slash_with_dash is None
        else replace_slash_with_dash
    )
    self.map_hindi_numbers_to_arabic = (
        self.model_name in SECOND_GEN_MODELS
        if map_hindi_numbers_to_arabic is None
        else map_hindi_numbers_to_arabic
    )
class ArabertPreprocessor: """ A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo. It also can unprocess the text ouput of the generated text Args: model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Will default to a base Arabic preprocessor if model name was not found. Current accepted models are: - "bert-base-arabertv01" - "bert-base-arabert" - "bert-base-arabertv02" - "bert-base-arabertv2" - "bert-base-arabertv02-twitter" - "bert-large-arabertv02" - "bert-large-arabertv2" - "bert-large-arabertv02-twitter" - "araelectra-base" - "araelectra-base-discriminator" - "araelectra-base-generator" - "araelectra-base-artydiqa" - "aragpt2-base" - "aragpt2-medium" - "aragpt2-large" - "aragpt2-mega" remove_html_markup(:obj: `bool`, `optional`, defaults to :obj:`True`): Whether to remove html artfacts, should be set to False when preprocessing TyDi QA. replace_urls_emails_mentions(:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to replace email urls and mentions by special tokens. strip_tashkeel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA). strip_tatweel(:obj:`bool`, `optional`, defaults to :obj:`True`): remove tatweel '\\u0640'. insert_white_spaces(:obj:`bool`, `optional`, defaults to :obj:`True`): insert whitespace before and after all non Arabic digits or English digits or Arabic and English Alphabet or the 2 brackets, then inserts whitespace between words and numbers or numbers and words. remove_non_digit_repetition(:obj:`bool`, `optional`, defaults to :obj:`True`): replace repetition of more than 2 non-digit character with 2 of this character. replace_slash_with_dash(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in AraBERTv02, AraELECTRA and AraGPT2. Set to False to force disable, and True to force enable. 
Replaces the "/" with "-", since "/" is missing from AraBERTv2, AraELECTRA and ARAGPT2 vocabulary. map_hindi_numbers_to_arabic(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in AraBERTv02, AraELECTRA and AraGPT2.Set to False to force disable, and True to force enable. Replaces hindi numbers with the corresponding Arabic one. ex: "١٩٩٥" --> "1995". This is behavior is present by default in AraBERTv1 and v2 (with pre-segmentation), and fixes the issue of caused by a bug when inserting white spaces. apply_farasa_segmentation(:obj:`bool`, `optional`, defaults to :obj:`None`): Will be automatically set to True in AraBERTv2, and AraBERTv1. Set to False to force disable, and True to force enable. keep_emojis(:obj:`bool`, `optional`, defaults to :obj:`None`): don't remove emojis while preprocessing. Will be automatically set to True in AraBERT trained on tweets. Returns: ArabertPreprocessor: A preprocessor instance Example: from preprocess import ArabertPreprocessor arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2") arabert_prep.preprocess("SOME ARABIC TEXT") """ def __init__( self, model_name: str, remove_html_markup: bool = True, replace_urls_emails_mentions: bool = True, strip_tashkeel: bool = True, strip_tatweel: bool = True, insert_white_spaces: bool = True, remove_non_digit_repetition: bool = True, keep_emojis: bool = None, replace_slash_with_dash: bool = None, map_hindi_numbers_to_arabic: bool = None, apply_farasa_segmentation: bool = None, ): model_name = model_name.replace("aubmindlab/", "").replace("wissamantoun/", "") if model_name not in ACCEPTED_MODELS: logging.warning( """Model provided is not in the accepted model list. 
Preprocessor will default to a base Arabic preprocessor""" ) self.model_name = "bert-base-arabertv02" else: self.model_name = model_name if apply_farasa_segmentation is None: if self.model_name in SEGMENTED_MODELS: self.apply_farasa_segmentation = True else: self.apply_farasa_segmentation = False else: if (apply_farasa_segmentation == False and self.model_name in SEGMENTED_MODELS): logging.warning( "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!" ) self.apply_farasa_segmentation = apply_farasa_segmentation if self.apply_farasa_segmentation: try: from farasa.segmenter import FarasaSegmenter self.farasa_segmenter = FarasaSegmenter(interactive=True) except ModuleNotFoundError: logging.error( "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy" ) if keep_emojis is None: if self.model_name in TWEET_MODELS: self.keep_emojis = True else: self.keep_emojis = False else: if keep_emojis == False and self.model_name in TWEET_MODELS: logging.warning( "The selected model_name is trained on emojis, but keep_emojis was set to False!" 
) self.keep_emojis = keep_emojis if self.keep_emojis: import emoji self.emoji = emoji if self.apply_farasa_segmentation: logging.warning( "Keeping tweets with Farasa Segmentation is 10 times slower" ) emoji_regex = "".join(list(self.emoji.UNICODE_EMOJI["en"].keys())) self.REJECTED_CHARS_REGEX = "[^%s%s]" % ( CHARS_REGEX if self.model_name in SECOND_GEN_MODELS else CHARS_REGEXV2, emoji_regex, ) else: self.REJECTED_CHARS_REGEX = (REJECTED_CHARS_REGEX if self.model_name in SECOND_GEN_MODELS else REJECTED_CHARS_REGEXV2) self.remove_html_markup = remove_html_markup self.replace_urls_emails_mentions = replace_urls_emails_mentions self.strip_tashkeel = strip_tashkeel self.strip_tatweel = strip_tatweel self.insert_white_spaces = insert_white_spaces self.remove_non_digit_repetition = remove_non_digit_repetition if replace_slash_with_dash is None: if self.model_name in SECOND_GEN_MODELS: self.replace_slash_with_dash = True else: self.replace_slash_with_dash = False else: self.replace_slash_with_dash = replace_slash_with_dash if map_hindi_numbers_to_arabic is None: if self.model_name in SECOND_GEN_MODELS: self.map_hindi_numbers_to_arabic = True else: self.map_hindi_numbers_to_arabic = False else: self.map_hindi_numbers_to_arabic = map_hindi_numbers_to_arabic def preprocess(self, text: str) -> str: """ Preprocess takes an input text line an applies the same preprocessing used in AraBERT pretraining, or according to settings Args: text (:obj:`str`): inout text string Returns: string: A preprocessed string depending on which model was selected """ if (self.model_name == "bert-base-arabert" or self.model_name == "bert-base-arabertv01"): return self._preprocess_v1( text, do_farasa_tokenization=self.apply_farasa_segmentation, ) if self.model_name in SECOND_GEN_MODELS: return self._preprocess_v2(text) return self._preprocess_v3(text) def unpreprocess(self, text: str, desegment: bool = True) -> str: """Re-formats the text to a classic format where punctuations, brackets, parenthesis 
are not seperated by whitespaces. The objective is to make the generated text of any model appear natural and not preprocessed. Args: text (:obj:`str`): input text to be un-preprocessed desegment (:obj:`bool`, optional): [whether or not to remove farasa pre-segmentation before].. Returns: str: The unpreprocessed (and possibly Farasa-desegmented) text. """ if self.apply_farasa_segmentation and desegment: text = self.desegment(text) # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple # https://stackoverflow.com/a/53436792/5381220 text = re.sub(WHITE_SPACED_DOUBLE_QUOTATION_REGEX, '"' + r"\1" + '"', text) text = re.sub(WHITE_SPACED_SINGLE_QUOTATION_REGEX, "'" + r"\1" + "'", text) text = re.sub(WHITE_SPACED_BACK_QUOTATION_REGEX, "\`" + r"\1" + "\`", text) text = re.sub(WHITE_SPACED_EM_DASH, "\—" + r"\1" + "\—", text) # during generation, sometimes the models don't put a space after the dot, this handles it text = text.replace(".", " . ") text = " ".join(text.split()) # handle decimals text = re.sub(r"(\d+) \. 
(\d+)", r"\1.\2", text) text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text) text = re.sub(LEFT_AND_RIGHT_SPACED_CHARS, r"\1", text) text = re.sub(LEFT_SPACED_CHARS, r"\1", text) text = re.sub(RIGHT_SPACED_CHARS, r"\1", text) return text def desegment(self, text: str) -> str: """ Use this function if sentence tokenization was done using `from arabert.preprocess_arabert import preprocess` with Farasa enabled AraBERT segmentation using Farasa adds a space after the '+' for prefixes, and after before the '+' for suffixes Example: >>> desegment('ال+ دراس +ات') الدراسات """ text = text.replace("+ ", "+") text = text.replace(" +", "+") text = " ".join( [self._desegmentword(word) for word in text.split(" ")]) return text def _desegmentword(self, orig_word: str) -> str: """ Word segmentor that takes a Farasa Segmented Word and removes the '+' signs Example: >>> _desegmentword("ال+يومي+ة") اليومية """ word = orig_word.replace("ل+ال+", "لل") if "ال+ال" not in orig_word: word = word.replace("ل+ال", "لل") word = word.replace("+", "") word = word.replace("للل", "لل") return word def _preprocess_v3(self, text: str) -> str: text = str(text) text = html.unescape(text) if self.strip_tashkeel: text = araby.strip_tashkeel(text) if self.strip_tatweel: text = araby.strip_tatweel(text) if self.replace_urls_emails_mentions: # replace all possible URLs for reg in URL_REGEXES: text = re.sub(reg, " [رابط] ", text) # REplace Emails with [بريد] for reg in EMAIL_REGEXES: text = re.sub(reg, " [بريد] ", text) # replace mentions with [مستخدم] text = re.sub(USER_MENTION_REGEX, " [مستخدم] ", text) if self.remove_html_markup: # remove html line breaks text = re.sub("<br />", " ", text) # remove html markup text = re.sub("</?[^>]+>", " ", text) if self.map_hindi_numbers_to_arabic: text = text.translate(HINDI_TO_ARABIC_MAP) # remove repeated characters >2 if self.remove_non_digit_repetition: text = self._remove_non_digit_repetition(text) # insert whitespace before and after all non Arabic digits or 
English Digits and Alphabet and the 2 brackets if self.insert_white_spaces: text = re.sub( "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z ])", r" \1 ", text, ) # re-fix brackets text = text.replace("[ رابط ]", "[رابط]") text = text.replace("[ بريد ]", "[بريد]") text = text.replace("[ مستخدم ]", "[مستخدم]") # insert whitespace between words and numbers or numbers and words text = re.sub( "(\d+)([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)", r" \1 \2 ", text, ) text = re.sub( "([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)(\d+)", r" \1 \2 ", text, ) # remove unwanted characters text = re.sub(self.REJECTED_CHARS_REGEX, " ", text) # remove extra spaces text = " ".join(text.replace("\uFE0F", "").split()) if self.apply_farasa_segmentation: if self.keep_emojis: new_text = [] for word in text.split(): if word in list(self.emoji.UNICODE_EMOJI["en"].keys()): new_text.append(word) else: new_text.append(self.farasa_segmenter.segment(word)) text = " ".join(new_text) else: text = self.farasa_segmenter.segment(text) return self._farasa_segment(text) # ALl the other models dont require Farasa Segmentation return text def _preprocess_v2(self, text: str) -> str: text = str(text) text = html.unescape(text) if self.strip_tashkeel: text = araby.strip_tashkeel(text) if self.strip_tatweel: text = araby.strip_tatweel(text) if self.replace_urls_emails_mentions: # replace all possible URLs for reg in URL_REGEXES: text = re.sub(reg, " [رابط] ", text) # REplace Emails with [بريد] for reg in EMAIL_REGEXES: text = re.sub(reg, " [بريد] ", text) # replace mentions with [مستخدم] text = re.sub(USER_MENTION_REGEX, " [مستخدم] ", text) if self.remove_html_markup: # remove html line breaks text = re.sub("<br />", " ", text) # remove html markup text = re.sub("</?[^>]+>", " ", text) if self.map_hindi_numbers_to_arabic: text = text.translate(HINDI_TO_ARABIC_MAP) # remove repeated characters >2 if self.remove_non_digit_repetition: text = self._remove_non_digit_repetition(text) 
# insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets if self.insert_white_spaces: text = re.sub( "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])", r" \1 ", text, ) # insert whitespace between words and numbers or numbers and words text = re.sub("(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text) text = re.sub("([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text) if self.replace_slash_with_dash: text = text.replace("/", "-") # remove unwanted characters text = re.sub(self.REJECTED_CHARS_REGEX, " ", text) # remove extra spaces text = " ".join(text.replace("\uFE0F", "").split()) if (self.model_name == "bert-base-arabertv2" or self.model_name == "bert-large-arabertv2"): if self.keep_emojis: new_text = [] for word in text.split(): if word in list(self.emoji.UNICODE_EMOJI["en"].keys()): new_text.append(word) else: new_text.append(self.farasa_segmenter.segment(word)) text = " ".join(new_text) else: text = self.farasa_segmenter.segment(text) return self._farasa_segment(text) # ALl the other models dont require Farasa Segmentation return text def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str: """ AraBERTv1 preprocessing Function """ text = str(text) if self.strip_tashkeel: text = araby.strip_tashkeel(text) text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text) text = re.sub("ـ", "", text) text = re.sub("[«»]", ' " ', text) if self.replace_urls_emails_mentions: # replace the [رابط] token with space if you want to clean links text = re.sub(REGEX_URL_STEP1, "[رابط]", text) text = re.sub(REGEX_URL_STEP2, "[رابط]", text) text = re.sub(REGEX_URL, "[رابط]", text) text = re.sub(REGEX_EMAIL, "[بريد]", text) text = re.sub(REGEX_MENTION, "[مستخدم]", text) text = re.sub("…", r"\.", text).strip() text = self._remove_redundant_punct(text) if self.replace_urls_emails_mentions: text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text) text = re.sub(r"\[ بريد \]|\[ 
بريد\]|\[بريد \]", " [بريد] ", text) text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", text) if self.remove_non_digit_repetition: text = self._remove_non_digit_repetition(text) if self.insert_white_spaces: text = re.sub( "([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])", r" \1 ", text, ) if do_farasa_tokenization: text = self._tokenize_arabic_words_farasa(text) text = " ".join(text.split()) return text def _farasa_segment(self, text: str) -> str: line_farasa = text.split() segmented_line = [] for index, word in enumerate(line_farasa): if word in ["[", "]"]: continue if word in ["رابط", "بريد", "مستخدم" ] and line_farasa[index - 1] in [ "[", "]", ]: segmented_line.append("[" + word + "]") continue if "+" not in word: segmented_line.append(word) continue segmented_word = self._split_farasa_output(word) segmented_line.extend(segmented_word) return " ".join(segmented_line) def _split_farasa_output(self, word: str) -> str: segmented_word = [] temp_token = "" for i, c in enumerate(word): if c == "+": # if the token is KAF, it could be a suffix or prefix if temp_token == "ك": # if we are at the second token, then KAF is surely a prefix if i == 1: segmented_word.append(temp_token + "+") temp_token = "" # If the KAF token is between 2 tokens elif word[i - 2] == "+": # if the previous token is prefix, then this KAF must be a prefix if segmented_word[-1][-1] == "+": segmented_word.append(temp_token + "+") temp_token = "" # else it is a suffix, this KAF could not be a second suffix else: segmented_word.append("+" + temp_token) temp_token = "" # if Kaf is at the end, this is handled with the statement after the loop elif temp_token in PREFIX_LIST: segmented_word.append(temp_token + "+") temp_token = "" elif temp_token in SUFFIX_LIST: segmented_word.append("+" + temp_token) temp_token = "" else: segmented_word.append(temp_token) temp_token = "" continue temp_token += c if temp_token != "": if temp_token in SUFFIX_LIST: segmented_word.append("+" + 
temp_token) else: segmented_word.append(temp_token) return segmented_word def _tokenize_arabic_words_farasa(self, line_input: str) -> str: if self.keep_emojis: # insert whitespace before and after all non Arabic digits or English Digits and Alphabet and the 2 brackets line_farasa = [] for word in line_input.split(): if word in list(self.emoji.UNICODE_EMOJI["en"].keys()): line_farasa.append(word) else: line_farasa.append(self.farasa_segmenter.segment(word)) else: line_farasa = self.farasa_segmenter.segment(line_input).split() segmented_line = [] for index, word in enumerate(line_farasa): if word in ["[", "]"]: continue if word in ["رابط", "بريد", "مستخدم" ] and line_farasa[index - 1] in [ "[", "]", ]: segmented_line.append("[" + word + "]") continue segmented_word = [] for token in word.split("+"): if token in PREFIX_LIST: segmented_word.append(token + "+") elif token in SUFFIX_LIST: segmented_word.append("+" + token) else: segmented_word.append(token) segmented_line.extend(segmented_word) return " ".join(segmented_line) def _remove_non_digit_repetition(self, text: str) -> str: """ :param text: the input text to remove elongation :return: delongated text """ # loop over the number of times the regex matched the text # OLD # for index_ in range(len(re.findall(REGEX_TATWEEL, text))): # elongation = re.search(REGEX_TATWEEL, text) # if elongation: # elongation_pattern = elongation.group() # elongation_replacement = elongation_pattern[0] # elongation_pattern = re.escape(elongation_pattern) # text = re.sub( # elongation_pattern, elongation_replacement, text, flags=re.MULTILINE # ) # else: # break # New text = MULTIPLE_CHAR_PATTERN.sub(r"\1\1", text) return text def _remove_redundant_punct(self, text: str) -> str: text_ = text result = re.search(REDUNDANT_PUNCT_PATTERN, text) dif = 0 while result: sub = result.group() sub = sorted(set(sub), key=sub.index) sub = " " + "".join(list(sub)) + " " text = "".join((text[:result.span()[0] + dif], sub, text[result.span()[1] + 
dif:])) text_ = "".join( (text_[:result.span()[0]], text_[result.span()[1]:])).strip() dif = abs(len(text) - len(text_)) result = re.search(REDUNDANT_PUNCT_PATTERN, text_) text = re.sub(r"\s+", " ", text) return text.strip()
from clean_vi_text import fix_contents import re import tnkeeh as tn from farasa.segmenter import FarasaSegmenter # for bulgarian and turkish from cube.api import Cube en_tok = MosesTokenizer(lang="en") en_normalizer = MosesPunctNormalizer() # TODO: change hardcoding of jar file to a arg from cli rdrsegmenter = VnCoreNLP("vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size="-Xmx500m") ar_segmenter = FarasaSegmenter() bg_cube = Cube(verbose=False) bg_cube.load("bg") tr_cube = Cube(verbose=False) tr_cube.load("tr") def clean_ar_text( text, segment=False, remove_special_chars=False, remove_english=False, normalize=False, remove_diacritics=False,
class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
    It can also un-preprocess the text output generated by those models.

    Args:

        model_name (:obj:`str`): model name from the HuggingFace Models page, with or without
            the "aubmindlab/" prefix (the prefix is stripped). A name that is not in
            ACCEPTED_MODELS falls back to "bert-base-arabertv02". Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.

            - :obj:`"bert-base-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.

            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"araelectra-base-discriminator"`: No farasa segmentation.
            - :obj:`"araelectra-base-generator"`: No farasa segmentation.

            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj: `bool`): don't remove emojis while preprocessing. Defaults to False

        remove_html_markup(:obj: `bool`): Whether to remove html artifacts,
            should be set to False when preprocessing TyDi QA. Defaults to True

        replace_urls_emails_mentions(:obj: `bool`): Whether to replace emails, urls and
            mentions by special tokens. Defaults to True

        strip_tashkeel(:obj: `bool`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA,
            DAMMA, KASRA, SUKUN, SHADDA). Defaults to True

        strip_tatweel(:obj: `bool`): remove tatweel '\\u0640'. Defaults to True

        insert_white_spaces(:obj: `bool`): insert whitespace before and after all non Arabic
            digits or English digits or Arabic and English Alphabet or the 2 brackets, then
            inserts whitespace between words and numbers or numbers and words. Defaults to True

        remove_elongation(:obj: `bool`): collapse every character run matched by
            ``regex_tatweel`` down to the first character of that run. Defaults to True

    Returns:

        ArabertPreprocessor: the preprocessor class

    Example:

        from preprocess import ArabertPreprocessor

        arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")

        arabert_prep.preprocess("SOME ARABIC TEXT")
    """

    def __init__(
        self,
        model_name,
        keep_emojis=False,
        remove_html_markup=True,
        replace_urls_emails_mentions=True,
        strip_tashkeel=True,
        strip_tatweel=True,
        insert_white_spaces=True,
        remove_elongation=True,
    ):
        """
        Store the preprocessing options and, for models listed in SEGMENTED_MODELS,
        try to start a FarasaSegmenter. The parameters are described in the class docstring.
        """
        model_name = model_name.replace("aubmindlab/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        if self.model_name in SEGMENTED_MODELS:
            logging.info(
                "Selected Model requires pre-segmentation, Initializing FarasaSegmenter"
            )
            try:
                from farasa.segmenter import FarasaSegmenter

                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except Exception:
                # Best effort: construction still succeeds without a segmenter.
                # `Exception` (instead of a bare except) keeps KeyboardInterrupt and
                # SystemExit from being swallowed here.
                logging.warning(
                    "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )
        else:
            logging.info(
                "Selected Model doesn't require pre-segmentation, skipping FarasaSegmenter initialization"
            )

        self.keep_emojis = keep_emojis
        if self.keep_emojis:
            # imported lazily so the emoji package is only needed when emojis are kept
            import emoji

            self.emoji = emoji
            if self.model_name in SEGMENTED_MODELS:
                logging.warning(
                    "Keeping tweets with Farasa Segmentation is 10 times slower"
                )

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_elongation = remove_elongation

    def preprocess(self, text):
        """
        Preprocess takes an input text line and applies the same preprocessing used in
        AraBERT pretraining

        Args:

            text (:obj:`str`): input text string (converted with ``str()`` first)

        Returns:

            string: A preprocessed string depending on which model was selected
        """
        # The two first-generation models use their own pipeline.
        if self.model_name == "bert-base-arabert":
            return self._old_preprocess(text, do_farasa_tokenization=True)

        if self.model_name == "bert-base-arabertv01":
            return self._old_preprocess(text, do_farasa_tokenization=False)

        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.remove_elongation:
            text = self._remove_elongation(text)

        if self.insert_white_spaces:
            # insert whitespace before and after all non Arabic digits or English Digits
            # and Alphabet and the 2 brackets.
            # Raw strings: `re` resolves the \uXXXX escapes itself, so the compiled
            # patterns are unchanged while the invalid "\d" / "\[" string escapes go away.
            text = re.sub(
                r"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                r"(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
            )
            text = re.sub(
                r"([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
            )

        # remove unwanted characters
        if self.keep_emojis:
            # iterating the mapping yields its keys, i.e. the emoji strings
            emoji_regex = "".join(self.emoji.UNICODE_EMOJI["en"])
            rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regex, " ", text)

        # remove the emoji variation selector and extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if self.model_name in ("bert-base-arabertv2", "bert-large-arabertv2"):
            if self.keep_emojis:
                # Segment word by word so emojis are passed through untouched.
                known_emojis = self.emoji.UNICODE_EMOJI["en"]
                new_text = []
                for word in text.split():
                    if word in known_emojis:
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa Segmentation
        return text

    def unpreprocess(self, text, desegment=True):
        """Re-formats the text to a classic format where punctuations, brackets, parenthesis
        are not separated by whitespaces.
        The objective is to make the generated text of any model appear natural and not
        preprocessed.

        Args:
            text (str): input text to be un-preprocessed
            desegment (bool, optional): [whether or not to remove farasa pre-segmentation
                before]. Defaults to True.

        Returns:
            str: The unpreprocessed (and possibly Farasa-desegmented) text.
        """
        if self.model_name in SEGMENTED_MODELS and desegment:
            text = self.desegment(text)

        # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
        # https://stackoverflow.com/a/53436792/5381220
        text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"', text)
        text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'", text)
        # The replacement must not contain "\`": re.sub leaves unknown non-letter escapes
        # alone, which would write a literal backslash into the output.
        text = re.sub(white_spaced_back_quotation_regex, "`" + r"\1" + "`", text)
        # Em-dash pairs need their own pattern; reusing the back-quote regex here would
        # turn back-quoted spans into em-dash spans.
        text = re.sub(r"—\s+([^—]+)\s+—", "—" + r"\1" + "—", text)

        # during generation, sometimes the models don't put a space after the dot,
        # this handles it
        text = text.replace(".", " . ")
        text = " ".join(text.split())

        # handle decimals
        text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
        text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

        text = re.sub(left_and_right_spaced_chars, r"\1", text)
        text = re.sub(left_spaced_chars, r"\1", text)
        text = re.sub(right_spaced_chars, r"\1", text)

        return text

    def desegment(self, text):
        """
        Use this function if sentence tokenization was done using
        `from arabert.preprocess_arabert import preprocess` with Farasa enabled
        AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
        and before the '+' for suffixes

        Example:
        >>> desegment('ال+ دراس +ات')
        الدراسات
        """
        # glue prefixes/suffixes back onto their stems, then clean each word
        text = text.replace("+ ", "+")
        text = text.replace(" +", "+")
        return " ".join(self._desegmentword(word) for word in text.split(" "))

    def _desegmentword(self, orig_word: str) -> str:
        """
        Word desegmentor that takes a Farasa Segmented Word and removes the '+' signs

        Example:
        >>> _desegmentword("ال+يومي+ة")
        اليومية
        """
        word = orig_word.replace("ل+ال+", "لل")
        if "ال+ال" not in orig_word:
            word = word.replace("ل+ال", "لل")
        word = word.replace("+", "")
        word = word.replace("للل", "لل")
        return word

    def _old_preprocess(self, text, do_farasa_tokenization):
        """
        AraBERTv1 preprocessing Function

        Args:
            text: input text (converted with ``str()`` first)
            do_farasa_tokenization (bool): when true, the cleaned text is additionally
                passed through ``_tokenize_arabic_words_farasa``

        Returns:
            str: the preprocessed text, stripped of surrounding whitespace
        """
        text = str(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)

        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(regex_url_step1, "[رابط]", text)
            text = re.sub(regex_url_step2, "[رابط]", text)
            text = re.sub(regex_url, "[رابط]", text)
            text = re.sub(regex_email, "[بريد]", text)
            text = re.sub(regex_mention, "[مستخدم]", text)
        # NOTE(review): as a replacement template r"\." emits a backslash followed by a
        # dot. Kept unchanged because this path is meant to mirror the preprocessing the
        # v1 models were presumably trained with -- confirm before changing.
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            # re-tighten special tokens that picked up inner spaces
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", text)

        if self.remove_elongation:
            text = self._remove_elongation(text)

        if self.insert_white_spaces:
            text = re.sub(
                r"([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
                r" \1 ",
                text,
            )
        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        return text.strip()

    def _farasa_segment(self, text):
        """
        Re-attach '+' markers to the pieces of Farasa-segmented words and rebuild the
        bracketed special tokens ([رابط], [بريد], [مستخدم]) that whitespace splitting
        broke apart.
        """
        line_farasa = text.split()
        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ("[", "]"):
                continue
            # `index > 0` stops the look-behind from wrapping around to the last token
            # when the special word is the very first token of the line.
            if (
                index > 0
                and word in ("رابط", "بريد", "مستخدم")
                and line_farasa[index - 1] in ("[", "]")
            ):
                segmented_line.append("[" + word + "]")
                continue
            if "+" not in word:
                segmented_line.append(word)
                continue
            segmented_line.extend(self._split_farasa_output(word))

        return " ".join(segmented_line)

    def _split_farasa_output(self, word):
        """
        Split one '+'-joined Farasa word into a list of pieces, marking prefixes as
        "x+" and suffixes as "+x" according to ``prefix_list`` / ``suffix_list``.
        """
        segmented_word = []
        temp_token = ""
        for i, c in enumerate(word):
            if c == "+":
                # if the token is KAF, it could be a suffix or prefix
                if temp_token == "ك":
                    # if we are at the second token, then KAF is surely a prefix
                    if i == 1:
                        segmented_word.append(temp_token + "+")
                        temp_token = ""
                    # If the KAF token is between 2 tokens
                    elif word[i - 2] == "+":
                        # if the previous token is prefix, then this KAF must be a prefix.
                        # endswith() also copes with an empty previous piece, where
                        # indexing [-1] would raise IndexError.
                        if segmented_word[-1].endswith("+"):
                            segmented_word.append(temp_token + "+")
                            temp_token = ""
                        # else it is a suffix, this KAF could not be a second suffix
                        else:
                            segmented_word.append("+" + temp_token)
                            temp_token = ""
                    # if Kaf is at the end, this is handled with the statement after the loop
                elif temp_token in prefix_list:
                    segmented_word.append(temp_token + "+")
                    temp_token = ""
                elif temp_token in suffix_list:
                    segmented_word.append("+" + temp_token)
                    temp_token = ""
                else:
                    segmented_word.append(temp_token)
                    temp_token = ""
                continue
            temp_token += c
        if temp_token != "":
            if temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
            else:
                segmented_word.append(temp_token)
        return segmented_word

    def _tokenize_arabic_words_farasa(self, line_input):
        """
        Segment ``line_input`` with Farasa (word by word when emojis are kept, so emojis
        pass through untouched) and mark each piece as prefix ("x+"), suffix ("+x") or
        stem. Bracketed special tokens are rebuilt as in ``_farasa_segment``.
        """
        if self.keep_emojis:
            known_emojis = self.emoji.UNICODE_EMOJI["en"]
            line_farasa = []
            for word in line_input.split():
                if word in known_emojis:
                    line_farasa.append(word)
                else:
                    line_farasa.append(self.farasa_segmenter.segment(word))
        else:
            line_farasa = self.farasa_segmenter.segment(line_input).split()

        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ("[", "]"):
                continue
            # `index > 0`: do not wrap around to the last token for the first word
            if (
                index > 0
                and word in ("رابط", "بريد", "مستخدم")
                and line_farasa[index - 1] in ("[", "]")
            ):
                segmented_line.append("[" + word + "]")
                continue
            for token in word.split("+"):
                if token in prefix_list:
                    segmented_line.append(token + "+")
                elif token in suffix_list:
                    segmented_line.append("+" + token)
                else:
                    segmented_line.append(token)
        return " ".join(segmented_line)

    def _remove_elongation(self, text):
        """
        :param text: the input text to remove elongation
        :return: delongated text, where every run matched by ``regex_tatweel`` is
                 replaced by the first character of that run
        """
        # loop at most once per match found in the original text
        for _ in range(len(re.findall(regex_tatweel, text))):
            elongation = re.search(regex_tatweel, text)
            if not elongation:
                break
            run = elongation.group()
            # Literal replacement: same result as re.sub(re.escape(run), run[0], text)
            # but a backslash run no longer breaks the replacement template.
            text = text.replace(run, run[0])
        return text

    def _remove_redundant_punct(self, text):
        """
        Replace each match of ``redundant_punct_pattern`` with its distinct characters
        (in first-seen order) surrounded by single spaces, then squeeze whitespace.
        """
        # `text_` is a shrinking work copy used for searching; `dif` maps a match
        # position in `text_` back onto the growing result `text`.
        text_ = text
        result = re.search(redundant_punct_pattern, text)
        dif = 0
        while result:
            start, end = result.span()
            sub = result.group()
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(sub) + " "
            text = "".join((text[: start + dif], sub, text[end + dif :]))
            text_ = "".join((text_[:start], text_[end:])).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(redundant_punct_pattern, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()
class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.

    Args:

        model_name (:obj:`str`): model name from the HuggingFace Models page, with or without
            the "aubmindlab/" prefix (the prefix is stripped). A name that is not in
            ACCEPTED_MODELS falls back to "bert-base-arabertv02". Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.

            - :obj:`"bert-base-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.

            - :obj:`"araelectra-base"`: No farasa segmentation.

            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj: `bool`): don't remove emojis while preprocessing. Defaults to False

        remove_html_markup(:obj: `bool`): Whether to remove html artifacts,
            should be set to False when preprocessing TyDi QA. Defaults to True

        replace_urls_emails_mentions(:obj: `bool`): Whether to replace emails, urls and
            mentions by special tokens. Defaults to True

    Returns:

        ArabertPreprocessor: the preprocessor class

    Example:

        from preprocess import ArabertPreprocessor

        arabert_prep = ArabertPreprocessor("bert-base-arabertv2",keep_emojis=False)

        arabert_prep.preprocess("SOME ARABIC TEXT")
    """

    def __init__(
        self,
        model_name,
        keep_emojis=False,
        remove_html_markup=True,
        replace_urls_emails_mentions=True,
    ):
        """
        Store the preprocessing options and, for models listed in SEGMENTED_MODELS,
        try to start a FarasaSegmenter. The parameters are described in the class docstring.
        """
        model_name = model_name.replace("aubmindlab/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        if self.model_name in SEGMENTED_MODELS:
            logging.info(
                "Selected Model requires pre-segmentation, Initializing FarasaSegmenter"
            )
            try:
                from farasa.segmenter import FarasaSegmenter

                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except Exception:
                # Best effort: construction still succeeds without a segmenter.
                # `Exception` (instead of a bare except) keeps KeyboardInterrupt and
                # SystemExit from being swallowed here.
                logging.warning(
                    "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )
        else:
            logging.info(
                "Selected Model doesn't require pre-segmentation, skipping FarasaSegmenter initialization"
            )

        self.keep_emojis = keep_emojis
        if self.keep_emojis:
            # imported lazily so the emoji package is only needed when emojis are kept
            import emoji

            self.emoji = emoji
            if self.model_name in SEGMENTED_MODELS:
                logging.warning(
                    "Keeping tweets with Farasa Segmentation is 10 times slower"
                )

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions

    def preprocess(self, text):
        """
        Preprocess takes an input text line and applies the same preprocessing used in
        AraBERT pretraining

        Args:

            text (:obj:`str`): input text string (converted with ``str()`` first)

        Returns:

            string: A preprocessed string depending on which model was selected
        """
        # The two first-generation models use their own pipeline.
        if self.model_name == "bert-base-arabert":
            return self._old_preprocess(text, do_farasa_tokenization=True)

        if self.model_name == "bert-base-arabertv01":
            return self._old_preprocess(text, do_farasa_tokenization=False)

        text = str(text)
        text = html.unescape(text)
        text = araby.strip_tashkeel(text)
        text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        # insert whitespace before and after all non Arabic digits or English Digits
        # and Alphabet and the 2 brackets.
        # Raw strings: `re` resolves the \uXXXX escapes itself, so the compiled patterns
        # are unchanged while the invalid "\d" / "\[" string escapes go away.
        text = re.sub(
            r"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
            r" \1 ",
            text,
        )

        # insert whitespace between words and numbers or numbers and words
        text = re.sub(
            r"(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
        )
        text = re.sub(
            r"([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
        )

        # remove unwanted characters
        if self.keep_emojis:
            # iterating the mapping yields its keys
            emoji_regex = "".join(self.emoji.UNICODE_EMOJI)
            rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regex, " ", text)

        # NOTE: elongation removal (_remove_elongation) is not applied in this pipeline.

        # remove the emoji variation selector and extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        if self.model_name in ("bert-base-arabertv2", "bert-large-arabertv2"):
            if self.keep_emojis:
                # Segment word by word so emojis are passed through untouched.
                known_emojis = self.emoji.UNICODE_EMOJI
                new_text = []
                for word in text.split():
                    if word in known_emojis:
                        new_text.append(word)
                    else:
                        new_text.append(self.farasa_segmenter.segment(word))
                text = " ".join(new_text)
            else:
                text = self.farasa_segmenter.segment(text)
            return self._farasa_segment(text)

        # All the other models don't require Farasa Segmentation
        return text

    def _old_preprocess(self, text, do_farasa_tokenization):
        """
        AraBERTv1 preprocessing Function

        Args:
            text: input text (converted with ``str()`` first)
            do_farasa_tokenization (bool): when true, the cleaned text is additionally
                passed through ``_tokenize_arabic_words_farasa``

        Returns:
            str: the preprocessed text, stripped of surrounding whitespace
        """
        text = str(text)
        text = araby.strip_tashkeel(text)

        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(regex_url_step1, "[رابط]", text)
            text = re.sub(regex_url_step2, "[رابط]", text)
            text = re.sub(regex_url, "[رابط]", text)
            text = re.sub(regex_email, "[بريد]", text)
            text = re.sub(regex_mention, "[مستخدم]", text)
        # NOTE(review): as a replacement template r"\." emits a backslash followed by a
        # dot. Kept unchanged because this path is meant to mirror the preprocessing the
        # v1 models were presumably trained with -- confirm before changing.
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            # re-tighten special tokens that picked up inner spaces
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", text)

        # NOTE: elongation removal (_remove_elongation) is not applied in this pipeline.

        text = re.sub(
            r"([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
            r" \1 ",
            text,
        )
        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        return text.strip()

    def _farasa_segment(self, text):
        """
        Re-attach '+' markers to the pieces of Farasa-segmented words and rebuild the
        bracketed special tokens ([رابط], [بريد], [مستخدم]) that whitespace splitting
        broke apart.
        """
        line_farasa = text.split()
        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ("[", "]"):
                continue
            # `index > 0` stops the look-behind from wrapping around to the last token
            # when the special word is the very first token of the line.
            if (
                index > 0
                and word in ("رابط", "بريد", "مستخدم")
                and line_farasa[index - 1] in ("[", "]")
            ):
                segmented_line.append("[" + word + "]")
                continue
            if "+" not in word:
                segmented_line.append(word)
                continue
            segmented_line.extend(self._split_farasa_output(word))

        return " ".join(segmented_line)

    def _split_farasa_output(self, word):
        """
        Split one '+'-joined Farasa word into a list of pieces, marking prefixes as
        "x+" and suffixes as "+x" according to ``prefix_list`` / ``suffix_list``.
        """
        segmented_word = []
        temp_token = ""
        for i, c in enumerate(word):
            if c == "+":
                # if the token is KAF, it could be a suffix or prefix
                if temp_token == "ك":
                    # if we are at the second token, then KAF is surely a prefix
                    if i == 1:
                        segmented_word.append(temp_token + "+")
                        temp_token = ""
                    # If the KAF token is between 2 tokens
                    elif word[i - 2] == "+":
                        # if the previous token is prefix, then this KAF must be a prefix.
                        # endswith() also copes with an empty previous piece, where
                        # indexing [-1] would raise IndexError.
                        if segmented_word[-1].endswith("+"):
                            segmented_word.append(temp_token + "+")
                            temp_token = ""
                        # else it is a suffix, this KAF could not be a second suffix
                        else:
                            segmented_word.append("+" + temp_token)
                            temp_token = ""
                    # if Kaf is at the end, this is handled with the statement after the loop
                elif temp_token in prefix_list:
                    segmented_word.append(temp_token + "+")
                    temp_token = ""
                elif temp_token in suffix_list:
                    segmented_word.append("+" + temp_token)
                    temp_token = ""
                else:
                    segmented_word.append(temp_token)
                    temp_token = ""
                continue
            temp_token += c
        if temp_token != "":
            if temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
            else:
                segmented_word.append(temp_token)
        return segmented_word

    def _tokenize_arabic_words_farasa(self, line_input):
        """
        Segment ``line_input`` with Farasa (word by word when emojis are kept, so emojis
        pass through untouched) and mark each piece as prefix ("x+"), suffix ("+x") or
        stem. Bracketed special tokens are rebuilt as in ``_farasa_segment``.
        """
        if self.keep_emojis:
            known_emojis = self.emoji.UNICODE_EMOJI
            line_farasa = []
            for word in line_input.split():
                if word in known_emojis:
                    line_farasa.append(word)
                else:
                    line_farasa.append(self.farasa_segmenter.segment(word))
        else:
            line_farasa = self.farasa_segmenter.segment(line_input).split()

        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ("[", "]"):
                continue
            # `index > 0`: do not wrap around to the last token for the first word
            if (
                index > 0
                and word in ("رابط", "بريد", "مستخدم")
                and line_farasa[index - 1] in ("[", "]")
            ):
                segmented_line.append("[" + word + "]")
                continue
            for token in word.split("+"):
                if token in prefix_list:
                    segmented_line.append(token + "+")
                elif token in suffix_list:
                    segmented_line.append("+" + token)
                else:
                    segmented_line.append(token)
        return " ".join(segmented_line)

    def _remove_elongation(self, word):
        """
        :param word: the input word to remove elongation
        :return: delongated word, where every run matched by ``regex_tatweel`` is
                 replaced by the first character of that run
        """
        # loop at most once per match found in the original word
        for _ in range(len(re.findall(regex_tatweel, word))):
            elongation_found = re.search(regex_tatweel, word)
            if not elongation_found:
                break
            run = elongation_found.group()
            # Literal replacement: the matched text must not be re-used as a regex
            # pattern, otherwise runs such as "???" or "((((" raise re.error.
            word = word.replace(run, run[0])
        return word

    def _remove_redundant_punct(self, text):
        """
        Replace each match of ``redundant_punct_pattern`` with its distinct characters
        (in first-seen order) surrounded by single spaces, then squeeze whitespace.
        """
        # `text_` is a shrinking work copy used for searching; `dif` maps a match
        # position in `text_` back onto the growing result `text`.
        text_ = text
        result = re.search(redundant_punct_pattern, text)
        dif = 0
        while result:
            start, end = result.span()
            sub = result.group()
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(sub) + " "
            text = "".join((text[: start + dif], sub, text[end + dif :]))
            text_ = "".join((text_[:start], text_[end:])).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(redundant_punct_pattern, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()
def __init__(
    self,
    model_name,
    keep_emojis=False,
    remove_html_markup=True,
    replace_urls_emails_mentions=True,
    strip_tashkeel=True,
    strip_tatweel=True,
    insert_white_spaces=True,
    remove_elongation=True,
):
    """
    Configure the preprocessor for a given model.

    Args:
        model_name (:obj:`str`): model name from the HuggingFace Models page,
            with or without the ``aubmindlab/`` prefix (it is stripped). A name
            that is not in ``ACCEPTED_MODELS`` falls back to
            "bert-base-arabertv02". Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.
            - :obj:`"bert-base-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"araelectra-base-discriminator"`: No farasa segmentation.
            - :obj:`"araelectra-base-generator"`: No farasa segmentation.
            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis (:obj:`bool`): don't remove emojis while preprocessing.
            Defaults to False
        remove_html_markup (:obj:`bool`): Whether to remove html artifacts,
            should be set to False when preprocessing TyDi QA. Defaults to True
        replace_urls_emails_mentions (:obj:`bool`): Whether to replace email
            urls and mentions by special tokens. Defaults to True
        strip_tashkeel (:obj:`bool`): remove diacritics (FATHATAN, DAMMATAN,
            KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA)
        strip_tatweel (:obj:`bool`): remove tatweel '\\u0640'
        insert_white_spaces (:obj:`bool`): insert whitespace before and after
            all non Arabic digits or English digits or Arabic and English
            Alphabet or the 2 brackets, then inserts whitespace between words
            and numbers or numbers and words
        remove_elongation (:obj:`bool`): replace repetition of more than 2
            non-digit character with 2 of this character
    """
    model_name = model_name.replace("aubmindlab/", "")
    if model_name not in ACCEPTED_MODELS:
        logging.warning(
            "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
        )
        self.model_name = "bert-base-arabertv02"
    else:
        self.model_name = model_name

    if self.model_name in SEGMENTED_MODELS:
        logging.info(
            "Selected Model requires pre-segmentation, Initializing FarasaSegmenter"
        )
        try:
            # imported lazily so farasapy stays an optional dependency
            from farasa.segmenter import FarasaSegmenter

            self.farasa_segmenter = FarasaSegmenter(interactive=True)
        except ImportError:
            # Only a missing package is reported as "not installed". The
            # former bare ``except:`` also swallowed KeyboardInterrupt and
            # unrelated start-up failures under the same message.
            logging.warning(
                "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
            )
    else:
        logging.info(
            "Selected Model doesn't require pre-segmentation, skipping FarasaSegmenter initialization"
        )

    self.keep_emojis = keep_emojis
    if self.keep_emojis:
        # imported only when needed; the module is kept on the instance
        import emoji

        self.emoji = emoji
        if self.model_name in SEGMENTED_MODELS:
            logging.warning(
                "Keeping tweets with Farasa Segmentation is 10 times slower"
            )

    self.remove_html_markup = remove_html_markup
    self.replace_urls_emails_mentions = replace_urls_emails_mentions
    self.strip_tashkeel = strip_tashkeel
    self.strip_tatweel = strip_tatweel
    self.insert_white_spaces = insert_white_spaces
    self.remove_elongation = remove_elongation
class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
    It also can unprocess the text output of the generated text.

    Args:

        model_name (:obj:`str`): model name from the HuggingFace Models page. The
            ``Ebtihal/`` and ``ebtihalaziz/`` prefixes are stripped. Will default to a
            base Arabic preprocessor if the model name is not in ``ACCEPTED_MODELS``.

        keep_emojis (:obj:`bool`): don't remove emojis while preprocessing. Defaults to False

        remove_html_markup (:obj:`bool`): whether to remove html line breaks and tags.
            Defaults to True

        replace_urls_emails_mentions (:obj:`bool`): whether to replace urls, emails and
            mentions by special tokens. Defaults to True

        strip_tashkeel (:obj:`bool`): remove diacritics. Defaults to True

        strip_tatweel (:obj:`bool`): remove tatweel. Defaults to True

        insert_white_spaces (:obj:`bool`): insert whitespace around characters that are
            not letters/digits and between words and numbers. Defaults to True

        remove_non_digit_repetition (:obj:`bool`): collapse repeated characters using
            ``multiple_char_pattern``. Defaults to True

        replace_slash_with_dash (:obj:`bool`): replace "/" with "-". When None, it is
            enabled only for models in ``SECOND_GEN_MODELS``.

        map_hindi_numbers_to_arabic (:obj:`bool`): translate digits through
            ``hindi_to_arabic_map``. When None, it is enabled only for models in
            ``SECOND_GEN_MODELS``.

        apply_farasa_segmentation (:obj:`bool`): run Farasa segmentation. When None, it is
            enabled only for models in ``SEGMENTED_MODELS``.

    Returns:

        ArabertPreprocessor: A preprocessor instance

    Example:

        from preprocess import ArabertPreprocessor

        arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")

        arabert_prep.preprocess("SOME ARABIC TEXT")
    """

    def __init__(
        self,
        model_name: str,
        keep_emojis: bool = False,
        remove_html_markup: bool = True,
        replace_urls_emails_mentions: bool = True,
        strip_tashkeel: bool = True,
        strip_tatweel: bool = True,
        insert_white_spaces: bool = True,
        remove_non_digit_repetition: bool = True,
        replace_slash_with_dash: bool = None,
        map_hindi_numbers_to_arabic: bool = None,
        apply_farasa_segmentation: bool = None,
    ):
        model_name = model_name.replace("Ebtihal/", "").replace("ebtihalaziz/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                """Model provided is not in the accepted model list. Preprocessor will default to a base Arabic preprocessor"""
            )
            # NOTE(review): the fallback keeps the "Ebtihal/" prefix although the
            # prefix is stripped from user input above — confirm this is intended.
            self.model_name = "Ebtihal/AraDiaBERTo_V2"
        else:
            self.model_name = model_name

        if apply_farasa_segmentation is None:
            # derive the setting from the model family
            self.apply_farasa_segmentation = self.model_name in SEGMENTED_MODELS
        else:
            if not apply_farasa_segmentation and self.model_name in SEGMENTED_MODELS:
                logging.warning(
                    "The selected model_name requires Farasa pre-segmentation, but apply_farasa_segmentation was set to False!"
                )
            self.apply_farasa_segmentation = apply_farasa_segmentation

        if self.apply_farasa_segmentation:
            try:
                # imported lazily so farasapy stays an optional dependency
                from farasa.segmenter import FarasaSegmenter

                self.farasa_segmenter = FarasaSegmenter(interactive=True)
            except ModuleNotFoundError:
                logging.error(
                    "farasapy is not installed, you want be able to process text for AraBERTv1 and v2. Install it using: pip install farasapy"
                )

        self.keep_emojis = keep_emojis
        if self.keep_emojis:
            # imported only when needed; the module is kept on the instance
            import emoji

            self.emoji = emoji
            if self.apply_farasa_segmentation:
                logging.warning(
                    "Keeping tweets with Farasa Segmentation is 10 times slower"
                )

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_non_digit_repetition = remove_non_digit_repetition

        # Unset options default to True only for second-generation models.
        if replace_slash_with_dash is None:
            self.replace_slash_with_dash = self.model_name in SECOND_GEN_MODELS
        else:
            self.replace_slash_with_dash = replace_slash_with_dash

        if map_hindi_numbers_to_arabic is None:
            self.map_hindi_numbers_to_arabic = self.model_name in SECOND_GEN_MODELS
        else:
            self.map_hindi_numbers_to_arabic = map_hindi_numbers_to_arabic

    def preprocess(self, text: str) -> str:
        """
        Preprocess takes an input text line and applies the same preprocessing used in
        AraBERT pretraining, or according to settings.

        Args:

            text (:obj:`str`): input text string

        Returns:

            string: A preprocessed string depending on which model was selected
        """
        if self.model_name in ("bert-base-arabert", "bert-base-arabertv01"):
            return self._preprocess_v1(
                text,
                do_farasa_tokenization=self.apply_farasa_segmentation,
            )

        if self.model_name in SECOND_GEN_MODELS:
            return self._preprocess_v2(text)

        return self._preprocess_v3(text)

    def unpreprocess(self, text: str, desegment: bool = True) -> str:
        """Re-formats the text to a classic format where punctuations, brackets, parenthesis
        are not separated by whitespaces. The objective is to make the generated text of any
        model appear natural and not preprocessed.

        Args:
            text (:obj:`str`): input text to be un-preprocessed
            desegment (:obj:`bool`, optional): whether or not to remove farasa
                pre-segmentation first. Only has an effect when Farasa segmentation is
                enabled on this instance.

        Returns:
            str: The unpreprocessed (and possibly Farasa-desegmented) text.
        """
        if self.apply_farasa_segmentation and desegment:
            text = self.desegment(text)

        # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
        # https://stackoverflow.com/a/53436792/5381220
        text = re.sub(white_spaced_double_quotation_regex, r'"\1"', text)
        text = re.sub(white_spaced_single_quotation_regex, r"'\1'", text)
        # The replacement used to be "\`" + r"\1" + "\`", which re.sub leaves as a
        # literal backslash + back-quote in the output.
        text = re.sub(white_spaced_back_quotation_regex, r"`\1`", text)
        # The em-dash step used to re-apply the back-quote regex (copy/paste slip).
        # NOTE(review): the inline pattern mirrors the "delimiter, spaces, content,
        # spaces, delimiter" shape presumably used by the quotation regexes — confirm.
        text = re.sub(r"—\s+([^—]+)\s+—", r"—\1—", text)

        # during generation, sometimes the models don't put a space after the dot, this handles it
        text = text.replace(".", " . ")
        text = " ".join(text.split())

        # handle decimals
        text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
        text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

        text = re.sub(left_and_right_spaced_chars, r"\1", text)
        text = re.sub(left_spaced_chars, r"\1", text)
        text = re.sub(right_spaced_chars, r"\1", text)

        return text

    def desegment(self, text: str) -> str:
        """
        Use this function if sentence tokenization was done using
        `from arabert.preprocess_arabert import preprocess` with Farasa enabled.
        AraBERT segmentation using Farasa adds a space after the '+' for prefixes,
        and before the '+' for suffixes.

        Example:
            >>> desegment('ال+ دراس +ات')
            الدراسات
        """
        text = text.replace("+ ", "+")
        text = text.replace(" +", "+")
        return " ".join(self._desegmentword(word) for word in text.split(" "))

    def _desegmentword(self, orig_word: str) -> str:
        """
        Word segmentor that takes a Farasa Segmented Word and removes the '+' signs

        Example:
            >>> _desegmentword("ال+يومي+ة")
            اليومية
        """
        word = orig_word.replace("ل+ال+", "لل")
        if "ال+ال" not in orig_word:
            word = word.replace("ل+ال", "لل")
        word = word.replace("+", "")
        word = word.replace("للل", "لل")
        return word

    def _segment_with_farasa(self, text: str) -> str:
        """Run Farasa on *text* (emojis are passed through untouched) and re-tokenize the result."""
        if self.keep_emojis:
            emoji_table = self.emoji.UNICODE_EMOJI["en"]
            text = " ".join(
                word if word in emoji_table else self.farasa_segmenter.segment(word)
                for word in text.split()
            )
        else:
            text = self.farasa_segmenter.segment(text)
        return self._farasa_segment(text)

    def _preprocess_v3(self, text: str) -> str:
        """Pipeline for models that are neither v1 nor in ``SECOND_GEN_MODELS``."""
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(hindi_to_arabic_map)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        if self.insert_white_spaces:
            # insert whitespace before and after every character that is not an
            # Arabic/English digit or letter or a space (brackets included here)
            text = re.sub(
                r"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z ])",
                r" \1 ",
                text,
            )

            # re-fix the special tokens whose brackets were just spaced out
            text = text.replace("[ رابط ]", "[رابط]")
            text = text.replace("[ بريد ]", "[بريد]")
            text = text.replace("[ مستخدم ]", "[مستخدم]")

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                r"(\d+)([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)",
                r" \1 \2 ",
                text,
            )
            text = re.sub(
                r"([\u0621-\u063A\u0641-\u064A\u066A-\u066C\u0654-\u0655]+)(\d+)",
                r" \1 \2 ",
                text,
            )

        # remove unwanted characters
        if self.keep_emojis:
            emoji_regex = "".join(self.emoji.UNICODE_EMOJI["en"])
            rejected_chars_regex2 = "[^%s%s]" % (chars_regexv2, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regexv2, " ", text)

        # drop the emoji variation selector and collapse runs of whitespace
        text = " ".join(text.replace("\uFE0F", "").split())

        if self.apply_farasa_segmentation:
            return self._segment_with_farasa(text)

        # All the other models don't require Farasa segmentation
        return text

    def _preprocess_v2(self, text: str) -> str:
        """Pipeline for the models listed in ``SECOND_GEN_MODELS``."""
        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        if self.map_hindi_numbers_to_arabic:
            text = text.translate(hindi_to_arabic_map)

        # remove repeated characters >2
        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        if self.insert_white_spaces:
            # insert whitespace before and after every character that is not an
            # Arabic/English digit or letter or one of the 2 square brackets
            text = re.sub(
                r"([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                r"(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
            )
            text = re.sub(
                r"([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
            )

        if self.replace_slash_with_dash:
            text = text.replace("/", "-")

        # remove unwanted characters
        if self.keep_emojis:
            emoji_regex = "".join(self.emoji.UNICODE_EMOJI["en"])
            rejected_chars_regex2 = "[^%s%s]" % (chars_regex, emoji_regex)
            text = re.sub(rejected_chars_regex2, " ", text)
        else:
            text = re.sub(rejected_chars_regex, " ", text)

        # drop the emoji variation selector and collapse runs of whitespace
        text = " ".join(text.replace("\uFE0F", "").split())

        # Honour the instance setting instead of hard-coded model names: the
        # segmenter only exists when apply_farasa_segmentation is true.
        if self.apply_farasa_segmentation:
            return self._segment_with_farasa(text)

        # All the other models don't require Farasa segmentation
        return text

    def _preprocess_v1(self, text: str, do_farasa_tokenization: bool) -> str:
        """
        AraBERTv1 preprocessing Function
        """
        text = str(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)

        text = re.sub(r"\d+\/[ء-ي]+\/\d+\]", "", text)
        text = re.sub("ـ", "", text)
        text = re.sub("[«»]", ' " ', text)

        if self.replace_urls_emails_mentions:
            # replace the [رابط] token with space if you want to clean links
            text = re.sub(regex_url_step1, "[رابط]", text)
            text = re.sub(regex_url_step2, "[رابط]", text)
            text = re.sub(regex_url, "[رابط]", text)
            text = re.sub(regex_email, "[بريد]", text)
            text = re.sub(regex_mention, "[مستخدم]", text)

        # NOTE(review): the replacement r"\." emits a backslash followed by a dot;
        # kept unchanged because altering it would change the v1 output.
        text = re.sub("…", r"\.", text).strip()
        text = self._remove_redundant_punct(text)

        if self.replace_urls_emails_mentions:
            # normalize spacing that ended up inside the special tokens
            text = re.sub(r"\[ رابط \]|\[ رابط\]|\[رابط \]", " [رابط] ", text)
            text = re.sub(r"\[ بريد \]|\[ بريد\]|\[بريد \]", " [بريد] ", text)
            text = re.sub(r"\[ مستخدم \]|\[ مستخدم\]|\[مستخدم \]", " [مستخدم] ", text)

        if self.remove_non_digit_repetition:
            text = self._remove_non_digit_repetition(text)

        if self.insert_white_spaces:
            text = re.sub(
                r"([^0-9\u0621-\u063A\u0641-\u0669\u0671-\u0673a-zA-Z\[\]])",
                r" \1 ",
                text,
            )
        if do_farasa_tokenization:
            text = self._tokenize_arabic_words_farasa(text)

        text = " ".join(text.split())

        return text

    def _farasa_segment(self, text: str) -> str:
        """Re-glue the bracketed special tokens and split every '+'-joined Farasa word."""
        line_farasa = text.split()
        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            # ``index > 0`` keeps ``index - 1`` from wrapping around to the last
            # token when the special word is the very first one.
            if (
                word in ["رابط", "بريد", "مستخدم"]
                and index > 0
                and line_farasa[index - 1] in ["[", "]"]
            ):
                segmented_line.append("[" + word + "]")
                continue
            if "+" not in word:
                segmented_line.append(word)
                continue
            segmented_line.extend(self._split_farasa_output(word))

        return " ".join(segmented_line)

    def _split_farasa_output(self, word: str) -> list:
        """Split one '+'-joined Farasa word into tokens: prefixes become "x+", suffixes "+x"."""
        segmented_word = []
        temp_token = ""
        for i, c in enumerate(word):
            if c != "+":
                temp_token += c
                continue
            # if the token is KAF, it could be a suffix or prefix
            if temp_token == "ك":
                # if we are at the second token, then KAF is surely a prefix
                if i == 1:
                    segmented_word.append(temp_token + "+")
                    temp_token = ""
                # if the KAF token is between 2 tokens
                elif word[i - 2] == "+":
                    # previous token is a prefix -> this KAF must be a prefix;
                    # otherwise it is a suffix (it cannot be a second suffix).
                    # ``endswith`` also tolerates an empty previous token.
                    if segmented_word[-1].endswith("+"):
                        segmented_word.append(temp_token + "+")
                    else:
                        segmented_word.append("+" + temp_token)
                    temp_token = ""
                # a KAF at the very end is handled after the loop
            elif temp_token in prefix_list:
                segmented_word.append(temp_token + "+")
                temp_token = ""
            elif temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
                temp_token = ""
            else:
                segmented_word.append(temp_token)
                temp_token = ""
        if temp_token != "":
            if temp_token in suffix_list:
                segmented_word.append("+" + temp_token)
            else:
                segmented_word.append(temp_token)
        return segmented_word

    def _tokenize_arabic_words_farasa(self, line_input: str) -> str:
        """Segment *line_input* with Farasa (v1 style) and mark prefixes/suffixes."""
        if self.keep_emojis:
            # emojis bypass Farasa; every other word is segmented on its own
            emoji_table = self.emoji.UNICODE_EMOJI["en"]
            line_farasa = [
                word if word in emoji_table else self.farasa_segmenter.segment(word)
                for word in line_input.split()
            ]
        else:
            line_farasa = self.farasa_segmenter.segment(line_input).split()

        segmented_line = []
        for index, word in enumerate(line_farasa):
            if word in ["[", "]"]:
                continue
            # ``index > 0`` prevents ``index - 1`` from wrapping to the end.
            if (
                word in ["رابط", "بريد", "مستخدم"]
                and index > 0
                and line_farasa[index - 1] in ["[", "]"]
            ):
                segmented_line.append("[" + word + "]")
                continue
            for token in word.split("+"):
                if token in prefix_list:
                    segmented_line.append(token + "+")
                elif token in suffix_list:
                    segmented_line.append("+" + token)
                else:
                    segmented_line.append(token)
        return " ".join(segmented_line)

    def _remove_non_digit_repetition(self, text: str) -> str:
        """
        :param text: the input text to remove elongation
        :return: delongated text
        """
        # every match of multiple_char_pattern is replaced by two copies of group 1
        return multiple_char_pattern.sub(r"\1\1", text)

    def _remove_redundant_punct(self, text: str) -> str:
        """Replace each match of ``redundant_punct_pattern`` by its distinct characters
        (first-seen order) surrounded by spaces, then collapse whitespace."""
        # ``text_`` is a shadow copy from which every processed match is removed, so
        # the search always advances; ``dif`` maps a position in the shadow copy
        # back to the corresponding position in ``text``.
        text_ = text
        result = re.search(redundant_punct_pattern, text)
        dif = 0
        while result:
            start, end = result.span()
            sub = result.group()
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(sub) + " "
            text = "".join((text[: start + dif], sub, text[end + dif :]))
            text_ = "".join((text_[:start], text_[end:])).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(redundant_punct_pattern, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()