Example #1
 def diacritize_original(self, u_text):
     assert isinstance(u_text, str)
     # Memorize the numbers, then replace each of them with a '0' placeholder before diacritization.
     numbers_words = NUMBER_REGEXP.findall(u_text)
     u_text = NUMBER_REGEXP.sub('0', clear_diacritics(u_text))
     segments = WORD_TOKENIZATION_REGEXP.split(u_text)
     valid_segments = [x for x in segments if WORD_TOKENIZATION_REGEXP.match(x)]
     diacritized_valid_words = self.diacritize_processed(' '.join(valid_segments)).split(' ')
     start_index = 0
     for d_word in diacritized_valid_words:
         u_text = u_text[:start_index] + u_text[start_index:].replace(clear_diacritics(d_word), d_word, 1)
         start_index = max(u_text.index(d_word) + len(d_word), start_index + len(d_word))
     # Restore the original numbers in place of the '0' placeholders.
     for nw in numbers_words:
         u_text = ZERO_REGEXP.sub(nw, u_text, 1)
     return u_text
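
This example protects numbers during diacritization by memorizing them, substituting a '0' placeholder, and restoring them once the Arabic text has been processed. Below is a minimal, self-contained sketch of that placeholder technique; the regular expressions and the restore_numbers helper are hypothetical stand-ins, not part of the original module.

import re

# Hypothetical stand-ins for NUMBER_REGEXP and ZERO_REGEXP used in the example above.
NUMBER_RE = re.compile(r'\d+')
PLACEHOLDER_RE = re.compile(r'\b0\b')

def restore_numbers(processed_text, original_text):
    """Put the original numbers back in place of the '0' placeholders, one at a time."""
    for number in NUMBER_RE.findall(original_text):
        processed_text = PLACEHOLDER_RE.sub(number, processed_text, count=1)
    return processed_text

original = 'version 12 of release 2024'
masked = NUMBER_RE.sub('0', original)          # 'version 0 of release 0'
transformed = masked.upper()                   # stands in for the diacritization step
print(restore_numbers(transformed, original))  # VERSION 12 OF RELEASE 2024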
Example #2
 def diacritize_processed(self, u_p_text):
     assert isinstance(u_p_text, str)
     text_indices = [CHAR2INDEX[x] for x in u_p_text]
     model_input = add_time_steps(to_categorical(text_indices, len(CHAR2INDEX)), DiacritizationModel.TIME_STEPS, False)
     shadda_pred, harakat_pred = self.model.predict_on_batch(model_input)
     shaddat = [NAME2DIACRITIC['Shadda'] if x >= 0.5 else '' for x in shadda_pred]
     harakat = [self.index_to_diacritic(np.argmax(x)) for x in harakat_pred]
     d_words = ('<s> ' + ''.join([l + sh + h for l, sh, h in zip(u_p_text, shaddat, harakat)]) + ' <e>').split(' ')
     correct_words = []
     for prev_word, word, next_word in zip(d_words[:-2], d_words[1:-1], d_words[2:]):
         word_u = clear_diacritics(word)
         prev_word_u = clear_diacritics(prev_word)
         next_word_u = clear_diacritics(next_word)
         try:
             best_word = ''
             max_frequency = 0
             for diacritized_word, frequency in self.trigram_context[prev_word_u, word_u, next_word_u].items():
                 if frequency > max_frequency:
                     max_frequency = frequency
                     best_word = diacritized_word
             word = best_word
         except KeyError:  # undiacritized trigram context was not found for this word
             try:
                 best_word = ''
                 max_frequency = 0
                 for diacritized_word, frequency in self.bigram_context[prev_word_u, word_u].items():
                     if frequency > max_frequency:
                         max_frequency = frequency
                         best_word = diacritized_word
                 word = best_word
             except KeyError:  # undiacritized bigram context was not found for this word
                 try:
                     possible_words = list(self.undiacritized_vocabulary[word_u])
                     distances = [self.levenshtein_distance(word, w_d) for w_d in possible_words]
                     word = possible_words[np.argmin(distances)]
                 except KeyError:  # undiacritized word was not found in the dictionary
                     try:
                         u_pattern = self.convert_to_pattern(word_u)
                         d_pattern = self.convert_to_pattern(word)
                         possible_patterns = list(self.patterns[u_pattern])
                         distances = [self.levenshtein_distance(d_pattern, p_d) for p_d in possible_patterns]
                         best_pattern = possible_patterns[np.argmin(distances)]
                         diacritics = extract_diacritics_2(best_pattern)
                         word = ''
                         for l, d in zip(word_u, diacritics):
                             word += l + (d if len(d) < 2 else d[0] + d[1])
                     except KeyError:
                         pass
         correct_words.append(word)
     return ' '.join(correct_words)
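
The word-level correction step backs off from the trigram context to the bigram context, then to the undiacritized vocabulary, and finally to abstract word patterns. The sketch below shows the core of that back-off with plain dictionaries; the function names and the toy Latin-transliterated data are hypothetical, and the vocabulary step is simplified (the example above picks the candidate closest to the prediction by Levenshtein distance).

def pick_most_frequent(candidates):
    """Return the diacritized form with the highest count."""
    return max(candidates.items(), key=lambda item: item[1])[0]

def correct_word(prev_u, word_u, next_u, predicted, trigrams, bigrams, vocabulary):
    """Hypothetical back-off: trigram context, then bigram context, then the stored vocabulary."""
    if (prev_u, word_u, next_u) in trigrams:
        return pick_most_frequent(trigrams[prev_u, word_u, next_u])
    if (prev_u, word_u) in bigrams:
        return pick_most_frequent(bigrams[prev_u, word_u])
    if word_u in vocabulary:
        # An arbitrary stored form stands in for the Levenshtein comparison made above.
        return next(iter(vocabulary[word_u]))
    return predicted

# Toy usage with Latin placeholders instead of Arabic words.
trigrams = {('<s>', 'ktb', 'alwld'): {'kataba': 5, 'kutiba': 1}}
bigrams = {('<s>', 'ktb'): {'kataba': 6, 'kutub': 2}}
vocabulary = {'ktb': {'kataba', 'kutub', 'kutiba'}}
print(correct_word('<s>', 'ktb', 'alwld', 'ktb', trigrams, bigrams, vocabulary))  # kataba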
Example #3
def stat(file_path):
    assert isinstance(file_path, Path)
    chars_count = 0
    arabic_letters_count = 0
    digits_count = 0
    tokens_count = 0
    numbers_count = 0
    arabic_words_count = 0
    diacritics_count = 0
    diacritization_forms = {}
    with file_path.open('r', encoding='UTF-8') as data_file:
        for line in data_file:
            line = line.rstrip('\n')
            segments = [
                x.strip() for x in WORD_TOKENIZATION_REGEXP.split(line)
                if x.strip() != ''
            ]
            for seg in segments:
                tokens_count += 1
                chars_count += len(seg)
                if WORD_TOKENIZATION_REGEXP.match(seg):
                    if NUMBER_REGEXP.match(seg):
                        numbers_count += 1
                        digits_count += sum(1 for x in seg
                                            if x in '0123456789')
                    else:
                        arabic_words_count += 1
                        undiacritized = clear_diacritics(seg)
                        arabic_letters_count += len(undiacritized)
                        if undiacritized != seg:
                            try:
                                diacritization_forms[undiacritized].add(seg)
                            except KeyError:
                                diacritization_forms[undiacritized] = {seg}
                            diacritics_count += len([
                                x for x in extract_diacritics(seg)
                                if x in ARABIC_DIACRITICS
                            ])
    print('Statistics about the dataset:', file_path)
    print('-' * 35)
    print('|Characters         |{:13d}|'.format(chars_count))
    print('|Tokens             |{:13d}|'.format(tokens_count))
    print('|Numbers            |{:13d}|'.format(numbers_count))
    print('|Digits             |{:13d}|'.format(digits_count))
    print('|Arabic words       |{:13d}|'.format(arabic_words_count))
    print('|Arabic letters     |{:13d}|'.format(arabic_letters_count))
    print('|Diacritics         |{:13d}|'.format(diacritics_count))
    print('|Undiacritized forms|{:13d}|'.format(len(diacritization_forms)))
    print('|Diacritized forms  |{:13d}|'.format(
        sum(len(x) for x in diacritization_forms.values())))
    print('-' * 35)
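
A short usage sketch: the file name below is hypothetical, and the function only needs a pathlib.Path to a UTF-8 text file with one sentence per line.

from pathlib import Path

stat(Path('dataset/train.txt'))  # hypothetical path; prints the statistics table for that file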
Example #4
 def generate_dataset(sentences):
     assert isinstance(sentences, Iterable)
     targets = []
     inputs = []
     for sentence in sentences:
         diacritics = extract_diacritics_2(sentence)
         letters_text = clear_diacritics(sentence)
         shadda_positions = []
         harakat_indices = []
         for d in diacritics:
             shadda_positions.append(1 if d and d[0] == NAME2DIACRITIC['Shadda'] else 0)
             harakat_indices.append(DiacritizationModel.diacritic_to_index(d[-1])
                                    if d and d[-1] != NAME2DIACRITIC['Shadda'] else 0)
         text_indices = [CHAR2INDEX[x] for x in letters_text]
         targets.append((shadda_positions, harakat_indices))
         inputs.append(text_indices)
     return inputs, targets
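
For each letter the dataset carries two parallel targets: a binary flag telling whether the letter has a shadda, and an index for its remaining diacritic. The toy sketch below illustrates that encoding with Latin placeholders; SHADDA and HARAKA_INDEX are hypothetical stand-ins for NAME2DIACRITIC['Shadda'] and DiacritizationModel.diacritic_to_index.

SHADDA = '~'                                  # placeholder for the shadda character
HARAKA_INDEX = {'': 0, 'a': 1, 'u': 2, 'i': 3}  # placeholder diacritic-to-index mapping

def encode(diacritics):
    """Return (shadda flags, haraka indices) for one letter sequence, mirroring the loop above."""
    shadda_positions = [1 if d and d[0] == SHADDA else 0 for d in diacritics]
    harakat_indices = [HARAKA_INDEX[d[-1]] if d and d[-1] != SHADDA else 0 for d in diacritics]
    return shadda_positions, harakat_indices

# Letters carrying no diacritic, 'a', shadda+'a', and a bare shadda.
print(encode(['', 'a', '~a', '~']))  # ([0, 0, 1, 1], [0, 1, 1, 0])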
Example #5
 def der_wer_values(self, test_sentences, limit_to_arabic=True, include_no_diacritic=True):
     # The *_dm / *_wm counters ignore each word's last letter (typically the case-ending diacritic).
     correct_d, correct_w, total_d, total_w, correct_dm, correct_wm, total_dm = 0, 0, 0, 0, 0, 0, 0
     logging_indexes = set(int(x / 100 * len(test_sentences)) for x in range(1, 101))
     print('Calculating DER and WER values on {} characters'.format('Arabic' if limit_to_arabic else 'all'))
     print('{} no-diacritic Arabic letters'.format('Including' if include_no_diacritic else 'Ignoring'))
     for i, original_sentence in enumerate(test_sentences, 1):
         predicted_sentence = self.diacritize_original(clear_diacritics(original_sentence))
         for orig_word, pred_word in zip(WORD_TOKENIZATION_REGEXP.split(original_sentence),
                                         WORD_TOKENIZATION_REGEXP.split(predicted_sentence)):
             orig_word, pred_word = orig_word.strip(), pred_word.strip()
             if len(orig_word) == 0 or len(pred_word) == 0:  # Rare problematic scenario
                 continue
             if limit_to_arabic:
                 if not WORD_TOKENIZATION_REGEXP.match(orig_word) or NUMBER_REGEXP.match(orig_word):
                     continue
             orig_diacs = np.array([tuple(x[::-1]) if len(x) == 2 else (x, '') for x in extract_diacritics_2(orig_word)])
             pred_diacs = np.array([tuple(x[::-1]) if len(x) == 2 else (x, '') for x in extract_diacritics_2(pred_word)])
             if orig_diacs.shape != pred_diacs.shape:  # Rare problematic scenario
                 print('Diacritization mismatch between original and predicted forms: {} {}'.format(orig_word,
                                                                                                    pred_word),
                       file=sys.stderr)
                 continue
             if not include_no_diacritic and WORD_TOKENIZATION_REGEXP.match(orig_word) and\
                     not NUMBER_REGEXP.match(orig_word):
                 diacritics_indexes = orig_diacs[:, 0] != ''
                 pred_diacs = pred_diacs[diacritics_indexes]
                 orig_diacs = orig_diacs[diacritics_indexes]
             correct_w += np.all(orig_diacs == pred_diacs)
             correct_wm += np.all(orig_diacs[:-1] == pred_diacs[:-1])
             total_w += 1
             correct_d += np.sum(np.all(orig_diacs == pred_diacs, axis=1))
             correct_dm += np.sum(np.all(orig_diacs[:-1] == pred_diacs[:-1], axis=1))
             total_d += orig_diacs.shape[0]
             total_dm += orig_diacs[:-1].shape[0]
         if i in logging_indexes:
             print('{}: {}/{} processed ({:.0%}).'.format(datetime.now(), i, len(test_sentences),
                                                          i/len(test_sentences)))
     return 1 - correct_d/total_d, 1 - correct_w/total_w, 1 - correct_dm/total_dm, 1 - correct_wm/total_w
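
DER and WER are computed here as error rates over aligned (haraka, shadda) pairs: a letter counts as correct when both parts match, and a word counts as correct when all of its letters match. The self-contained numpy sketch below shows that bookkeeping on toy aligned arrays, using Latin placeholders instead of Arabic diacritics.

import numpy as np

# Two words, each row is a (haraka, shadda) pair; placeholders instead of Arabic diacritics.
orig = [np.array([('a', ''), ('u', '~')]), np.array([('i', ''), ('', '')])]
pred = [np.array([('a', ''), ('a', '~')]), np.array([('i', ''), ('', '')])]

correct_d = sum(int(np.sum(np.all(o == p, axis=1))) for o, p in zip(orig, pred))
total_d = sum(o.shape[0] for o in orig)
correct_w = sum(int(np.all(o == p)) for o, p in zip(orig, pred))
total_w = len(orig)

print('DER = {:.2f}, WER = {:.2f}'.format(1 - correct_d / total_d, 1 - correct_w / total_w))
# DER = 0.25 (1 wrong letter out of 4), WER = 0.50 (1 word out of 2 has an error)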
Example #6
 def train(self, train_sentences, val_sentences, epochs, early_stop_iter):
     print('Removing unwanted characters...')
     train_sentences = self.remove_unwanted_chars(train_sentences)
     val_sentences = self.remove_unwanted_chars(val_sentences)
     print('Generating n-grams...')
     for sentence in train_sentences:
         words = ['<s>'] + sentence.split() + ['<e>']
         undiac_words = [clear_diacritics(w) for w in words]
         for w0_u, w1_d, w1_u, w2_u in zip(undiac_words[:-2], words[1:-1], undiac_words[1:-1], undiac_words[2:]):
             try:
                 self.undiacritized_vocabulary[w1_u].add(w1_d)
             except KeyError:
                 self.undiacritized_vocabulary[w1_u] = {w1_d}
             try:
                 try:
                     self.trigram_context[w0_u, w1_u, w2_u][w1_d] += 1
                 except KeyError:
                     self.trigram_context[w0_u, w1_u, w2_u][w1_d] = 1
             except KeyError:
                 self.trigram_context[w0_u, w1_u, w2_u] = {w1_d: 1}
             try:
                 try:
                     self.bigram_context[w0_u, w1_u][w1_d] += 1
                 except KeyError:
                     self.bigram_context[w0_u, w1_u][w1_d] = 1
             except KeyError:
                 self.bigram_context[w0_u, w1_u] = {w1_d: 1}
             try:
                 self.patterns[self.convert_to_pattern(w1_u)].add(self.convert_to_pattern(w1_d))
             except KeyError:
                 self.patterns[self.convert_to_pattern(w1_u)] = {self.convert_to_pattern(w1_d)}
     with open(self.get_trigrams_file_path(), 'wb') as vocab_file:
         pickle.dump(self.trigram_context, vocab_file)
     with open(self.get_bigrams_file_path(), 'wb') as vocab_file:
         pickle.dump(self.bigram_context, vocab_file)
     with open(self.get_unigrams_file_path(), 'wb') as vocab_file:
         pickle.dump(self.undiacritized_vocabulary, vocab_file)
     with open(self.get_patterns_file_path(), 'wb') as vocab_file:
         pickle.dump(self.patterns, vocab_file)
     print('Processing the dataset...')
     train_ins, train_outs = DiacritizationModel.generate_dataset(train_sentences)
     val_ins, val_outs = DiacritizationModel.generate_dataset(val_sentences)
     print('Calculating parameters...')
     total = 0
     shadda_count = 0
     harakat_counts = np.zeros((8,))
     for shadda_out, harakat_out in train_outs:
         total += len(shadda_out)
         shadda_count += sum(shadda_out)
         for i in set(harakat_out):
             harakat_counts[i] += harakat_out.count(i)
     shadda_weight = (total - shadda_count) / (shadda_count + 1)
     harakat_weights = np.max(harakat_counts) / (harakat_counts + 1)
     self.model.fit_generator(DiacritizedTextDataset(train_ins, train_outs), epochs=epochs,
                              validation_data=DiacritizedTextDataset(val_ins, val_outs),
                              class_weight=[{0: 1, 1: shadda_weight}, dict(enumerate(harakat_weights))],
                              callbacks=[ModelCheckpoint(self.get_weights_file_path(),
                                                         save_weights_only=True, save_best_only=True),
                                         LambdaCallback(on_epoch_end=self.save_history),
                                         EarlyStopping(patience=early_stop_iter, verbose=1),
                                         TerminateOnNaN()], workers=os.cpu_count())
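
The class weights computed above counter label imbalance: the shadda flag is positive for only a small fraction of letters, and the haraka indices are far from uniformly distributed, so each class is weighted by its inverse frequency. A toy numpy sketch of the same formulas, with hypothetical counts in place of values gathered from a real training set:

import numpy as np

# Hypothetical label counts illustrating the weighting used above.
total = 1000                # letters in the training targets
shadda_count = 50           # letters carrying a shadda (rare class)
harakat_counts = np.array([400, 250, 150, 100, 50, 30, 15, 5])  # per haraka index

# Rare classes get proportionally larger weights; the +1 avoids division by zero.
shadda_weight = (total - shadda_count) / (shadda_count + 1)
harakat_weights = np.max(harakat_counts) / (harakat_counts + 1)

print(round(shadda_weight, 2))       # 18.63: shadda errors cost ~19x more than non-shadda
print(np.round(harakat_weights, 2))  # the rarest haraka (5 occurrences) gets the largest weight, ~66.7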