def _combined_analyses(self, word_dediac, prefix_analyses, stem_analyses,
                       suffix_analyses):
    combined = deque()

    for p in itertools.product(prefix_analyses, stem_analyses):
        prefix_cat = p[0][0]
        prefix_feats = p[0][1]
        stem_cat = p[1][0]
        stem_feats = p[1][1]

        if stem_cat in self._db.prefix_stem_compat[prefix_cat]:
            for suffix_cat, suffix_feats in suffix_analyses:
                if ((stem_cat not in self._db.stem_suffix_compat) or
                        (prefix_cat not in self._db.prefix_suffix_compat) or
                        (suffix_cat not in
                         self._db.stem_suffix_compat[stem_cat]) or
                        (suffix_cat not in
                         self._db.prefix_suffix_compat[prefix_cat])):
                    continue

                merged = merge_features(self._db, prefix_feats, stem_feats,
                                        suffix_feats)
                merged['stem'] = stem_feats['diac']
                merged['stemcat'] = stem_cat

                merged_dediac = dediac_ar(merged['diac'])
                if word_dediac.replace(u'\u0640', '') != merged_dediac:
                    merged['source'] = 'spvar'

                combined.append(merged)

    return combined
def disambiguate_word(self, sentence, word_ndx, top=1):
    word = sentence[word_ndx]
    word_dd = dediac_ar(word)

    # If an MLE model is available and covers this word, return its single
    # most likely analysis directly.
    if self._mle is not None and word_dd in self._mle:
        analyses = [ScoredAnalysis(1.0, self._mle[word_dd])]
        return DisambiguatedWord(word, analyses)
    else:
        analyses = self._analyzer.analyze(word_dd)

    if len(analyses) == 0:
        return DisambiguatedWord(word, [])

    probabilities = [10 ** _get_pos_lex_freq(a) for a in analyses]
    max_prob = max(probabilities)

    scored_analyses = [ScoredAnalysis(p / max_prob, a)
                       for a, p in zip(analyses, probabilities)]

    # Successive stable sorts: order by score (descending), breaking ties
    # by Buckwalter tag length, then by diacritized form.
    scored_analyses.sort(key=lambda w: w.analysis['diac'])
    scored_analyses.sort(key=lambda w: len(w.analysis['bw']))
    scored_analyses.sort(key=lambda w: w.score, reverse=True)

    if top < 1:
        return DisambiguatedWord(word, scored_analyses)
    else:
        return DisambiguatedWord(word, scored_analyses[0:top])
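# A minimal usage sketch for the method above, assuming it belongs to
# camel_tools' MLEDisambiguator (pretrained models require the CAMeL data
# packages to be installed; the example sentence is illustrative):
from camel_tools.disambig.mle import MLEDisambiguator

mle = MLEDisambiguator.pretrained()
sentence = ['ذهب', 'الولد', 'الى', 'المدرسة']
disambig = mle.disambiguate_word(sentence, 1, top=1)
print(disambig.analyses[0].analysis['diac'])  # top-scoring diacritization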
def dediacritize_normalize(word):
    # Dediacritize
    word = dediac_ar(word)
    # Alif normalize
    word = NORM_ALIF_RE.sub('ا', word)
    # Yaa normalize
    word = NORM_YAA_RE.sub('ي', word)
    return word
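# Self-contained sketch of the helper above. NORM_ALIF_RE and NORM_YAA_RE are
# not shown in the original; the patterns below are assumed, common choices
# for alif and yaa normalization, not the original definitions:
import re
from camel_tools.utils.dediac import dediac_ar

NORM_ALIF_RE = re.compile('[أإآٱ]')  # hamzated alif / alif wasla -> bare alif
NORM_YAA_RE = re.compile('[ىئ]')     # alif maksura / hamzated yaa -> yaa

print(dediacritize_normalize('إِسْلَامٌ'))  # -> 'اسلام'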
def process_text(text):
    """Processes the input text by removing diacritics.

    Args:
        text: The input text.

    Returns:
        The processed text.
    """
    text = dediac.dediac_ar(text)
    return text
def _prepare_sentences(self, sentences):
    # Dediacritize and tokenize each sentence before feature extraction.
    tokenized = [' '.join(simple_word_tokenize(dediac_ar(s)))
                 for s in sentences]
    sent_array = np.array(tokenized)

    x_trans = self._feat_union.transform(sent_array)
    x_trans_extra = self._feat_union_extra.transform(sent_array)
    x_predict_extra = self._classifier_extra.predict_proba(x_trans_extra)
    x_lm_feats = self._get_lm_feats_multi(sentences)

    # Stack the base features, language-model features, and the extra
    # classifier's probabilities into one sparse feature matrix.
    x_final = sp.sparse.hstack((x_trans, x_lm_feats, x_predict_extra))

    return x_final
def process_verse(self, verse):
    """Processes a verse by removing diacritics.

    Args:
        verse: The input verse.

    Returns:
        The processed verse.
    """
    verse = dediac.dediac_ar(verse)
    return verse
def getStemWToken(self, wToken):
    try:
        stemObject = self.arStemmer.analyze(wToken)
        # Remove tashkeel and normalise
        strText = dediac_ar(stemObject[0]['stem'])
        strText = normalize_teh_marbuta_ar(strText)   # for al-haa (teh marbuta)
        strText = normalize_alef_ar(strText)          # for al-hamza (alef variants)
        strText = normalize_alef_maksura_ar(strText)
        return strText
    except Exception:
        return wToken
def process_tweet(self, tweet):
    """Processes a tweet by normalizing letters, removing URLs,
    and removing diacritics.

    Args:
        tweet: The input tweet.

    Returns:
        The processed tweet.
    """
    # URL/mention/retweet-marker regex (an alternation; the original
    # character class [https|http|@|RT] matched single letters instead)
    URL_REG = re.compile(r'(https?://\S+|@\S+|\bRT\b)')
    # whitespace regex
    SPACE_REG = re.compile(r'\s+')

    tweet = normalize.normalize_unicode(tweet)
    tweet = dediac.dediac_ar(tweet)
    tweet = URL_REG.sub(' ', tweet)
    tweet = SPACE_REG.sub(' ', tweet)

    return tweet
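# Illustrative behavior of process_tweet above (`pp` is a hypothetical
# instance of the surrounding class; the tweet text is made up):
#
#     >>> pp.process_tweet('RT @user: مَرْحَبًا https://t.co/abc')
#     ' مرحبا '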
def softCleaning(self, strText):
    # Remove newlines
    strText = strText.replace('\n', ' ')
    # Remove tashkeel
    strText = dediac_ar(strText)
    # Replace any matched token with its correction from the curated list
    for incorrectToken, correctedToken in self.CuratedList.items():
        strText = strText.replace(incorrectToken, correctedToken)
    # Fix commas and semicolons
    strText = self.replaceWrongComa(strText)
    # Remove extra spaces
    strText = regEx.sub(" +", " ", strText)
    return strText
def _disambiguate_word(self, word):
    word_dd = dediac_ar(word)
    scored_analyses = self._scored_analyses(word_dd)
    return DisambiguatedWord(word, scored_analyses)
def reinflect(self, word, feats):
    """Generate analyses for a given word from a given set of inflectional
    features.

    Arguments:
        word (:obj:`str`): Word to reinflect.
        feats (:obj:`dict`): Dictionary of features.
            See :doc:`/reference/calima_star_features` for more information
            on features and their values.

    Returns:
        :obj:`list` of :obj:`dict`: List of generated analyses.
        See :doc:`/reference/calima_star_features` for more information on
        features and their values.

    Raises:
        :obj:`~camel_tools.calima_star.errors.InvalidReinflectorFeature`:
            If a feature is given that is not defined in database.
        :obj:`~camel_tools.calima_star.errors.InvalidReinflectorFeatureValue`:
            If an invalid value is given to a feature or if 'pos' feature
            is not defined.
    """

    analyses = self._analyzer.analyze(word)

    if not analyses or len(analyses) == 0:
        return []

    for feat in feats:
        if feat not in self._db.defines:
            raise InvalidReinflectorFeature(feat)
        elif self._db.defines[feat] is not None:
            if feat in _ANY_FEATS and feats[feat] == 'ANY':
                continue
            elif feats[feat] not in self._db.defines[feat]:
                raise InvalidReinflectorFeatureValue(feat, feats[feat])

    has_clitics = False
    for feat in _CLITIC_FEATS:
        if feat in feats:
            has_clitics = True
            break

    results = deque()

    for analysis in analyses:
        if dediac_ar(analysis['diac']) != dediac_ar(word):
            continue

        if 'pos' in feats and feats['pos'] != analysis['pos']:
            continue

        lemma = _LEMMA_SPLIT_RE.split(analysis['lex'])[0]

        if 'lex' in feats and feats['lex'] != lemma:
            continue

        is_valid = True
        generate_feats = {}

        for feat in analysis.keys():
            if feat in _IGNORED_FEATS:
                continue
            elif feat in _SPECIFIED_FEATS and feat not in feats:
                continue
            elif has_clitics and feat in _CLITIC_IGNORED_FEATS:
                continue
            else:
                if feat in feats:
                    if feats[feat] == 'ANY':
                        continue
                    elif analysis[feat] != 'na':
                        generate_feats[feat] = feats[feat]
                    else:
                        is_valid = False
                        break
                elif analysis[feat] != 'na':
                    generate_feats[feat] = analysis[feat]

        if is_valid:
            generated = self._generator.generate(lemma, generate_feats)
            if generated is not None:
                results.extend(generated)

    return list(results)
def analyze(self, word):
    """Analyze a given word.

    Args:
        word (:py:obj:`str`): Word to analyze.

    Returns:
        :obj:`list` of :obj:`dict`: The list of analyses for **word**.
        See :doc:`/reference/calima_star_features` for more information on
        features and their values.
    """

    word = word.strip()

    if word == '':
        return []

    analyses = deque()
    word_dediac = dediac_ar(word)
    word_normal = self._normalize(word_dediac)

    if ((self._strict_digit and _is_strict_digit(word)) or
            (not self._strict_digit and _is_digit(word))):
        result = copy.copy(self._db.defaults['digit'])
        result['diac'] = word
        result['stem'] = word
        result['stemgloss'] = word
        result['stemcat'] = None
        result['lex'] = word + '_0'
        result['bw'] = word + '/NOUN_NUM'
        result['source'] = 'digit'

        for feat in _COPY_FEATS:
            if feat in self._db.defines:
                result[feat] = word

        for feat in _UNDEFINED_LEX_FEATS:
            if feat in self._db.defines:
                result[feat] = 'DIGIT'

        if 'catib6' in self._db.defines:
            result['catib6'] = 'NOM'
        if 'ud' in self._db.defines:
            result['ud'] = 'NUM'

        result['pos_freq'] = -99.0
        result['lex_freq'] = -99.0
        result['pos_lex_freq'] = -99.0

        return [result]

    elif _is_punc(word):
        result = copy.copy(self._db.defaults['punc'])
        result['diac'] = word
        result['stem'] = word
        result['stemgloss'] = word
        result['stemcat'] = None
        result['lex'] = word + '_0'
        result['bw'] = word + '/PUNC'
        result['source'] = 'punc'

        for feat in _COPY_FEATS:
            if feat in self._db.defines:
                result[feat] = word

        for feat in _UNDEFINED_LEX_FEATS:
            if feat in self._db.defines:
                result[feat] = 'PUNC'

        if 'catib6' in self._db.defines:
            result['catib6'] = 'PNX'
        if 'ud' in self._db.defines:
            result['ud'] = 'PUNCT'

        result['pos_freq'] = -99.0
        result['lex_freq'] = -99.0
        result['pos_lex_freq'] = -99.0

        return [result]

    elif _has_punc(word):
        pass

    elif not _is_ar(word):
        result = copy.copy(self._db.defaults['noun'])
        result['diac'] = word
        result['stem'] = word
        result['stemgloss'] = word
        result['stemcat'] = None
        result['lex'] = word + '_0'
        result['bw'] = word + '/FOREIGN'
        result['source'] = 'foreign'

        for feat in _COPY_FEATS:
            if feat in self._db.defines:
                result[feat] = word

        for feat in _UNDEFINED_LEX_FEATS:
            if feat in self._db.defines:
                result[feat] = 'FOREIGN'

        if 'catib6' in self._db.defines:
            result['catib6'] = 'FOREIGN'
        if 'ud' in self._db.defines:
            result['ud'] = 'X'

        result['pos_freq'] = -99.0
        result['lex_freq'] = -99.0
        result['pos_lex_freq'] = -99.0

        return [result]

    else:
        segments_gen = _segments_gen(word_normal, self._db.max_prefix_size,
                                     self._db.max_suffix_size)

        for segmentation in segments_gen:
            prefix = segmentation[0]
            stem = segmentation[1]
            suffix = segmentation[2]

            prefix_analyses = self._db.prefix_hash.get(prefix, None)
            suffix_analyses = self._db.suffix_hash.get(suffix, None)

            if prefix_analyses is None or suffix_analyses is None:
                continue

            stem_analyses = self._db.stem_hash.get(stem, None)

            if stem_analyses is not None:
                combined = self._combined_analyses(word_dediac,
                                                   prefix_analyses,
                                                   stem_analyses,
                                                   suffix_analyses)
                analyses.extend(combined)

    if ((self._backoff_condition == 'NOAN' and len(analyses) == 0) or
            (self._backoff_condition == 'ADD')):
        segments_gen = _segments_gen(word_normal, self._db.max_prefix_size,
                                     self._db.max_suffix_size)

        backoff_cats = self._db.stem_backoffs[self._backoff_action]
        stem_analyses = [(cat, analysis)
                         for cat, analysis in self._db.stem_hash['NOAN']
                         if cat in backoff_cats]

        for segmentation in segments_gen:
            prefix = segmentation[0]
            stem = segmentation[1]
            suffix = segmentation[2]

            prefix_analyses = self._db.prefix_hash.get(prefix, None)
            suffix_analyses = self._db.suffix_hash.get(suffix, None)

            if prefix_analyses is None or suffix_analyses is None:
                continue

            combined = self._combined_backoff_analyses(
                stem, word_dediac, prefix_analyses, stem_analyses,
                suffix_analyses)

            analyses.extend(combined)

    result = list(analyses)
    return result
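# Usage sketch for the analyzer above, assuming camel_tools' Analyzer with
# its built-in database (requires the CAMeL data packages to be installed):
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer

db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)
analyses = analyzer.analyze('شارع')
for a in analyses:
    print(a['diac'], a['pos'], a['lex'])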
def _bwtok_dediac(tok):
    return _REMOVE_PLUSES.sub(r'\g<1>', dediac_ar(tok).strip('+_'))
def _default_dediac(tok):
    return dediac_ar(tok)
parser = argparse.ArgumentParser()
parser.add_argument("--input_file_dir",
                    default=None,
                    type=str,
                    required=True,
                    help="The path of the input file.")
parser.add_argument("--output_file_dir",
                    default=None,
                    type=str,
                    help="The path of the output file.")
args = parser.parse_args()

puncs = string.punctuation + ''.join(list(UNICODE_PUNCT_CHARSET))

with open(args.input_file_dir, encoding='utf8') as f, \
        open(args.output_file_dir, mode='w', encoding='utf8') as output_file:
    for line in f:
        line = line.strip()
        line = dediac.dediac_ar(line)
        # Pad each punctuation mark with spaces unless it is followed by
        # a digit.
        line = re.sub(r'([' + re.escape(puncs) + r'])(?!\d)', r' \1 ', line)
        output_file.write(line)
        output_file.write('\n')
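# Hypothetical invocation of the script above (the script and file names
# are illustrative):
#
#     python dediac_punct.py --input_file_dir raw.txt --output_file_dir out.txt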
def _de_diacritize(self, arb_text: str) -> str:
    de_diac_text = dediac_ar(arb_text)
    return de_diac_text
def reinflect(self, word, feats):
    """Generate surface forms and their associated analyses for a given
    word and a given set of (possibly underspecified) features.
    The surface form is accessed through the `diac` feature.

    Arguments:
        word (:obj:`str`): Word to reinflect.
        feats (:obj:`dict`): Dictionary of features.
            See :doc:`/reference/camel_morphology_features` for more
            information on features and their values.

    Returns:
        :obj:`list` of :obj:`dict`: List of generated analyses.
        See :doc:`/reference/camel_morphology_features` for more
        information on features and their values.

    Raises:
        :obj:`~camel_tools.morphology.errors.InvalidReinflectorFeature`:
            If a feature is given that is not defined in database.
        :obj:`~camel_tools.morphology.errors.InvalidReinflectorFeatureValue`:
            If an invalid value is given to a feature or if 'pos' feature
            is not defined.
    """

    analyses = self._analyzer.analyze(word)

    if not analyses or len(analyses) == 0:
        return []

    for feat in feats:
        if feat not in self._db.defines:
            raise InvalidReinflectorFeature(feat)
        elif self._db.defines[feat] is not None:
            if feat in _ANY_FEATS and feats[feat] == 'ANY':
                continue
            elif feats[feat] not in self._db.defines[feat]:
                raise InvalidReinflectorFeatureValue(feat, feats[feat])

    has_clitics = False
    for feat in _CLITIC_FEATS:
        if feat in feats:
            has_clitics = True
            break

    results = deque()

    for analysis in analyses:
        if dediac_ar(analysis['diac']) != dediac_ar(word):
            continue

        if 'pos' in feats and feats['pos'] != analysis['pos']:
            continue

        lemma = _LEMMA_SPLIT_RE.split(analysis['lex'])[0]

        if 'lex' in feats and feats['lex'] != lemma:
            continue

        is_valid = True
        generate_feats = {}

        for feat in analysis.keys():
            if feat in _IGNORED_FEATS:
                continue
            elif feat in _SPECIFIED_FEATS and feat not in feats:
                continue
            elif has_clitics and feat in _CLITIC_IGNORED_FEATS:
                continue
            else:
                if feat in feats:
                    if feats[feat] == 'ANY':
                        continue
                    elif analysis[feat] != 'na':
                        generate_feats[feat] = feats[feat]
                    else:
                        is_valid = False
                        break
                elif analysis[feat] != 'na':
                    generate_feats[feat] = analysis[feat]

        if is_valid:
            generated = self._generator.generate(lemma, generate_feats)
            if generated is not None:
                results.extend(generated)

    # TODO: Temporary fix to get unique analyses
    results = [dict(y) for y in set(tuple(x.items()) for x in results)]

    return list(results)
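# Usage sketch for reinflect above, assuming camel_tools' Reinflector with a
# built-in database opened in reinflection mode (requires the CAMeL data
# packages; the word and features are illustrative):
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.reinflector import Reinflector

db = MorphologyDB.builtin_db(flags='r')
reinflector = Reinflector(db)

# Reinflect the singular noun 'كتاب' into its plural form(s).
analyses = reinflector.reinflect('كتاب', {'pos': 'noun', 'num': 'p'})
for a in analyses:
    print(a['diac'])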
def __init__(self, disambiguator, scheme='atbtok', split=False, diac=False):
    self._disambiguator = disambiguator
    self._scheme = scheme
    self._split = split
    # Keep diacritics on output tokens if requested; otherwise strip them.
    self._diacf = lambda w: w if diac else dediac_ar(w)
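# Usage sketch for the constructor above, assuming it belongs to camel_tools'
# MorphologicalTokenizer (requires the CAMeL data packages to be installed):
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

mle = MLEDisambiguator.pretrained()
tokenizer = MorphologicalTokenizer(mle, scheme='atbtok', split=True)
print(tokenizer.tokenize(['والكتاب']))  # e.g. ['و+', 'الكتاب']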