コード例 #1
0
    def _combined_analyses(self, word_dediac, prefix_analyses, stem_analyses,
                           suffix_analyses):
        """Merge every mutually compatible prefix/stem/suffix analysis.

        Each entry of the three analysis sequences is read as a
        ``(category, features)`` pair. A prefix/stem/suffix triple is kept
        only when the database compatibility tables accept the prefix-stem,
        stem-suffix and prefix-suffix category pairs.

        Args:
            word_dediac: Dediacritized form of the word being analyzed.
            prefix_analyses: Candidate prefix ``(category, features)`` pairs.
            stem_analyses: Candidate stem ``(category, features)`` pairs.
            suffix_analyses: Candidate suffix ``(category, features)`` pairs.

        Returns:
            collections.deque: One merged feature dictionary per compatible
            triple, in iteration order.
        """
        results = deque()

        for prefix_entry, stem_entry in itertools.product(prefix_analyses,
                                                          stem_analyses):
            prefix_cat = prefix_entry[0]
            prefix_feats = prefix_entry[1]
            stem_cat = stem_entry[0]
            stem_feats = stem_entry[1]

            # Skip the whole suffix scan when prefix and stem do not combine.
            if stem_cat not in self._db.prefix_stem_compat[prefix_cat]:
                continue

            for suffix_cat, suffix_feats in suffix_analyses:
                is_compatible = (
                    stem_cat in self._db.stem_suffix_compat
                    and prefix_cat in self._db.prefix_suffix_compat
                    and suffix_cat in self._db.stem_suffix_compat[stem_cat]
                    and suffix_cat
                    in self._db.prefix_suffix_compat[prefix_cat])
                if not is_compatible:
                    continue

                analysis = merge_features(self._db, prefix_feats, stem_feats,
                                          suffix_feats)
                analysis['stem'] = stem_feats['diac']
                analysis['stemcat'] = stem_cat

                # Compare against the input with U+0640 (tatweel) removed;
                # a mismatch is flagged through the 'source' feature.
                analysis_dediac = dediac_ar(analysis['diac'])
                if word_dediac.replace(u'\u0640', '') != analysis_dediac:
                    analysis['source'] = 'spvar'

                results.append(analysis)

        return results
コード例 #2
0
    def disambiguate_word(self, sentence, word_ndx, top=1):
        """Disambiguate the word at index ``word_ndx`` of ``sentence``.

        The dediacritized word is first looked up in the MLE table (when one
        is set); a hit yields a single analysis with score 1.0. Otherwise the
        word is run through the analyzer and every analysis is scored by
        ``10 ** _get_pos_lex_freq(analysis)`` divided by the best such value.

        Args:
            sentence: Indexable collection of words.
            word_ndx: Index of the word to disambiguate.
            top: Number of best analyses to keep; values below 1 keep all of
                them. Defaults to 1.

        Returns:
            DisambiguatedWord: The original word with its scored analyses,
            ordered by descending score, then by ascending length of the
            'bw' feature, then by the 'diac' feature.
        """
        word = sentence[word_ndx]
        word_dd = dediac_ar(word)

        if self._mle is not None and word_dd in self._mle:
            mle_hit = [ScoredAnalysis(1.0, self._mle[word_dd])]
            return DisambiguatedWord(word, mle_hit)

        candidates = self._analyzer.analyze(word_dd)
        if len(candidates) == 0:
            return DisambiguatedWord(word, [])

        raw_scores = [10**_get_pos_lex_freq(cand) for cand in candidates]
        best_score = max(raw_scores)

        ranked = [
            ScoredAnalysis(raw / best_score, cand)
            for cand, raw in zip(candidates, raw_scores)
        ]

        # A single stable sort on a composite key; ties keep analyzer order.
        ranked.sort(key=lambda s: (-s.score,
                                   len(s.analysis['bw']),
                                   s.analysis['diac']))

        kept = ranked if top < 1 else ranked[0:top]
        return DisambiguatedWord(word, kept)
コード例 #3
0
ファイル: analyzer.py プロジェクト: CAMeL-Lab/deSeg
def dediacritize_normalize(word):
    """Dediacritize ``word`` and apply the Alif and Yaa substitutions.

    Args:
        word: Input string.

    Returns:
        The dediacritized string with every ``NORM_ALIF_RE`` match replaced
        by 'ا' and every ``NORM_YAA_RE`` match replaced by 'ي', applied in
        that order.
    """
    without_diacritics = dediac_ar(word)
    alif_normalized = NORM_ALIF_RE.sub('ا', without_diacritics)
    return NORM_YAA_RE.sub('ي', alif_normalized)
コード例 #4
0
def process_text(text):
    """Remove diacritics from the input text.

    Args:
        text: Input text.

    Returns:
        The result of ``dediac.dediac_ar`` applied to ``text``.
    """
    return dediac.dediac_ar(text)
コード例 #5
0
ファイル: __init__.py プロジェクト: salohnana2018/camel_tools
 def _prepare_sentences(self, sentences):
     """Build the combined feature matrix for ``sentences``.

     Each sentence is dediacritized, tokenized with
     ``simple_word_tokenize`` and re-joined with single spaces before being
     passed to both feature unions. The language-model features are computed
     from the untouched ``sentences``.

     Args:
         sentences: Iterable of sentence strings.

     Returns:
         The horizontal stack (``scipy.sparse.hstack``) of the main
         feature-union output, the language-model features and the class
         probabilities predicted by the extra classifier, in that order.
     """
     normalized = []
     for sentence in sentences:
         tokens = simple_word_tokenize(dediac_ar(sentence))
         normalized.append(' '.join(tokens))
     sentence_array = np.array(normalized)

     main_feats = self._feat_union.transform(sentence_array)
     extra_feats = self._feat_union_extra.transform(sentence_array)
     extra_probs = self._classifier_extra.predict_proba(extra_feats)
     lm_feats = self._get_lm_feats_multi(sentences)

     return sp.sparse.hstack((main_feats, lm_feats, extra_probs))
コード例 #6
0
ファイル: data_utils.py プロジェクト: CAMeL-Lab/CAMeLBERT
    def process_verse(self, verse):
        """Remove diacritics from a verse.

        Args:
            verse: Input verse.

        Returns:
            The result of ``dediac.dediac_ar`` applied to ``verse``.
        """
        return dediac.dediac_ar(verse)
コード例 #7
0
 def getStemWToken(self, wToken):
     """Return the normalized stem of ``wToken``, or ``wToken`` on failure.

     The token is analyzed with ``self.arStemmer``; the 'stem' feature of
     the first analysis is dediacritized and then passed through the
     teh-marbuta, alef and alef-maksura normalizers, in that order.

     Args:
         wToken: Token to stem.

     Returns:
         The normalized stem, or ``wToken`` unchanged when analysis or
         normalization raises an ordinary exception (for example, when the
         analyzer returns no analyses).
     """
     try:
         stemObject = self.arStemmer.analyze(wToken)

         # Remove Tashkeel and normalize the stem of the first analysis.
         strText = dediac_ar(stemObject[0]['stem'])
         strText = normalize_teh_marbuta_ar(strText)
         strText = normalize_alef_ar(strText)
         strText = normalize_alef_maksura_ar(strText)
         return strText
     except Exception:
         # Best-effort fallback. ``Exception`` (not a bare ``except``) lets
         # KeyboardInterrupt and SystemExit propagate to the caller.
         return wToken
コード例 #8
0
ファイル: data_utils.py プロジェクト: CAMeL-Lab/CAMeLBERT
    def process_tweet(self, tweet):
        """
        processes a tweet by normalizing unicode, removing diacritics,
        removing urls, @-mentions and the RT marker, and collapsing
        whitespace

        Args:
            input tweet
        Returns:
            processed tweet
        """

        # URL / mention / retweet-marker regex.
        # The previous pattern, '[https|http|@|RT]([^\s]+)', was a character
        # class rather than an alternation: it deleted the tail of any token
        # from the first 'h', 't', 'p', 's', 'R', 'T', '|' or '@' onwards
        # (e.g. "hello" or "this"), not just URLs and mentions.
        URL_REG = re.compile(r'https?\S*|@\S+|\bRT\b')
        # space regex
        SPACE_REG = re.compile(r'[\s]+')

        tweet = normalize.normalize_unicode(tweet)
        tweet = dediac.dediac_ar(tweet)
        # Each match becomes a space; runs of whitespace are collapsed after.
        tweet = URL_REG.sub(' ', tweet)
        tweet = SPACE_REG.sub(' ', tweet)
        return tweet
コード例 #9
0
 def softCleaning(self, strText):
     """Apply light-weight cleaning to ``strText``.

     Steps, in order: newlines become spaces, diacritics are removed, every
     key of ``self.CuratedList`` is replaced by its value, commas and
     semicolons are fixed by ``self.replaceWrongComa``, and runs of spaces
     are collapsed to a single space.

     Args:
         strText: Text to clean.

     Returns:
         The cleaned text.
     """
     # Newlines first, then Tashkeel removal.
     cleaned = strText.replace('\n', ' ')
     cleaned = dediac_ar(cleaned)

     # Substitute every curated (incorrect -> corrected) token pair.
     for wrong, right in self.CuratedList.items():
         cleaned = cleaned.replace(wrong, right)

     # Fix comma and semicolon usage.
     cleaned = self.replaceWrongComa(cleaned)

     # Squeeze repeated spaces.
     return regEx.sub(" +", " ", cleaned)
コード例 #10
0
ファイル: mle.py プロジェクト: Salah856/camel_tools
    def _disambiguate_word(self, word):
        """Score the analyses of ``word`` looked up by its dediacritized form.

        Args:
            word: Word to disambiguate.

        Returns:
            DisambiguatedWord: The original ``word`` paired with the result
            of ``self._scored_analyses`` for its dediacritized form.
        """
        dediacritized = dediac_ar(word)
        return DisambiguatedWord(word, self._scored_analyses(dediacritized))
コード例 #11
0
    def reinflect(self, word, feats):
        """Generate analyses for a word from a set of inflectional features.

        The word is analyzed first. For every analysis whose dediacritized
        'diac' equals the dediacritized word (and that matches any requested
        'pos' / 'lex'), a feature set is assembled from the analysis and the
        requested features and handed to the generator.

        Arguments:
            word (:obj:`str`): Word to reinflect.
            feats (:obj:`dict`): Dictionary of features.
                See :doc:`/reference/calima_star_features` for more information
                on features and their values.

        Returns:
            :obj:`list` of :obj:`dict`: List of generated analyses (empty when
            the analyzer yields nothing for **word**).
            See :doc:`/reference/calima_star_features` for more information on
            features and their values.

        Raises:
            :obj:`~camel_tools.calima_star.errors.InvalidReinflectorFeature`:
                If a feature is given that is not defined in database.
            :obj:`~camel_tools.calima_star.errors.InvalidReinflectorFeatureValue`:
                If a value is given to a feature that the database does not
                list for it.
        """

        analyses = self._analyzer.analyze(word)
        if not analyses:
            return []

        # Validate the requested features before doing any generation.
        for feat, requested in feats.items():
            if feat not in self._db.defines:
                raise InvalidReinflectorFeature(feat)
            allowed = self._db.defines[feat]
            if allowed is None:
                continue
            if feat in _ANY_FEATS and requested == 'ANY':
                continue
            if requested not in allowed:
                raise InvalidReinflectorFeatureValue(feat, requested)

        has_clitics = any(feat in feats for feat in _CLITIC_FEATS)

        results = deque()

        for analysis in analyses:
            if dediac_ar(analysis['diac']) != dediac_ar(word):
                continue
            if 'pos' in feats and feats['pos'] != analysis['pos']:
                continue

            lemma = _LEMMA_SPLIT_RE.split(analysis['lex'])[0]
            if 'lex' in feats and feats['lex'] != lemma:
                continue

            generate_feats = {}

            for feat, current in analysis.items():
                skipped = (
                    feat in _IGNORED_FEATS
                    or (feat in _SPECIFIED_FEATS and feat not in feats)
                    or (has_clitics and feat in _CLITIC_IGNORED_FEATS))
                if skipped:
                    continue

                if feat not in feats:
                    # Not requested: carry over the analysis value.
                    if current != 'na':
                        generate_feats[feat] = current
                    continue

                if feats[feat] == 'ANY':
                    continue
                if current != 'na':
                    generate_feats[feat] = feats[feat]
                else:
                    # Requested feature does not apply to this analysis.
                    break
            else:
                # Reached only when no requested feature was rejected.
                generated = self._generator.generate(lemma, generate_feats)
                if generated is not None:
                    results.extend(generated)

        return list(results)
コード例 #12
0
    def analyze(self, word):
        """Analyze a given word.

        Digits, punctuation and non-Arabic words each yield a single
        analysis built from the matching database default. Other words are
        split into prefix/stem/suffix segmentations that are looked up in the
        database; words that merely contain punctuation skip that lookup.
        Backoff analyses are added according to ``self._backoff_condition``.

        Args:
            word (:py:obj:`str`): Word to analyze. Surrounding whitespace is
                stripped first.

        Returns:
            :obj:`list` of :obj:`dict`: The list of analyses for **word**
            (empty when **word** is blank).
            See :doc:`/reference/calima_star_features` for more information on
            features and their values.
        """

        word = word.strip()

        if word == '':
            return []

        analyses = deque()
        word_dediac = dediac_ar(word)
        word_normal = self._normalize(word_dediac)

        if ((self._strict_digit and _is_strict_digit(word))
                or (not self._strict_digit and _is_digit(word))):
            return [self._build_default_analysis(
                word, default_key='digit', bw_suffix='/NOUN_NUM',
                source='digit', lex_value='DIGIT', catib6='NOM', ud='NUM')]

        elif _is_punc(word):
            return [self._build_default_analysis(
                word, default_key='punc', bw_suffix='/PUNC',
                source='punc', lex_value='PUNC', catib6='PNX', ud='PUNCT')]

        elif _has_punc(word):
            # No regular lookup for words containing punctuation; only the
            # backoff step below can produce analyses for them.
            pass

        elif not _is_ar(word):
            return [self._build_default_analysis(
                word, default_key='noun', bw_suffix='/FOREIGN',
                source='foreign', lex_value='FOREIGN', catib6='FOREIGN',
                ud='X')]

        else:
            segments_gen = _segments_gen(word_normal, self._db.max_prefix_size,
                                         self._db.max_suffix_size)

            for segmentation in segments_gen:
                prefix = segmentation[0]
                stem = segmentation[1]
                suffix = segmentation[2]

                prefix_analyses = self._db.prefix_hash.get(prefix, None)
                suffix_analyses = self._db.suffix_hash.get(suffix, None)

                if prefix_analyses is None or suffix_analyses is None:
                    continue

                stem_analyses = self._db.stem_hash.get(stem, None)

                if stem_analyses is not None:
                    combined = self._combined_analyses(word_dediac,
                                                       prefix_analyses,
                                                       stem_analyses,
                                                       suffix_analyses)
                    analyses.extend(combined)

        # Backoff: either only when nothing was found ('NOAN') or always
        # in addition to the regular analyses ('ADD').
        if ((self._backoff_condition == 'NOAN' and len(analyses) == 0)
                or (self._backoff_condition == 'ADD')):

            segments_gen = _segments_gen(word_normal, self._db.max_prefix_size,
                                         self._db.max_suffix_size)

            backoff_cats = self._db.stem_backoffs[self._backoff_action]
            stem_analyses = [(cat, analysis)
                             for cat, analysis in self._db.stem_hash['NOAN']
                             if cat in backoff_cats]

            for segmentation in segments_gen:
                prefix = segmentation[0]
                stem = segmentation[1]
                suffix = segmentation[2]

                prefix_analyses = self._db.prefix_hash.get(prefix, None)
                suffix_analyses = self._db.suffix_hash.get(suffix, None)

                if prefix_analyses is None or suffix_analyses is None:
                    continue

                combined = self._combined_backoff_analyses(
                    stem, word_dediac, prefix_analyses, stem_analyses,
                    suffix_analyses)
                analyses.extend(combined)

        result = list(analyses)

        return result

    def _build_default_analysis(self, word, default_key, bw_suffix, source,
                                lex_value, catib6, ud):
        """Build the single analysis used for digit, punc and foreign words.

        Args:
            word: The (stripped) surface word; copied into the surface
                features.
            default_key: Key into ``self._db.defaults`` whose entry is
                shallow-copied as the starting point.
            bw_suffix: Text appended to ``word`` for the 'bw' feature.
            source: Value of the 'source' feature.
            lex_value: Value given to every defined feature listed in
                ``_UNDEFINED_LEX_FEATS``.
            catib6: Value of 'catib6', set only if the database defines it.
            ud: Value of 'ud', set only if the database defines it.

        Returns:
            dict: The populated analysis.
        """
        result = copy.copy(self._db.defaults[default_key])
        result['diac'] = word
        result['stem'] = word
        result['stemgloss'] = word
        result['stemcat'] = None
        result['lex'] = word + '_0'
        result['bw'] = word + bw_suffix
        result['source'] = source

        for feat in _COPY_FEATS:
            if feat in self._db.defines:
                result[feat] = word

        for feat in _UNDEFINED_LEX_FEATS:
            if feat in self._db.defines:
                result[feat] = lex_value

        if 'catib6' in self._db.defines:
            result['catib6'] = catib6
        if 'ud' in self._db.defines:
            result['ud'] = ud

        result['pos_freq'] = -99.0
        result['lex_freq'] = -99.0
        result['pos_lex_freq'] = -99.0

        return result
コード例 #13
0
def _bwtok_dediac(tok):
    """Dediacritize ``tok``, trim '+'/'_' from its ends, and apply
    ``_REMOVE_PLUSES``, replacing each match with its first group."""
    trimmed = dediac_ar(tok).strip('+_')
    return _REMOVE_PLUSES.sub(r'\g<1>', trimmed)
コード例 #14
0
def _default_dediac(tok):
    """Return ``tok`` with its diacritics removed by ``dediac_ar``."""
    dediacritized = dediac_ar(tok)
    return dediacritized
コード例 #15
0
# Command-line script: dediacritize every line of the input file and put
# spaces around each punctuation character that is not followed by a digit.
parser = argparse.ArgumentParser()
parser.add_argument("--input_file_dir",
                    default=None,
                    type=str,
                    required=True,
                    help="The input data dir.."
                    )

# Required: the script cannot run without somewhere to write, and leaving
# it unset previously failed later with a TypeError from open(None).
parser.add_argument("--output_file_dir",
                    default=None,
                    type=str,
                    required=True,
                    help="The path of the output file"
                    )

args = parser.parse_args()

puncs = string.punctuation + ''.join(list(UNICODE_PUNCT_CHARSET))

# Compiled once instead of being rebuilt for every line; raw strings keep
# the '\d' escape from being interpreted by the string literal.
punc_re = re.compile(r'([' + re.escape(puncs) + r'])(?!\d)')

# Both files are managed by ``with`` so they are closed even if a line
# fails to process; the input is streamed rather than read in one go.
with open(args.input_file_dir, encoding='utf8') as f, \
        open(args.output_file_dir, mode='w', encoding='utf8') as output_file:
    for line in f:
        line = line.strip()
        line = dediac.dediac_ar(line)
        line = punc_re.sub(r' \1 ', line)
        output_file.write(line)
        output_file.write('\n')
コード例 #16
0
 def _de_diacritize(self, arb_text: str) -> str:
     """Return ``arb_text`` with its diacritics removed by ``dediac_ar``."""
     return dediac_ar(arb_text)
コード例 #17
0
    def reinflect(self, word, feats):
        """Generate surface forms and their associated analyses for a given
        word and a given set of (possibly underspecified) features.
        The surface form is accessed through the `diac` feature.

        Duplicate analyses are removed; the remaining ones are returned in
        the order in which they were first generated.

        Arguments:
            word (:obj:`str`): Word to reinflect.
            feats (:obj:`dict`): Dictionary of features.
                See :doc:`/reference/camel_morphology_features` for more
                information on features and their values.

        Returns:
            :obj:`list` of :obj:`dict`: List of unique generated analyses
            (empty when the analyzer yields nothing for **word**).
            See :doc:`/reference/camel_morphology_features` for more
            information on features and their values.

        Raises:
            :obj:`~camel_tools.morphology.errors.InvalidReinflectorFeature`:
                If a feature is given that is not defined in database.
            :obj:`~camel_tools.morphology.errors.InvalidReinflectorFeatureValue`:
                If a value is given to a feature that the database does not
                list for it.
        """

        analyses = self._analyzer.analyze(word)

        if not analyses:
            return []

        # Validate the requested features before doing any generation.
        for feat in feats:
            if feat not in self._db.defines:
                raise InvalidReinflectorFeature(feat)
            elif self._db.defines[feat] is not None:
                if feat in _ANY_FEATS and feats[feat] == 'ANY':
                    continue
                elif feats[feat] not in self._db.defines[feat]:
                    raise InvalidReinflectorFeatureValue(feat, feats[feat])

        has_clitics = any(feat in feats for feat in _CLITIC_FEATS)

        results = deque()

        for analysis in analyses:
            if dediac_ar(analysis['diac']) != dediac_ar(word):
                continue

            if 'pos' in feats and feats['pos'] != analysis['pos']:
                continue

            lemma = _LEMMA_SPLIT_RE.split(analysis['lex'])[0]

            if 'lex' in feats and feats['lex'] != lemma:
                continue

            is_valid = True
            generate_feats = {}

            for feat in analysis.keys():
                if feat in _IGNORED_FEATS:
                    continue
                elif feat in _SPECIFIED_FEATS and feat not in feats:
                    continue
                elif has_clitics and feat in _CLITIC_IGNORED_FEATS:
                    continue
                else:
                    if feat in feats:
                        if feats[feat] == 'ANY':
                            continue
                        elif analysis[feat] != 'na':
                            generate_feats[feat] = feats[feat]
                        else:
                            # Requested feature does not apply to this
                            # analysis, so nothing is generated from it.
                            is_valid = False
                            break
                    elif analysis[feat] != 'na':
                        generate_feats[feat] = analysis[feat]

            if is_valid:
                generated = self._generator.generate(lemma, generate_feats)
                if generated is not None:
                    results.extend(generated)

        # TODO: Temporary fix to get unique analyses.
        # De-duplicate on the item tuple, as before, but keep first-seen
        # order: iterating a set of tuples of strings gave an order that
        # varied between interpreter runs (string hash randomization).
        seen = set()
        unique_results = []
        for generated_analysis in results:
            key = tuple(generated_analysis.items())
            if key not in seen:
                seen.add(key)
                unique_results.append(dict(key))

        return unique_results
コード例 #18
0
 def __init__(self, disambiguator, scheme='atbtok', split=False,
              diac=False):
     """Store the tokenizer configuration.

     Args:
         disambiguator: Disambiguator object kept for later use.
         scheme: Name of the tokenization scheme. Defaults to 'atbtok'.
         split: Stored as-is in ``self._split``. Defaults to False.
         diac: When truthy, the stored word filter returns words
             unchanged; otherwise it passes them through ``dediac_ar``.
             Defaults to False.
     """
     def _diac_filter(w):
         # ``diac`` is read on every call, like the lambda it replaces.
         if diac:
             return w
         return dediac_ar(w)

     self._disambiguator = disambiguator
     self._scheme = scheme
     self._split = split
     self._diacf = _diac_filter