def __init__(self, db):
    """Initialize the reinflector from an open CALIMA Star database.

    Args:
        db: a :obj:`CalimaStarDB` instance. Must have been opened with
            generation support (reinflection needs both analysis and
            generation).

    Raises:
        ReinflectorError: if ``db`` is not a :obj:`CalimaStarDB`, or if it
            was not opened in a mode that supports generation.
    """
    # Guard the type first so the flags check below can't raise AttributeError.
    if not isinstance(db, CalimaStarDB):
        raise ReinflectorError('DB is not an instance of CalimaStarDB')
    if not db.flags.generation:
        raise ReinflectorError('DB does not support reinflection')
    self._db = db
    # Reinflection = analyze the surface word, then re-generate from the
    # lemma with the requested features; both components share one db.
    self._analyzer = CalimaStarAnalyzer(db)
    self._generator = CalimaStarGenerator(db)
def __init__(self, grammar, separator, min_base_length, dialects):
    """Set up the de-lexical grammar analyzer.

    Args:
        grammar: path to a CALIMA Star grammar database, or the string
            ``'built-in'`` to use the free bundled almor-msa database.
        separator: clitic separator character used in segmentations.
        min_base_length: minimum allowed length for a predicted base token.
        dialects: list of dialect (subgrammar) labels to keep analyses from.
    """
    self.separator = separator
    self.grammar_file = grammar
    self.min_base_length = min_base_length
    # Dialects are represented as subgrammars that are merged to form the
    # full grammar. Each analysis produced by a grammar should report the
    # subgrammar generating it; analyses are pruned if their subgrammar is
    # not in this list of desired dialects.
    self.dialects = dialects
    # The free built-in grammar database doesn't distinguish bases.
    # If that grammar is used, a cheap hack predicts the base token.
    if self.grammar_file == 'built-in' or 'built-in' in self.dialects:
        # The free grammar only supports the MSA variety of Arabic.
        self.dialects = ['built-in', 'MSA']
        self.grammar = CalimaStarDB(os.path.join(DESEG_DIR, 'grammar.db'),
                                    'a')
        # Order of tags used to predict which token belongs to the base
        # when multiple tags occur. Proof-of-concept ordering; for good
        # MSA results consider the Sama database.
        # BUGFIX: the original list had a missing comma
        # ('PRON_2D' 'PRON_2MS'), which silently concatenated the two into
        # the bogus tag 'PRON_2DPRON_2MS' and dropped both real tags; it
        # also listed 'PRON_2MS' and 'PRON_2D' twice (only the first
        # occurrence can ever match, so the duplicates are removed).
        self.open_classes_hierarchy = [
            'NOUN', 'ADJ', 'VERB', 'IV', 'PV', 'CV', 'ADV', 'NOUN_PROP',
            'IV_PASS', 'PV_PASS', 'VERB_PART', 'FOREIGN', 'PSEUDO_VERB',
            'FOCUS_PART', 'REL_ADV', 'ABBREV', 'PART', 'INTERROG_PRON',
            'REL_PRON', 'NOUN_QUANT', 'PRON_3MS', 'PRON_3MP', 'PRON_3D',
            'PRON_2D', 'PRON_2MS', 'PRON_2FS', 'PRON_1S', 'PRON_2MP',
            'PRON_3FS', 'PRON_3FP', 'PRON_1P', 'DEM_PRON_FP',
            'DEM_PRON_MP', 'DEM_PRON_MS', 'DEM_PRON', 'DEM_PRON_F',
            'DEM_PRON_FD', 'DEM_PRON_MD', 'DEM_PRON_FS', 'FUT_PART',
            'NEG_PART', 'VOC_PART', 'NOUN_NUM', 'PREP', 'SUB_CONJ',
            'CONJ', 'INTERJ', 'INTERROG_ADV', 'INTERROG_PART',
            'EXCLAM_PRON', 'NUMERIC_COMMA', 'PUNC', 'DET'
        ]
    else:
        # Try to load the specified grammar database in analyze mode.
        try:
            self.grammar = CalimaStarDB(grammar, 'a')
        # Resort to the free built-in grammar database if the specified
        # database can't be found.
        except FileNotFoundError:
            stderr.write(
                '\nCould not locate grammar database "{}"\nLoading built-in database almor-msa\n'
                .format(grammar))
            self.grammar = CalimaStarDB(
                os.path.join(DESEG_DIR, 'grammar.db'), 'a')
            self.grammar_file = 'built-in'
            self.dialects = ['built-in', 'MSA']
    # Run the analyzer in back-off mode, where input words can be any POS.
    self.analyzer = CalimaStarAnalyzer(self.grammar, 'NOAN_ALL')
def _analyze(db, fin, fout, backoff, cache):
    """Analyze every token of *fin* line by line, writing results to *fout*.

    Each token's serialized analyses are written followed by a blank line.
    When *cache* is truthy, serialized output is memoized per token so
    repeated tokens skip re-analysis.
    """
    analyzer = CalimaStarAnalyzer(db, backoff)
    memoize_table = {} if cache else None

    def emit(text):
        # Python 2 streams need explicit encoding; Python 3 writes str as-is.
        fout.write(text if six.PY3 else force_encoding(text))
        fout.write('\n\n')

    while True:
        line = force_unicode(fin.readline())
        if not line:
            # readline() returns '' only at EOF.
            break
        for token in _tokenize(line.strip()):
            if cache and token in memoize_table:
                emit(memoize_table[token])
            else:
                serialized = _serialize_analyses(
                    fout, token, analyzer.analyze(token), db.order)
                if cache:
                    memoize_table[token] = serialized
                emit(serialized)
def _analyze(db, fin, fout, backoff, cache, num_disambig=None):
    """Analyze every token of *fin* line by line, writing results to *fout*.

    Args:
        db: open :obj:`CalimaStarDB` (``db.order`` controls serialization).
        fin: input text stream, read line by line until EOF.
        fout: output stream; each token's serialized analyses are written
            followed by a blank line.
        backoff: back-off mode passed to :obj:`CalimaStarAnalyzer`.
        cache: if truthy, enable the analyzer's internal LRU cache.
        num_disambig: if not ``None``, run MLE disambiguation and keep only
            the top *num_disambig* analyses per token.

    BUGFIX: the original body called ``analyzer.analyze(token)``
    unconditionally and then either discarded that result (disambiguation
    path) or recomputed it in the ``else`` branch — every token was
    analyzed twice. The analysis is now computed exactly once per token.
    """
    if cache:
        analyzer = CalimaStarAnalyzer(db, backoff, cache_size=1024)
    else:
        analyzer = CalimaStarAnalyzer(db, backoff)

    disambig = None
    if num_disambig is not None:
        disambig = MLEDisambiguator(analyzer)

    line = force_unicode(fin.readline())
    while line:
        tokens = _tokenize(line.strip())
        for token in tokens:
            if disambig is not None:
                # Keep only the top-scoring analyses per the MLE model.
                disambiguated = disambig.disambiguate([token], num_disambig)
                analyses = [a.analysis for a in disambiguated[0].analyses]
            else:
                analyses = analyzer.analyze(token)
            serialized = _serialize_analyses(fout, token, analyses, db.order)
            # Python 2 streams need explicit encoding; Python 3 writes str.
            if six.PY3:
                fout.write(serialized)
            else:
                fout.write(force_encoding(serialized))
            fout.write('\n\n')
        line = force_unicode(fin.readline())
from camel_tools.calima_star.database import CalimaStarDB
from camel_tools.calima_star.analyzer import CalimaStarAnalyzer
import camel_tools.utils

# Open the free almor-msa database in analysis ('a') mode.
# NOTE(review): hard-coded site-packages path — breaks outside this exact
# Python install; consider resolving via camel_tools' data API instead.
db = CalimaStarDB(
    '/usr/local/lib/python3.7/site-packages/camel_tools/calima_star/databases/almor-msa/almor-msa-r13.db',
    'a')
analyzer = CalimaStarAnalyzer(db)

# Word list used elsewhere in the pipeline (Arabic broken plurals).
PP4S = [
    'فتية',
    'صبية',
    'غلمة',
    'جيرة',
    'إخوة',
    'شيخة',
    'ثيرة'
]

# Arabic letters treated as vowels by isAVowel: alif, alif-hamza, yaa,
# alif maqsura, waw.
_VOWEL_LETTERS = frozenset(('ا', 'أ', 'ي', 'ى', 'و'))


def isAVowel(char):
    """Return True if *char* is one of the Arabic vowel letters.

    Replaces the original five-way if/return chain with a single set
    membership test — same truth table, O(1), easier to extend.
    """
    return char in _VOWEL_LETTERS

#أفعل
class CalimaStarReinflector(object):
    """CALIMA Star reinflector component.

    Arguments:
        db (:obj:`~camel_tools.calima_star.database.CalimaStarDB`): Database
            to use for generation. Must be opened in reinflection mode or
            both analysis and generation modes.

    Raises:
        :obj:`~camel_tools.calima_star.errors.ReinflectorError`: If **db** is
            not an instance of
            :obj:`~camel_tools.calima_star.database.CalimaStarDB` or if
            **db** does not support reinflection.
    """

    def __init__(self, db):
        # Check the type first so the flags lookup below can't raise
        # AttributeError on a non-database argument.
        if not isinstance(db, CalimaStarDB):
            raise ReinflectorError('DB is not an instance of CalimaStarDB')
        if not db.flags.generation:
            raise ReinflectorError('DB does not support reinflection')
        self._db = db
        # Reinflection analyzes the surface word, then regenerates from the
        # lemma with the requested features; both share the same database.
        self._analyzer = CalimaStarAnalyzer(db)
        self._generator = CalimaStarGenerator(db)

    def reinflect(self, word, feats):
        """Generate analyses for a given word from a given set of
        inflectional features.

        Arguments:
            word (:obj:`str`): Word to reinflect.
            feats (:obj:`dict`): Dictionary of features.
                See :doc:`/reference/calima_star_features` for more
                information on features and their values.

        Returns:
            :obj:`list` of :obj:`dict`: List of generated analyses.
            See :doc:`/reference/calima_star_features` for more information
            on features and their values.

        Raises:
            :obj:`~camel_tools.calima_star.errors.InvalidReinflectorFeature`:
                If a feature is given that is not defined in database.
            :obj:`~camel_tools.calima_star.errors.InvalidReinflectorFeatureValue`:
                If an invalid value is given to a feature or if 'pos' feature
                is not defined.
        """
        analyses = self._analyzer.analyze(word)

        # No analyses: the word is unknown to the database, nothing to do.
        if not analyses or len(analyses) == 0:
            return []

        # Validate requested features against the database's definitions
        # before doing any generation work.
        for feat in feats:
            if feat not in self._db.defines:
                raise InvalidReinflectorFeature(feat)
            elif self._db.defines[feat] is not None:
                # 'ANY' is a wildcard, allowed only for features in
                # _ANY_FEATS; anything else must be a defined value.
                if feat in _ANY_FEATS and feats[feat] == 'ANY':
                    continue
                elif feats[feat] not in self._db.defines[feat]:
                    raise InvalidReinflectorFeatureValue(feat, feats[feat])

        # Clitic features in the request change which analysis features are
        # carried over to generation (see _CLITIC_IGNORED_FEATS below).
        has_clitics = False
        for feat in _CLITIC_FEATS:
            if feat in feats:
                has_clitics = True
                break

        results = deque()

        for analysis in analyses:
            # Only analyses whose dediacritized form matches the input word
            # are candidates for reinflection.
            if dediac_ar(analysis['diac']) != dediac_ar(word):
                continue

            if 'pos' in feats and feats['pos'] != analysis['pos']:
                continue

            # Lemma may carry a sense index suffix; compare on the split base.
            lemma = _LEMMA_SPLIT_RE.split(analysis['lex'])[0]

            if 'lex' in feats and feats['lex'] != lemma:
                continue

            is_valid = True
            generate_feats = {}

            # Merge the analysis features with the requested overrides into
            # the feature set passed to the generator.
            for feat in analysis.keys():
                if feat in _IGNORED_FEATS:
                    continue
                elif feat in _SPECIFIED_FEATS and feat not in feats:
                    continue
                elif has_clitics and feat in _CLITIC_IGNORED_FEATS:
                    continue
                else:
                    if feat in feats:
                        if feats[feat] == 'ANY':
                            # Wildcard: keep whatever the analysis has.
                            continue
                        elif analysis[feat] != 'na':
                            # Requested value overrides the analysis value.
                            generate_feats[feat] = feats[feat]
                        else:
                            # Feature not applicable ('na') for this
                            # analysis: the override can't be honored.
                            is_valid = False
                            break
                    elif analysis[feat] != 'na':
                        generate_feats[feat] = analysis[feat]

            if is_valid:
                generated = self._generator.generate(lemma, generate_feats)
                if generated is not None:
                    results.extend(generated)

        return list(results)
class Analyzer:
    """Analyzer that runs an input word through a de-lexical grammar.

    ``get_possible_segmentations`` returns a list of possible segmentations;
    each is a 4-item list::

        [[proclitics], base, [enclitics], dialect]

    where the proclitic/enclitic lists may be empty and the base is a
    string.

    Fixes relative to the original:
      * missing comma in ``open_classes_hierarchy`` ('PRON_2D' 'PRON_2MS'
        silently concatenated into the bogus tag 'PRON_2DPRON_2MS'),
        plus duplicate 'PRON_2MS'/'PRON_2D' entries removed;
      * ``== None`` / ``!= None`` replaced with ``is (not) None``;
      * bare ``try/assert/except:`` validation replaced with explicit
        checks (asserts are stripped under ``-O`` and a bare except hides
        unrelated errors).
    """

    def __init__(self, grammar, separator, min_base_length, dialects):
        """Load the grammar database and configure segmentation options.

        Args:
            grammar: path to a grammar database or ``'built-in'``.
            separator: clitic separator character.
            min_base_length: minimum allowed base-token length.
            dialects: dialect (subgrammar) labels to keep analyses from.
        """
        self.separator = separator
        self.grammar_file = grammar
        self.min_base_length = min_base_length
        # Dialects are represented as subgrammars that are merged to form
        # the full grammar. Each analysis reports the subgrammar generating
        # it; analyses are pruned if that subgrammar is not desired here.
        self.dialects = dialects
        # The free built-in grammar database doesn't distinguish bases.
        # If that grammar is used, a cheap hack predicts the base token.
        if self.grammar_file == 'built-in' or 'built-in' in self.dialects:
            # The free grammar only supports the MSA variety of Arabic.
            self.dialects = ['built-in', 'MSA']
            self.grammar = CalimaStarDB(
                os.path.join(DESEG_DIR, 'grammar.db'), 'a')
            # Order of tags used to predict which token belongs to the base
            # when multiple tags occur. Proof-of-concept ordering; for good
            # MSA results consider the Sama database.
            self.open_classes_hierarchy = [
                'NOUN', 'ADJ', 'VERB', 'IV', 'PV', 'CV', 'ADV', 'NOUN_PROP',
                'IV_PASS', 'PV_PASS', 'VERB_PART', 'FOREIGN', 'PSEUDO_VERB',
                'FOCUS_PART', 'REL_ADV', 'ABBREV', 'PART', 'INTERROG_PRON',
                'REL_PRON', 'NOUN_QUANT', 'PRON_3MS', 'PRON_3MP', 'PRON_3D',
                'PRON_2D', 'PRON_2MS', 'PRON_2FS', 'PRON_1S', 'PRON_2MP',
                'PRON_3FS', 'PRON_3FP', 'PRON_1P', 'DEM_PRON_FP',
                'DEM_PRON_MP', 'DEM_PRON_MS', 'DEM_PRON', 'DEM_PRON_F',
                'DEM_PRON_FD', 'DEM_PRON_MD', 'DEM_PRON_FS', 'FUT_PART',
                'NEG_PART', 'VOC_PART', 'NOUN_NUM', 'PREP', 'SUB_CONJ',
                'CONJ', 'INTERJ', 'INTERROG_ADV', 'INTERROG_PART',
                'EXCLAM_PRON', 'NUMERIC_COMMA', 'PUNC', 'DET'
            ]
        else:
            # Try to load the specified grammar database in analyze mode.
            try:
                self.grammar = CalimaStarDB(grammar, 'a')
            # Resort to the free built-in grammar database if the specified
            # database can't be found.
            except FileNotFoundError:
                stderr.write(
                    '\nCould not locate grammar database "{}"\nLoading built-in database almor-msa\n'
                    .format(grammar))
                self.grammar = CalimaStarDB(
                    os.path.join(DESEG_DIR, 'grammar.db'), 'a')
                self.grammar_file = 'built-in'
                self.dialects = ['built-in', 'MSA']
        # Run the analyzer in back-off mode: input words can be any POS.
        self.analyzer = CalimaStarAnalyzer(self.grammar, 'NOAN_ALL')

    def get_possible_segmentations(self, word):
        """Return possible segmentations of *word*.

        Assumes the input word is already normalized if necessary. Each
        returned item is ``[[proclitics], base, [enclitics], dialect]``.
        Falls back to the whole word as the base (dialect 'MSA') when no
        acceptable analysis is produced.
        """
        possible_segmentations = []
        min_base_length = min(len(word), self.min_base_length)
        # Run the analyzer; a KeyError signals a database inconsistency and
        # is handled below with a whole-word default segmentation.
        try:
            analyses = self.analyzer.analyze(word)
            completed_analyses = {}
            # Parse each analysis.
            for analysis in analyses:
                # Check the subgrammar that produced it.
                dialect = self.get_analysis_dialect(analysis)
                if dialect in self.dialects:
                    possible_segmentation = [[], None, [], dialect]
                    # Parse free built-in Almor grammar analysis.
                    if 'built-in' in self.dialects:
                        analysis = dediacritize_normalize(
                            self.accomodate_built_in_grammar(word, analysis))
                    # Parse non-standard dialect subgrammar analysis.
                    elif dialect != 'MSA':
                        analysis = dediacritize_normalize(
                            self.accomodate_DA_grammar(word, analysis))
                    # Parse Sama MSA grammar analysis.
                    else:
                        analysis = dediacritize_normalize(
                            analysis.get('d3seg', None))
                    # If no analysis, default to the entire word as the base.
                    if analysis is None:
                        possible_segmentation[1] = word
                        possible_segmentations.append(possible_segmentation)
                        break
                    # Skip duplicate (analysis, dialect) pairs so identical
                    # segmentations aren't processed twice.
                    if tuple([analysis, dialect]) not in completed_analyses:
                        completed_analyses[tuple([analysis, dialect])] = True
                        # Make sure no segmentations leak characters (our
                        # grammars are adapted from databases designed for
                        # segmentation): joining all tokens must restore
                        # the original word.
                        cat_tok = analysis.replace('+', '').replace('_', '')
                        if cat_tok == word:
                            # Separate tokens.
                            analysis = analysis.split('_')
                            # Handle words entirely consisting of diacritics.
                            if len(analysis) == 0:
                                possible_segmentation[1] = word
                            # For non-empty words:
                            else:
                                # Only consider tokens consisting of more
                                # than just diacritics.
                                all_tokens_empty = True
                                for token in analysis:
                                    if len(token.strip(self.separator)) != 0:
                                        all_tokens_empty = False
                                        # Handle proclitics (trailing sep).
                                        if self.separator == token[-1]:
                                            possible_segmentation[0].append(
                                                token)
                                        # Handle enclitics (leading sep).
                                        elif self.separator == token[0]:
                                            possible_segmentation[2].append(
                                                token)
                                        # Handle base.
                                        else:
                                            possible_segmentation[1] = token
                                # Finish handling words entirely consisting
                                # of diacritics.
                                if all_tokens_empty:
                                    possible_segmentation[1] = word
                            # Prune ill-formed bases.
                            base = possible_segmentation[1]
                            if (base is not None
                                    and len(base) >= min_base_length
                                    and possible_segmentation
                                    not in possible_segmentations):
                                # and base in self.vocabulary
                                possible_segmentations.append(
                                    possible_segmentation)
        # If inconsistency in the database, word will be the base with no
        # clitics.
        except KeyError:
            possible_segmentation = [[], word, [], 'MSA']
            possible_segmentations.append(possible_segmentation)
            stderr.write(
                '\nGrammar database key error for {}\nUsing default segmentation analysis {}\n'
                .format(word, str(possible_segmentations)))
        # And if no reasonable analyses are produced, default base is the
        # word with no clitics.
        if len(possible_segmentations) == 0:
            possible_segmentations = [[[], word, [], 'MSA']]
        return possible_segmentations

    def accomodate_DA_grammar(self, word, analysis):
        """Build a separator-marked token string from a DA analysis.

        DA subgrammars don't give D3tok, so the 'diac' field is parsed
        instead: ``proclitics#base#enclitics`` with '+'/'_' joints.
        """
        analysis_seg = analysis['diac'].replace('_', '+').split('#')
        if len(analysis_seg) != 3:
            stderr.write('Bad Analysis!!!\n\t{}\n{}\n{}\n\n'.format(
                str(analysis_seg), str(word), str(analysis)))
            # Fall back to the whole word as base with no clitics.
            analysis_seg = ['', word, '']
        tokens = []
        proclitics = analysis_seg[0].split('+')
        for pro in proclitics:
            tokens.append('{}+_'.format(pro))
        tokens.append(analysis_seg[1])
        enclitics = analysis_seg[2].split('+')
        for en in enclitics:
            tokens.append('_+{}'.format(en))
        return ''.join(tokens)

    def accomodate_built_in_grammar(self, word, analysis):
        """Build a separator-marked token string from an Almor analysis.

        Almor doesn't give D3tok, so the Buckwalter ('bw') field is parsed:
        after normalization it alternates morpheme/tag pairs. The first
        morpheme whose tag matches the open-class hierarchy is taken as the
        base; earlier morphemes become proclitics, later ones enclitics.
        """
        analysis = analysis['bw'].replace('+', '/').strip('/').split('/')
        open_class_tag = None
        for open_class in self.open_classes_hierarchy:
            if open_class in analysis:
                open_class_tag = open_class
                break
        # Explicit validation (the original used try/assert with a bare
        # except, which disappears under -O and masks unrelated errors).
        if open_class_tag is None:
            stderr.write(
                'Could not find a base token!\nPlease add the problematic tag to the open_classes_hierarchy in the greedy_analyzer.py'
            )
            stderr.write('{}\n'.format(word))
            stderr.write('{}\n'.format(str(analysis)))
            stderr.write('{}\n'.format(str(self.open_classes_hierarchy)))
            exit()
        if len(analysis) % 2 != 0:
            stderr.write('Malformed analysis!\n')
            stderr.write('{}\n'.format(word))
            stderr.write('{}\n'.format(str(analysis)))
            exit()
        tokens = []
        pro = True
        # Walk morpheme/tag pairs: morphemes at even indices, tags at odd.
        for m in range(0, len(analysis), 2):
            token = dediacritize_normalize(analysis[m])
            if len(token) > 0:
                if pro and analysis[m + 1] == open_class_tag:
                    # First open-class match is the base.
                    pro = False
                    tokens.append('{}'.format(token))
                else:
                    if pro:
                        tokens.append('{}+_'.format(token))
                    else:
                        tokens.append('_+{}'.format(token))
        return ''.join(tokens)

    def get_analysis_dialect(self, analysis_dict):
        """Return the dialect label of an analysis, or None if undetectable.

        The built-in grammar is MSA-only; otherwise the dialect is read
        from a marker in the analysis 'gloss' field via DIALECT_RE.
        """
        if 'built-in' in self.dialects:
            return 'MSA'
        else:
            dialect = DIALECT_RE.findall(analysis_dict['gloss'])
            if len(dialect) == 0:
                return None
            else:
                # Strip the marker delimiters, keeping the 3-letter label.
                return dialect[0][1:4]
def __init__(self, analyzer_db_path):
    """Build an analysis pipeline from a CALIMA Star database path.

    Args:
        analyzer_db_path: filesystem path to the analyzer database.
    """
    self.db = CalimaStarDB(analyzer_db_path)
    # Large cache: the analyzer memoizes per-token analyses internally.
    self.analyzer = CalimaStarAnalyzer(self.db, cache_size=46000)
    self.disambiguator = MLEDisambiguator(self.analyzer)
    # Maps words to their extracted features; filled lazily by callers.
    self.w_to_features = {}