def __init__(self):
    self.CuratedList = self.loadCuratedList()
    # NLTK Arabic stop-word list
    self.stop_words = set(stopwords.words('arabic'))
    # CAMeL Tools morphological analyzer over the built-in database
    self.arStemmer = Analyzer(MorphologyDB.builtin_db())
    # Regex patterns used for sentence segmentation
    self.sentSegRegexPattern = self.loadSentSegmentationList()
    self.DotChar = '_'
def pretrained(model_name='msa', top=1, use_gpu=True, batch_size=32,
               cache_size=10000):
    """Load a pre-trained model provided with camel_tools.

    Args:
        model_name (:obj:`str`, optional): Name of pre-trained model to
            load. Three models are available: 'msa', 'egy', and 'glf'.
            Defaults to `msa`.
        top (:obj:`int`, optional): The maximum number of top analyses
            to return. Defaults to 1.
        use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
            Defaults to True.
        batch_size (:obj:`int`, optional): The batch size. Defaults to 32.
        cache_size (:obj:`int`, optional): If greater than zero, then the
            analyzer will cache the analyses for the cache_size most
            frequent words, otherwise no analyses will be cached.
            Defaults to 10000.

    Returns:
        :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
        pre-trained model.
    """

    model_info = CATALOGUE.get_dataset('DisambigBertUnfactored', model_name)
    model_config = _read_json(Path(model_info.path, 'default_config.json'))
    model_path = str(model_info.path)
    features = FEATURE_SET_MAP[model_config['feature']]
    db = MorphologyDB.builtin_db(model_config['db_name'], 'a')
    analyzer = Analyzer(db,
                        backoff=model_config['backoff'],
                        cache_size=cache_size)
    scorer = model_config['scorer']
    tie_breaker = model_config['tie_breaker']
    ranking_cache = model_config['ranking_cache']

    return BERTUnfactoredDisambiguator(model_path,
                                       analyzer,
                                       top=top,
                                       features=features,
                                       scorer=scorer,
                                       tie_breaker=tie_breaker,
                                       use_gpu=use_gpu,
                                       batch_size=batch_size,
                                       ranking_cache=ranking_cache)
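# Hedged usage sketch: this assumes the function above is exposed as
# BERTUnfactoredDisambiguator.pretrained() (as in CAMeL Tools) and that the
# 'msa' model data has already been downloaded; the sample sentence is a
# placeholder.
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize

bert_disambig = BERTUnfactoredDisambiguator.pretrained('msa', top=1)
tokens = simple_word_tokenize('ذهب الولد إلى المدرسة')
for dword in bert_disambig.disambiguate(tokens):
    # Each disambiguated word carries scored analyses; take the top one.
    if dword.analyses:
        print(dword.word, dword.analyses[0].analysis['lex'])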
def pretrained_from_config(config, top=1, use_gpu=True, batch_size=32,
                           cache_size=10000):
    """Load a pre-trained model from a config file.

    Args:
        config (:obj:`str`): Config file that defines the model details.
        top (:obj:`int`, optional): The maximum number of top analyses
            to return. Defaults to 1.
        use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
            Defaults to True.
        batch_size (:obj:`int`, optional): The batch size. Defaults to 32.
        cache_size (:obj:`int`, optional): If greater than zero, then the
            analyzer will cache the analyses for the cache_size most
            frequent words, otherwise no analyses will be cached.
            Defaults to 10000.

    Returns:
        :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
        pre-trained model.
    """

    model_config = _read_json(config)
    model_path = model_config['model_path']
    features = FEATURE_SET_MAP[model_config['feature']]
    db = MorphologyDB(model_config['db_path'], 'a')
    analyzer = Analyzer(db,
                        backoff=model_config['backoff'],
                        cache_size=cache_size)
    scorer = model_config['scorer']
    tie_breaker = model_config['tie_breaker']
    ranking_cache = model_config['ranking_cache']

    return BERTUnfactoredDisambiguator(model_path,
                                       analyzer,
                                       top=top,
                                       features=features,
                                       scorer=scorer,
                                       tie_breaker=tie_breaker,
                                       use_gpu=use_gpu,
                                       batch_size=batch_size,
                                       ranking_cache=ranking_cache)
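# Hedged sketch of a config for the loader above, inferred only from the keys
# it reads ('model_path', 'feature', 'db_path', 'backoff', 'scorer',
# 'tie_breaker', 'ranking_cache'); all values are illustrative placeholders,
# not shipped defaults.
import json

example_config = {
    'model_path': '/path/to/bert/checkpoint',  # fine-tuned BERT model dir
    'feature': 'pos',                          # must be a key in FEATURE_SET_MAP
    'db_path': '/path/to/morphology.db',       # CAMeL Tools morphology database
    'backoff': 'NONE',                         # analyzer backoff mode
    'scorer': 'uniform',                       # placeholder scorer name
    'tie_breaker': 'tag',                      # placeholder tie-breaking strategy
    'ranking_cache': None,                     # optional ranking-cache setting
}

with open('my_config.json', 'w') as f:
    json.dump(example_config, f)

disambiguator = pretrained_from_config('my_config.json', top=1)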
def load(lang, nlp=None):
    # Make sure the language is supported
    supported = {"en", "ar"}
    if lang not in supported:
        raise Exception("%s is an unsupported or unknown language" % lang)

    if lang == "en":
        # Load spacy
        nlp = nlp or spacy.load(lang, disable=["ner"])
        # Load language edit merger
        merger = import_module("errant.%s.merger" % lang)
        # Load language edit classifier
        classifier = import_module("errant.%s.classifier" % lang)
        # The English classifier needs spacy
        classifier.nlp = nlp
        # Return a configured ERRANT annotator
        return Annotator(lang, nlp, merger, classifier)

    if lang == "ar":
        # Load spacy
        # nlp = nlp or spacy.load(lang, disable=["ner"])
        db = MorphologyDB.builtin_db()
        analyzer = Analyzer(db)
        mled = MLEDisambiguator.pretrained()
        tagger = DefaultTagger(mled, 'pos')
        nlp = [analyzer, tagger]
        # Load language edit merger
        merger = import_module("errant.%s.merger" % lang)
        # Load language edit classifier
        classifier = import_module("errant.%s.classifier" % lang)
        # The English classifier needs spacy
        # classifier.nlp = nlp
        # Return a configured ERRANT annotator
        return Annotator(lang, nlp, merger, classifier)
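# Hedged usage sketch, assuming this modified package is importable as
# `errant` and that the Arabic configuration still follows the standard
# ERRANT Annotator interface (parse() then annotate()); the edit attributes
# o_str, c_str and type are the usual ERRANT fields, and the sample sentences
# are placeholders.
import errant

annotator = errant.load('ar')
orig = annotator.parse('جملة أصلية فيها خطأ')    # original (source) sentence
cor = annotator.parse('جملة أصلية فيها أخطاء')   # corrected (target) sentence
for edit in annotator.annotate(orig, cor):
    print(edit.o_str, '->', edit.c_str, edit.type)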
def _calima_egy_r13_analyzer():
    db = MorphologyDB.builtin_db('calima-egy-r13', 'a')
    analyzer = Analyzer(db, 'NOAN_PROP')
    return analyzer
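# Hedged example of querying the analyzer built above; Analyzer.analyze()
# returns a list of analysis dicts (with keys such as 'diac', 'lex', 'pos').
# The calima-egy-r13 database must be installed (e.g. via the CAMeL Tools
# data downloader) for this to run; the word is a placeholder.
analyzer = _calima_egy_r13_analyzer()
for analysis in analyzer.analyze('كتب'):
    print(analysis['lex'], analysis['pos'], analysis['diac'])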
    if s_size > max_sentence:
        max_sentence = s_size
    sentence_size += s_size

fd.close()
print(min_sentence, max_sentence, sentence_size / len(sentences))

# Extract Morphological properties of every word from corpus
db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)
# # Create analyzer with NOAN_PROP backoff
# analyzer = Analyzer(db, 'NOAN_PROP')

training_set = []
for sentence in sentences:
    s = []
    for word in sentence:
        analyses = analyzer.analyze(word['INPUT STRING'])
        # print(word, analyses)
        for d in analyses:
            # print(get_tag(d['bw']) == sentences[0][0]['POS'])
def main():  # pragma: no cover
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments.get('--list', False):
            _list_dbs()
            sys.exit(1)

        analyze = arguments.get('analyze', False)
        generate = arguments.get('generate', False)
        reinflect = arguments.get('reinflect', False)

        cache = arguments.get('--cache', False)
        backoff = arguments.get('--backoff', 'NONE')

        # Make sure we have a valid backoff mode
        if backoff is None:
            backoff = 'NONE'
        if analyze and backoff not in _ANALYSIS_BACKOFFS:
            sys.stderr.write('Error: invalid backoff mode.\n')
            sys.exit(1)
        if generate and backoff not in _GENARATION_BACKOFFS:
            sys.stderr.write('Error: invalid backoff mode.\n')
            sys.exit(1)

        # Open files (or just use stdin and stdout)
        fin, fout = _open_files(arguments['FILE'], arguments['--output'])

        # Determine required DB flags
        if analyze:
            dbflags = 'a'
        elif generate and backoff == 'NONE':
            dbflags = 'g'
        else:
            dbflags = 'r'

        # Load DB
        try:
            dbname = arguments.get('--db', _DEFAULT_DB)
            if dbname in _BUILTIN_DBS:
                db = MorphologyDB.builtin_db(dbname, dbflags)
            else:
                db = MorphologyDB(dbname, dbflags)
        except DatabaseError:
            sys.stderr.write('Error: Couldn\'t parse database.\n')
            sys.exit(1)
        except IOError:
            sys.stderr.write('Error: Database file could not be read.\n')
            sys.exit(1)

        # Continue execution in requested mode
        if analyze:
            try:
                _analyze(db, fin, fout, backoff, cache)
            except AnalyzerError as error:
                sys.stderr.write('Error: {}\n'.format(error.msg))
                sys.exit(1)
            except IOError:
                sys.stderr.write('Error: An IO error occurred.\n')
                sys.exit(1)
        elif generate:
            try:
                _generate(db, fin, fout, backoff)
            except IOError:
                sys.stderr.write('Error: An IO error occurred.\n')
                sys.exit(1)
        elif reinflect:
            try:
                _reinflect(db, fin, fout)
            except IOError:
                sys.stderr.write('Error: An IO error occurred.\n')
                sys.exit(1)

        sys.exit(0)

    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
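# Hedged CLI sketch: the docopt usage string is not shown in this excerpt, but
# the options read above (--db, --backoff, --output, --list, --cache) and the
# analyze/generate/reinflect commands suggest invocations along these lines.
# The entry-point name `camel_morphology` is an assumption based on CAMeL Tools.
#
#   camel_morphology analyze --db calima-msa-r13 --backoff NOAN_PROP \
#       --output analyses.txt input.txt
#   camel_morphology generate --backoff REINFLECT forms.txt
#   camel_morphology --list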
def _list_dbs():
    for db in sorted(MorphologyDB.list_builtin_dbs()):
        sys.stdout.write('{}\t{}\n'.format(db.name, db.version))
import re
import sys

from docopt import docopt

import camel_tools as camelt
from camel_tools.utils.charsets import AR_DIAC_CHARSET
from camel_tools.utils.stringutils import force_unicode, force_encoding
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.generator import Generator
from camel_tools.morphology.reinflector import Reinflector
from camel_tools.morphology.errors import DatabaseError, AnalyzerError
from camel_tools.morphology.errors import GeneratorError, MorphologyError

__version__ = camelt.__version__

_ANALYSIS_BACKOFFS = frozenset(
    ('NONE', 'NOAN_ALL', 'NOAN_PROP', 'ADD_ALL', 'ADD_PROP'))
_GENARATION_BACKOFFS = frozenset(('NONE', 'REINFLECT'))
_BUILTIN_DBS = frozenset([db.name for db in MorphologyDB.list_builtin_dbs()])
_DEFAULT_DB = 'calima-msa-r13'

_DIAC_RE = re.compile(r'[' + re.escape(u''.join(AR_DIAC_CHARSET)) + r']')


def _tokenize(s):
    return s.split()


def _dediac(word):
    return _DIAC_RE.sub('', word)


def _to_int(s):
    s = str(s)
""" import re import sys from docopt import docopt import camel_tools from camel_tools.morphology.database import MorphologyDB from camel_tools.disambig.mle import MLEDisambiguator from camel_tools.tokenizers.word import simple_word_tokenize from camel_tools.cli.utils import open_files __version__ = camel_tools.__version__ _BUILTIN_DBS = MorphologyDB.list_builtin_dbs() _DEFAULT_DB = 'calima-msa-r13' _WHITESPACE_RE = re.compile(r'\s+|\S+') def _diac_tokens(tokens, disambig, ignore_markers, marker, strip_markers, pretokenized): result = [] for token in tokens: if len(token.strip()) == 0: result.append(token) elif ignore_markers and token.startswith(marker): if strip_markers: result.append(token[len(marker):]) else: