def __init__(self):
    """Set up the resources this object needs: curated word list, Arabic
    stopwords, a morphological analyzer, and sentence-segmentation data."""
    # Placeholder character used in place of dots during processing.
    self.DotChar = '_'
    # Curated list and sentence-segmentation regex come from instance
    # helpers defined elsewhere in this class.
    self.CuratedList = self.loadCuratedList()
    self.sentSegRegexPattern = self.loadSentSegmentationList()
    # NLTK Arabic stopwords, as a set for O(1) membership tests.
    self.stop_words = set(stopwords.words('arabic'))
    # CAMeL Tools analyzer over the default built-in morphology DB.
    self.arStemmer = Analyzer(MorphologyDB.builtin_db())
def pretrained(model_name='msa', top=1, use_gpu=True, batch_size=32,
               cache_size=10000):
    """Load a pre-trained model provided with camel_tools.

    Args:
        model_name (:obj:`str`, optional): Name of pre-trained model to
            load. Three models are available: 'msa', 'egy', and 'glf'.
            Defaults to 'msa'.
        top (:obj:`int`, optional): The maximum number of top analyses
            to return. Defaults to 1.
        use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
            Defaults to True.
        batch_size (:obj:`int`, optional): The batch size.
            Defaults to 32.
        cache_size (:obj:`int`, optional): If greater than zero, then
            the analyzer will cache the analyses for the cache_size most
            frequent words, otherwise no analyses will be cached.
            Defaults to 10000.

    Returns:
        :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
        pre-trained model.
    """
    # Resolve the packaged model and its JSON configuration.
    model_info = CATALOGUE.get_dataset('DisambigBertUnfactored', model_name)
    model_config = _read_json(Path(model_info.path, 'default_config.json'))
    model_path = str(model_info.path)
    features = FEATURE_SET_MAP[model_config['feature']]
    # Analyzer over the model's morphology DB, opened in analysis mode.
    db = MorphologyDB.builtin_db(model_config['db_name'], 'a')
    analyzer = Analyzer(db,
                        backoff=model_config['backoff'],
                        cache_size=cache_size)
    return BERTUnfactoredDisambiguator(model_path,
                                       analyzer,
                                       top=top,
                                       features=features,
                                       scorer=model_config['scorer'],
                                       tie_breaker=model_config['tie_breaker'],
                                       use_gpu=use_gpu,
                                       batch_size=batch_size,
                                       ranking_cache=model_config['ranking_cache'])
def load(lang, nlp=None):
    """Create a configured ERRANT Annotator for the given language.

    Args:
        lang: Language code; only "en" and "ar" are supported.
        nlp: Optional preloaded spacy pipeline (used for "en" only; for
            "ar" a CAMeL Tools analyzer/tagger pair is always built).

    Returns:
        Annotator: annotator wired with the language's merger and
        classifier modules.

    Raises:
        Exception: If lang is not a supported language.
    """
    # Make sure the language is supported
    supported = {"en", "ar"}
    if lang not in supported:
        raise Exception("%s is an unsupported or unknown language" % lang)

    # Load language edit merger and classifier (identical for both
    # languages, so done once instead of per-branch).
    merger = import_module("errant.%s.merger" % lang)
    classifier = import_module("errant.%s.classifier" % lang)

    if lang == "en":
        # Load spacy (NER disabled; not needed for edit annotation)
        nlp = nlp or spacy.load(lang, disable=["ner"])
        # The English classifier needs spacy
        classifier.nlp = nlp
    else:  # lang == "ar"
        # Build a CAMeL Tools pipeline (analyzer + POS tagger) in place
        # of spacy; any caller-supplied nlp is ignored for Arabic.
        db = MorphologyDB.builtin_db()
        analyzer = Analyzer(db)
        mled = MLEDisambiguator.pretrained()
        tagger = DefaultTagger(mled, 'pos')
        nlp = [analyzer, tagger]

    # Return a configured ERRANT annotator
    return Annotator(lang, nlp, merger, classifier)
def _calima_egy_r13_analyzer():
    """Build an Analyzer over the CALIMA EGY r13 database with the
    NOAN_PROP backoff mode."""
    egy_db = MorphologyDB.builtin_db('calima-egy-r13', 'a')
    return Analyzer(egy_db, 'NOAN_PROP')
if s_size > max_sentence: max_sentence = s_size sentence_size += s_size fd.close() print(min_sentence, max_sentence, sentence_size/len(sentences)) # Extract Morphological properties of every word from corpus db = MorphologyDB.builtin_db() analyzer = Analyzer(db) # # Create analyzer with NOAN_PROP backoff # analyzer = Analyzer(db, 'NOAN_PROP') training_set = [] for sentence in sentences: s = [] for word in sentence: analyses = analyzer.analyze(word['INPUT STRING']) # print(word, analyses) for d in analyses: # print(get_tag(d['bw']) == sentences[0][0]['POS'])
def main():  # pragma: no cover
    """CLI entry point: dispatch to analysis, generation, or reinflection."""
    try:
        args = docopt(__doc__,
                      version='CAMeL Tools v{}'.format(__version__))

        if args.get('--list', False):
            _list_dbs()
            # NOTE(review): exits non-zero even after a successful
            # listing — preserved as-is; confirm whether 0 was intended.
            sys.exit(1)

        analyze = args.get('analyze', False)
        generate = args.get('generate', False)
        reinflect = args.get('reinflect', False)
        cache = args.get('--cache', False)

        # Normalize the backoff mode, then validate it for the action.
        backoff = args.get('--backoff', 'NONE')
        if backoff is None:
            backoff = 'NONE'
        bad_analyze_backoff = analyze and backoff not in _ANALYSIS_BACKOFFS
        bad_generate_backoff = generate and backoff not in _GENARATION_BACKOFFS
        if bad_analyze_backoff or bad_generate_backoff:
            sys.stderr.write('Error: invalid backoff mode.\n')
            sys.exit(1)

        # Open files (or just use stdin and stdout)
        fin, fout = _open_files(args['FILE'], args['--output'])

        # Pick DB access flags: 'a' = analysis, 'g' = generation,
        # 'r' = reinflection (also used for generation with backoff).
        if analyze:
            db_flags = 'a'
        elif generate and backoff == 'NONE':
            db_flags = 'g'
        else:
            db_flags = 'r'

        # Load the requested DB: builtin by name, otherwise from a path.
        try:
            db_name = args.get('--db', _DEFAULT_DB)
            if db_name in _BUILTIN_DBS:
                db = MorphologyDB.builtin_db(db_name, db_flags)
            else:
                db = MorphologyDB(db_name, db_flags)
        except DatabaseError:
            sys.stderr.write('Error: Couldn\'t parse database.\n')
            sys.exit(1)
        except IOError:
            sys.stderr.write('Error: Database file could not be read.\n')
            sys.exit(1)

        # Run the requested mode; each reports errors and exits non-zero.
        if analyze:
            try:
                _analyze(db, fin, fout, backoff, cache)
            except AnalyzerError as err:
                sys.stderr.write('Error: {}\n'.format(err.msg))
                sys.exit(1)
            except IOError:
                sys.stderr.write('Error: An IO error occurred.\n')
                sys.exit(1)
        elif generate:
            try:
                _generate(db, fin, fout, backoff)
            except IOError:
                sys.stderr.write('Error: An IO error occurred.\n')
                sys.exit(1)
        elif reinflect:
            try:
                _reinflect(db, fin, fout)
            except IOError:
                sys.stderr.write('Error: An IO error occurred.\n')
                sys.exit(1)

        sys.exit(0)
    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        # Last-resort guard so the CLI never dies with a raw traceback.
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)