Example #1
    def __init__(self):
        # Curated word list and NLTK Arabic stop words
        self.CuratedList = self.loadCuratedList()
        self.stop_words = set(stopwords.words('arabic'))
        # CAMeL Tools morphological analyzer over the built-in database
        self.arStemmer = Analyzer(MorphologyDB.builtin_db())
        # Regex pattern used for sentence segmentation
        self.sentSegRegexPattern = self.loadSentSegmentationList()
        self.DotChar = '_'
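
Example #1 shows only a constructor, so its imports and helper methods (loadCuratedList, loadSentSegmentationList) are not visible. Below is a minimal standalone sketch of the CAMeL Tools and NLTK pieces it relies on, assuming the NLTK stopword corpus and the default built-in morphology database have been installed:

# Sketch of the components used in the constructor above; assumes
# nltk.download('stopwords') and the CAMeL Tools data have been installed.
from nltk.corpus import stopwords
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer

stop_words = set(stopwords.words('arabic'))      # Arabic stop words
analyzer = Analyzer(MorphologyDB.builtin_db())   # analyzer over the default DB

# Each analysis is a dict of morphological features
for analysis in analyzer.analyze('كتاب'):
    print(analysis.get('lex'), analysis.get('pos'))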
Example #2
    def pretrained(model_name='msa',
                   top=1,
                   use_gpu=True,
                   batch_size=32,
                   cache_size=10000):
        """Load a pre-trained model provided with camel_tools.

        Args:
            model_name (:obj:`str`, optional): Name of pre-trained model to
                load. Three models are available: 'msa', 'egy', and 'glf'.
                Defaults to `msa`.
            top (:obj:`int`, optional): The maximum number of top analyses to
                return. Defaults to 1.
            use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
                Defaults to True.
            batch_size (:obj:`int`, optional): The batch size. Defaults to 32.
            cache_size (:obj:`int`, optional): If greater than zero, then
                the analyzer will cache the analyses for the cache_size most
                frequent words, otherwise no analyses will be cached.
                Defaults to 10000.

        Returns:
            :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
            pre-trained model.
        """

        model_info = CATALOGUE.get_dataset('DisambigBertUnfactored',
                                           model_name)
        model_config = _read_json(Path(model_info.path, 'default_config.json'))
        model_path = str(model_info.path)
        features = FEATURE_SET_MAP[model_config['feature']]
        db = MorphologyDB.builtin_db(model_config['db_name'], 'a')
        analyzer = Analyzer(db,
                            backoff=model_config['backoff'],
                            cache_size=cache_size)
        scorer = model_config['scorer']
        tie_breaker = model_config['tie_breaker']
        ranking_cache = model_config['ranking_cache']

        return BERTUnfactoredDisambiguator(model_path,
                                           analyzer,
                                           top=top,
                                           features=features,
                                           scorer=scorer,
                                           tie_breaker=tie_breaker,
                                           use_gpu=use_gpu,
                                           batch_size=batch_size,
                                           ranking_cache=ranking_cache)
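
A usage sketch for the factory above. The import path, the 'msa' model name, and the disambiguate() interface (a list of tokens in, one disambiguated word with ranked analyses out per token) follow the CAMeL Tools documentation, but treat the details below as assumptions rather than guaranteed behaviour:

# Usage sketch; requires the 'msa' disambiguation model data to be installed.
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator

bert = BERTUnfactoredDisambiguator.pretrained('msa', top=1, use_gpu=False)

sentence = ['ذهب', 'الولد', 'الى', 'المدرسة']
for d in bert.disambiguate(sentence):
    # d.word is the input token; d.analyses is a ranked list of scored
    # analyses, each holding a feature dict in .analysis
    top = d.analyses[0].analysis
    print(d.word, top.get('pos'), top.get('lex'))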
Example #3
def load(lang, nlp=None):
    # Make sure the language is supported
    supported = {"en", "ar"}
    if lang not in supported:
        raise Exception("%s is an unsupported or unknown language" % lang)

    if lang == "en":
        # Load spacy
        nlp = nlp or spacy.load(lang, disable=["ner"])

        # Load language edit merger
        merger = import_module("errant.%s.merger" % lang)

        # Load language edit classifier
        classifier = import_module("errant.%s.classifier" % lang)
        # The English classifier needs spacy
        classifier.nlp = nlp

        # Return a configured ERRANT annotator
        return Annotator(lang, nlp, merger, classifier)

    if lang == "ar":
        # Load spacy
        # nlp = nlp or spacy.load(lang, disable=["ner"])
        db = MorphologyDB.builtin_db()
        analyzer = Analyzer(db)
        mled = MLEDisambiguator.pretrained()
        tagger = DefaultTagger(mled, 'pos')
        nlp = [analyzer, tagger]

        # Load language edit merger
        merger = import_module("errant.%s.merger" % lang)

        # Load language edit classifier
        classifier = import_module("errant.%s.classifier" % lang)
        # The Arabic classifier does not use spacy
        #classifier.nlp = nlp

        # Return a configured ERRANT annotator
        return Annotator(lang, nlp, merger, classifier)
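
Example #3 comes from an Arabic-adapted fork of ERRANT in which the spacy pipeline is replaced by a CAMeL Tools analyzer/tagger pair for the 'ar' branch. A usage sketch, assuming the fork keeps ERRANT's package-level load() entry point shown above and that Annotator stores its nlp argument:

# Usage sketch; the package name `errant` and the structure of
# annotator.nlp follow the code above and are otherwise assumptions.
import errant

annotator = errant.load('ar')

# For Arabic, annotator.nlp is the [analyzer, tagger] pair rather than a
# spacy pipeline; the tagger can be used directly to POS-tag tokens.
analyzer, tagger = annotator.nlp
print(tagger.tag(['ذهب', 'الولد', 'الى', 'المدرسة']))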
Example #4
def _calima_egy_r13_analyzer():
    db = MorphologyDB.builtin_db('calima-egy-r13', 'a')
    analyzer = Analyzer(db, 'NOAN_PROP')
    return analyzer
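
The second argument to Analyzer selects a backoff mode: with 'NOAN_PROP', words that have no analysis in the database are backed off to proper-noun analyses instead of coming back empty. A small sketch of the difference, assuming the calima-egy-r13 database is installed; the out-of-vocabulary token is only illustrative:

from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer

db = MorphologyDB.builtin_db('calima-egy-r13', 'a')

strict = Analyzer(db)                # default backoff 'NONE': OOV words get []
backoff = Analyzer(db, 'NOAN_PROP')  # OOV words get proper-noun analyses

oov = 'فيسبوكيون'  # illustrative token unlikely to be in the database
print(len(strict.analyze(oov)), len(backoff.analyze(oov)))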
Example #5
    if s_size > max_sentence:
        max_sentence = s_size

    sentence_size += s_size

    fd.close()

print(min_sentence, max_sentence, sentence_size / len(sentences))

# Extract morphological properties of every word in the corpus

db = MorphologyDB.builtin_db()
analyzer = Analyzer(db)

# # Create analyzer with NOAN_PROP backoff
# analyzer = Analyzer(db, 'NOAN_PROP')

training_set = []

for sentence in sentences:
    s = []
    for word in sentence:
        
        analyses = analyzer.analyze(word['INPUT STRING'])
        # print(word, analyses)
        for d in analyses:
            # print(get_tag(d['bw']) == sentences[0][0]['POS'])
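
The get_tag() helper referenced above is not defined in the snippet. Given that ATB-style 'POS' fields and the analyzer's 'bw' feature both look like 'Al/DET+kitAb/NOUN', one plausible reading is a helper that strips the surface forms and keeps only the tag sequence; a hypothetical sketch, not the author's actual implementation:

def get_tag(bw):
    """Hypothetical stand-in for get_tag(): reduce a Buckwalter-style
    analysis such as 'Al/DET+kitAb/NOUN+i/CASE_DEF_GEN' to its tag
    sequence 'DET+NOUN+CASE_DEF_GEN'."""
    return '+'.join(seg.split('/')[-1] for seg in bw.split('+'))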
Example #6
def main():  # pragma: no cover
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments.get('--list', False):
            _list_dbs()
            sys.exit(1)

        analyze = arguments.get('analyze', False)
        generate = arguments.get('generate', False)
        reinflect = arguments.get('reinflect', False)

        cache = arguments.get('--cache', False)
        backoff = arguments.get('--backoff', 'NONE')

        # Make sure we have a valid backoff mode
        if backoff is None:
            backoff = 'NONE'
        if analyze and backoff not in _ANALYSIS_BACKOFFS:
            sys.stderr.write('Error: invalid backoff mode.\n')
            sys.exit(1)
        if generate and backoff not in _GENARATION_BACKOFFS:
            sys.stderr.write('Error: invalid backoff mode.\n')
            sys.exit(1)

        # Open files (or just use stdin and stdout)
        fin, fout = _open_files(arguments['FILE'], arguments['--output'])

        # Determine required DB flags
        if analyze:
            dbflags = 'a'
        elif generate and backoff == 'NONE':
            dbflags = 'g'
        else:
            dbflags = 'r'

        # Load DB
        try:
            dbname = arguments.get('--db', _DEFAULT_DB)
            if dbname in _BUILTIN_DBS:
                db = MorphologyDB.builtin_db(dbname, dbflags)
            else:
                db = MorphologyDB(dbname, dbflags)
        except DatabaseError:
            sys.stderr.write('Error: Couldn\'t parse database.\n')
            sys.exit(1)
        except IOError:
            sys.stderr.write('Error: Database file could not be read.\n')
            sys.exit(1)

        # Continue execution in requested mode
        if analyze:
            try:
                _analyze(db, fin, fout, backoff, cache)
            except AnalyzerError as error:
                sys.stderr.write('Error: {}\n'.format(error.msg))
                sys.exit(1)
            except IOError:
                sys.stderr.write('Error: An IO error occurred.\n')
                sys.exit(1)

        elif generate:
            try:
                _generate(db, fin, fout, backoff)
            except IOError:
                sys.stderr.write('Error: An IO error occurred.\n')
                sys.exit(1)

        elif reinflect:
            try:
                _reinflect(db, fin, fout)
            except IOError:
                sys.stderr.write('Error: An IO error occurred.\n')
                sys.exit(1)

        sys.exit(0)

    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
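
The _analyze, _generate, and _reinflect helpers dispatched to above are not shown. A rough sketch of what the analyze branch reduces to once main() has resolved the database, backoff, and cache options, using only public CAMeL Tools APIs; the input line and output format here are illustrative, not the CLI's actual behaviour:

from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.tokenizers.word import simple_word_tokenize

# Default built-in DB opened with the 'a' (analysis) flag, analyzer built
# with the requested backoff and cache size.
db = MorphologyDB.builtin_db(flags='a')
analyzer = Analyzer(db, backoff='NONE', cache_size=10000)

line = 'ذهب الولد الى المدرسة'
for token in simple_word_tokenize(line):
    analyses = analyzer.analyze(token)
    print(token, len(analyses))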