def main():  # pragma: no cover
    """Entry point of the data download command-line tool.

    Parses the command line with ``docopt``. When ``--list`` is given, prints
    one tab-separated line (name, size, description) per available download
    and exits with status 0. Otherwise looks up ``<PACKAGE>`` in the data
    catalogue and downloads it.

    Every failure path writes a message to standard error and exits with
    status 1: an unknown package name, a :obj:`DownloaderError` raised during
    the download, a keyboard interrupt, or any other unexpected exception.
    """
    try:
        version = ('CAMeL Tools v{}'.format(__version__))
        arguments = docopt(__doc__, version=version)

        if arguments['--list']:
            for dl in DataCatalogue.downloads_list():
                print("{}\t{}\t{}".format(dl.name, dl.size, dl.description))

            sys.exit(0)

        package_name = arguments.get('<PACKAGE>', None)

        try:
            dl_info = DataCatalogue.get_download_info(package_name)
        except Exception:
            # Deliberately not a bare `except:`: KeyboardInterrupt and
            # SystemExit must propagate to the outer handlers instead of
            # being misreported as an invalid package name.
            sys.stderr.write('Error: Invalid package name. Run `camel_data -l`'
                             ' to get a list of available packages.\n')
            sys.exit(1)

        try:
            DataDownloader.download(dl_info)
        except DownloaderError as e:
            sys.stderr.write('Error: {}\n'.format(e.msg))
            sys.exit(1)

    except KeyboardInterrupt:
        sys.stderr.write('Exiting...\n')
        sys.exit(1)
    except Exception:
        # SystemExit derives from BaseException, so the sys.exit() calls
        # above are not intercepted by this catch-all.
        sys.stderr.write('Error: An unknown error occurred.\n')
        sys.exit(1)
def pretrained(model_name=None, analyzer=None, top=1, cache_size=100000):
    """Build an MLE disambiguator from a model shipped with CAMeL Tools.

    Args:
        model_name (:obj:`str`, optional): Name of the pre-trained model to
            load. When None, the default model ('calima-msa-r13') is used.
            Currently the available model names match those listed in
            :ref:`camel_morphology_dbs`. Defaults to None.
        analyzer (:obj:`Analyzer`, optional): Analyzer to use instead of the
            model's default one. When None, a fresh instance of the model's
            default analyzer is created. Defaults to None.
        top (:obj:`int`, optional): Upper bound on the number of top
            analyses returned. Defaults to 1.
        cache_size (:obj:`int`, optional): How many unique word
            disambiguations are cached. The cache uses a
            least-frequently-used eviction policy. Defaults to 100000.

    Returns:
        :obj:`MLEDisambiguator`: The loaded MLE disambiguator.
    """
    info = DataCatalogue.get_dataset_info('DisambigMLE', model_name)
    model_file = info.path / 'model.json'

    # No analyzer supplied: instantiate the one registered for this model.
    if analyzer is None:
        analyzer_factory = _MLE_ANALYZER_MAP[info.name]
        analyzer = analyzer_factory()

    return MLEDisambiguator(analyzer, str(model_file), top, cache_size)
def list_builtin_dbs():
    """List the morphology databases bundled with CAMeL Tools.

    Returns:
        :obj:`list` of :obj:`~camel_tools.data.DatasetInfo`: One entry per
        dataset of the 'MorphologyDB' catalogue component.
    """
    component = DataCatalogue.get_component_info('MorphologyDB')
    # Materialize into a new list so callers get their own container.
    return [*component.datasets]
def pretrained(model_name=None):
    """Create a named entity recognizer from a bundled pre-trained model.

    Args:
        model_name (:obj:`str`, optional): Which pre-trained model to load.
            One model is available: 'arabert'. When None, the default
            model ('arabert') is loaded. Defaults to None.

    Returns:
        :obj:`NERecognizer`: Instance with loaded pre-trained model.
    """
    info = DataCatalogue.get_dataset_info('NamedEntityRecognition',
                                          model_name)
    # The recognizer is given the model location as a plain string.
    return NERecognizer(str(info.path))
def pretrained(model_name=None):
    """Create a sentiment analyzer from a bundled pre-trained model.

    Args:
        model_name (:obj:`str`, optional): Which pre-trained model to load.
            Two models are available: 'arabert' and 'mbert'. When None,
            the default model ('arabert') is loaded. Defaults to None.

    Returns:
        :obj:`SentimentAnalyzer`: Instance with loaded pre-trained model.
    """
    info = DataCatalogue.get_dataset_info('SentimentAnalysis', model_name)
    # The analyzer is given the model location as a plain string.
    return SentimentAnalyzer(str(info.path))
def builtin_db(db_name='calima-msa-r13', flags='a'):
    """Instantiate a :obj:`MorphologyDB` from one of the bundled databases.

    Args:
        db_name (:obj:`str`, optional): Name of the builtin database. Use
            :meth:`list_builtin_dbs` to enumerate the builtin databases, or
            see :ref:`camel_morphology_dbs`. Defaults to 'calima-msa-r13'.
        flags (:obj:`str`, optional): Flag string forwarded to the
            :obj:`MorphologyDB` constructor. Defaults to 'a'.

    Returns:
        :obj:`MorphologyDB`: Instance of builtin database with given flags.
    """
    info = DataCatalogue.get_dataset_info('MorphologyDB', db_name)
    # The database file lives directly inside the dataset directory.
    db_file = Path(info.path, 'morphology.db')
    return MorphologyDB(str(db_file), flags)
'JED': 'Gulf', 'JER': 'Levant', 'KHA': 'Nile Basin', 'MOS': 'Gulf', 'MSA': 'Modern Standard Arabic', 'MUS': 'Gulf', 'RAB': 'Maghreb', 'RIY': 'Gulf', 'SAL': 'Levant', 'SAN': 'Gulf of Aden', 'SFX': 'Maghreb', 'TRI': 'Maghreb', 'TUN': 'Maghreb' } _DATA_DIR = DataCatalogue.get_dataset_info('DialectID').path _CHAR_LM_DIR = Path(_DATA_DIR, 'lm', 'char') _WORD_LM_DIR = Path(_DATA_DIR, 'lm', 'word') _TRAIN_DATA_PATH = Path(_DATA_DIR, 'corpus_26_train.tsv') _TRAIN_DATA_EXTRA_PATH = Path(_DATA_DIR, 'corpus_6_train.tsv') _DEV_DATA_PATH = Path(_DATA_DIR, 'corpus_26_dev.tsv') _TEST_DATA_PATH = Path(_DATA_DIR, 'corpus_26_test.tsv') class DIDPred(collections.namedtuple('DIDPred', ['top', 'scores'])): """A named tuple containing dialect ID prediction results. Attributes: top (:obj:`str`): The dialect label with the highest score. See :ref:`dialectid_labels` for a list of output labels. scores (:obj:`dict`): A dictionary mapping each dialect label to it's