def test_sl_pretokenized_conllu():
    classla.download('sl', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('sl', tokenize_pretokenized='conllu', dir=TEST_MODELS_DIR)
    # pretokenized input: CoNLL-U columns are tab-separated
    conllu_pretokenized = """
# newpar id = 1
# sent_id = 1.1
# text = France Prešeren je rojen v Vrbi.
1	France	France	_	_	_	_	_	_	_
2	Prešeren	Prešeren	_	_	_	_	_	_	_
3	je	biti	_	_	_	_	_	_	_
4	rojen	rojen	_	_	_	_	_	_	_
5	v	v	_	_	_	_	_	_	_
6	Vrbi	Vrba	_	_	_	_	_	_	SpaceAfter=No
7	.	.	_	_	_	_	_	_	_
"""
    doc = nlp(conllu_pretokenized)
    assert doc.to_conll().strip() == SL_STANDARD_CONLL
if args.output is None:
    output_file_path = args.text_file + '.out'
else:
    output_file_path = args.output
# map language code to treebank shorthand
if args.treebank is not None:
    treebank_shorthand = args.treebank
else:
    treebank_shorthand = default_treebanks[args.language]
# check for models
print('checking for models...')
lang_models_dir = '%s/%s_models' % (args.models_dir, treebank_shorthand)
if not os.path.exists(lang_models_dir):
    print('could not find: ' + lang_models_dir)
    download(treebank_shorthand, resource_dir=args.models_dir, force=args.force_download)
# set up pipeline
pipeline_config = \
    dict([(k, v) for k, v in vars(args).items() if k in PROCESSOR_SETTINGS_LIST and v is not None])
pipeline = Pipeline(processors=args.processors, treebank=treebank_shorthand,
                    models_dir=args.models_dir, **pipeline_config)
# build document
print('running pipeline...')
doc = pipeline(open(args.text_file).read())
# write conll to file
doc.write_conll_to_file(output_file_path)
print('done.')
print('results written to: ' + output_file_path)
    sentence_df = sentence_df[[
        'docId', 'sentenceId', 'tokenId', 'text', 'lemma', 'calcLemma', 'upos',
        'xpos', 'ner', 'clID'
    ]]  # leaving out 'misc' for now
    return sentence_df, warnings


if __name__ == '__main__':
    datasets_files = json.load(open('./data/results/dataset_pairs.json'))
    languages = set([
        lang for dataset in datasets_files for lang in datasets_files[dataset].keys()
    ])
    print(languages)

    processors = 'tokenize,pos,lemma'
    if DOWNLOAD_RESOURCES:  # do it once on a new system
        for lang in languages:
            # the datasets use 'ua' for Ukrainian; stanza expects the ISO 639-1 code 'uk'
            lang = lang if lang != 'ua' else 'uk'
            print(f'Downloading {lang}...')
            stanza.download(lang, processors=processors)
        classla.download('sl')
        classla.download('bg')

    tokenizers = {
        lang: stanza.Pipeline(lang=lang if lang != 'ua' else 'uk', processors=processors)
        for lang in languages
    }
    # Slovenian and Bulgarian are handled by classla rather than stanza
    tokenizers['sl'] = classla.Pipeline('sl', processors=processors)
    tokenizers['bg'] = classla.Pipeline('bg', processors=processors)

    split_documents(datasets_files, tokenizers)
example_sentences = {
    "sl": "France Prešeren je rojen v Vrbi.",
    "hr": "Ante Starčević rođen je u Velikom Žitniku.",
    "sr": "Slobodan Jovanović rođen je u Novom Sadu.",
    "bg": "Алеко Константинов е роден в Свищов."
}

if args.lang not in example_sentences:
    print(f'Sorry, but we don\'t have a demo sentence for "{args.lang}" for the moment. '
          f'Try one of these languages: {list(example_sentences.keys())}')
    sys.exit(1)

# download the models
classla.download(args.lang, args.models_dir, confirm_if_exists=True)
# set up a pipeline
print('---')
print('Building pipeline...')
pipeline = classla.Pipeline(models_dir=args.models_dir, lang=args.lang, use_gpu=(not args.cpu))
# process the document
doc = pipeline(example_sentences[args.lang])
# access nlp annotations
print('')
print('Input: {}'.format(example_sentences[args.lang]))
print("The tokenizer split the input into {} sentences.".format(len(doc.sentences)))
print('---')
print('tokens of first sentence: ')
example_sentences = {
    "sl": "France Prešeren je rojen v Vrbi.",
    "hr": "Ante Starčević rođen je u Velikom Žitniku.",
    "sr": "Slobodan Jovanović rođen je u Novom Sadu.",
    "bg": "Алеко Константинов е роден в Свищов."
}

if args.lang not in example_sentences:
    print(f'Sorry, but we don\'t have a demo sentence for "{args.lang}" for the moment. '
          f'Try one of these languages: {list(example_sentences.keys())}')
    sys.exit(1)

# download the models
classla.download(args.lang, args.models_dir)
# set up a pipeline
print('---')
print('Building pipeline...')
pipeline = classla.Pipeline(dir=args.models_dir, lang=args.lang, use_gpu=(not args.cpu))
# process the document
doc = pipeline(example_sentences[args.lang])
# access nlp annotations
print('')
print('Input: {}'.format(example_sentences[args.lang]))
print("The tokenizer split the input into {} sentences.".format(len(doc.sentences)))
print('---')
print('tokens of first sentence: ')
def test_sl_inflectional():
    classla.download('sl', dir=TEST_MODELS_DIR)
    # standard Slovenian pipeline with the inflectional lexicon enabled for POS tagging
    nlp = classla.Pipeline('sl', pos_use_lexicon=True, dir=TEST_MODELS_DIR)
    doc = nlp(SL_STANDARD)
    assert doc.to_conll().strip() == SL_STANDARD_CONLL
def test_sl_standard_jos():
    # Slovenian standard models with JOS-style annotations
    classla.download('sl', type='standard_jos', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('sl', type='standard_jos', dir=TEST_MODELS_DIR)
    doc = nlp(SL_STANDARD_JOS)
    assert doc.to_conll().strip() == SL_STANDARD_JOS_CONLL
def test_mk_standard():
    classla.download('mk', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('mk', dir=TEST_MODELS_DIR)
    doc = nlp(MK_STANDARD)
    assert doc.to_conll().strip() == MK_STANDARD_CONLL
def test_sr_nonstandard():
    classla.download('sr', type='nonstandard', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('sr', type='nonstandard', dir=TEST_MODELS_DIR)
    doc = nlp(SR_NONSTANDARD)
    assert doc.to_conll().strip() == SR_NONSTANDARD_CONLL
def test_all_downloads():
    # fetch every model variant exercised by the tests above
    classla.download('sl', dir=TEST_MODELS_DIR)
    classla.download('sl', type='standard_jos', dir=TEST_MODELS_DIR)
    classla.download('sl', type='nonstandard', dir=TEST_MODELS_DIR)
    classla.download('hr', dir=TEST_MODELS_DIR)
    classla.download('hr', type='nonstandard', dir=TEST_MODELS_DIR)
    classla.download('sr', dir=TEST_MODELS_DIR)
    classla.download('sr', type='nonstandard', dir=TEST_MODELS_DIR)
    classla.download('bg', dir=TEST_MODELS_DIR)
    classla.download('mk', dir=TEST_MODELS_DIR)