def test_pretokenized():
    nlp = classla.Pipeline(**{'processors': 'tokenize', 'models_dir': '.', 'lang': 'en', 'tokenize_pretokenized': True})
    doc = nlp(EN_DOC_PRETOKENIZED)
    assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    doc = nlp(EN_DOC_PRETOKENIZED_LIST)
    assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])

def test_pretokenized():
    nlp = classla.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'sl', 'tokenize_pretokenized': True})
    doc = nlp(SL_DOC_PRETOKENIZED)
    assert SL_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    doc = nlp(SL_DOC_PRETOKENIZED_LIST)
    assert SL_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])

def main():
    nlp = classla.Pipeline('en', processors='tokenize,pos,lemma,depparse')
    doc = nlp('Unban Mox Opal! Unban Mox Opal!')
    # print(doc.sentences[0].dependencies)
    print(doc)
    print(process_doc(doc, "{}=source >obj=zzz {}=target"))

def test_tokenize():
    nlp = classla.Pipeline(processors='tokenize', models_dir=TEST_MODELS_DIR, lang='sl')
    doc = nlp(SL_DOC)
    assert SL_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])

def test_missing_requirements():
    """
    Try to build several pipelines with bad configs and check thrown exceptions against gold exceptions.
    :return: None
    """
    # list of (bad config, list of gold ProcessorRequirementsExceptions that should be thrown) pairs
    bad_config_lists = [
        # missing tokenize
        (
            # input config
            {'processors': 'pos,depparse', 'models_dir': TEST_MODELS_DIR, 'lang': 'en'},
            # 2 expected exceptions
            [
                {'processor_type': 'POSProcessor',
                 'processors_list': ['pos', 'depparse'],
                 'provided_reqs': set([]),
                 'requires': set(['tokenize'])},
                {'processor_type': 'DepparseProcessor',
                 'processors_list': ['pos', 'depparse'],
                 'provided_reqs': set([]),
                 'requires': set(['tokenize', 'pos'])}
            ]
        ),
        # no pos when lemma_pos set to True
        (
            # input config
            {'processors': 'tokenize,mwt,lemma', 'models_dir': TEST_MODELS_DIR, 'lang': 'en', 'lemma_pos': True},
            # 1 expected exception
            [
                {'processor_type': 'LemmaProcessor',
                 'processors_list': ['tokenize', 'mwt', 'lemma'],
                 'provided_reqs': set(['tokenize', 'mwt']),
                 'requires': set(['tokenize', 'pos'])}
            ]
        )
    ]
    # try to build each bad config, catch exceptions, check against gold
    pipeline_fails = 0
    for bad_config, gold_exceptions in bad_config_lists:
        try:
            classla.Pipeline(**bad_config)
        except PipelineRequirementsException as e:
            pipeline_fails += 1
            assert isinstance(e, PipelineRequirementsException)
            assert len(e.processor_req_fails) == len(gold_exceptions)
            for processor_req_e, gold_exception in zip(e.processor_req_fails, gold_exceptions):
                # compare the thrown ProcessorRequirementsExceptions against gold
                check_exception_vals(processor_req_e, gold_exception)
    # check pipeline building failed twice
    assert pipeline_fails == 2

def test_ner():
    nlp = classla.Pipeline(**{'processors': 'tokenize,ner', 'models_dir': TEST_MODELS_DIR, 'lang': 'sl'})
    doc = nlp(SL_DOC)
    assert SL_DOC_GOLD == doc.conll_file.conll_as_string()

def _check_model(self, lang):
    """ Check if model exists, is loaded, and load it if needed. """
    if lang not in self.models:
        lang_code = self._lang2modelname(lang)
        self.models[lang] = classla.Pipeline(lang=lang_code,
                                             models_dir=os.path.join(FILE_PATH, 'classla', 'models'))

def test_part_of_speech():
    nlp = classla.Pipeline(**{'processors': 'tokenize,pos', 'dir': TEST_MODELS_DIR, 'lang': 'sl'})
    doc = nlp(SL_DOC)
    assert SL_DOC_GOLD == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])

def test_parser():
    nlp = classla.Pipeline(**{'processors': 'tokenize,pos,lemma,depparse', 'dir': TEST_MODELS_DIR, 'lang': 'sl'})
    doc = nlp(SL_DOC)
    assert SL_DOC_GOLD == doc.to_conll()

def test_parser():
    nlp = classla.Pipeline(**{'processors': 'tokenize,pos,lemma,depparse', 'models_dir': TEST_MODELS_DIR, 'lang': 'en'})
    doc = nlp(EN_DOC)
    assert EN_DOC_GOLD == '\n\n'.join([sent.dependencies_string() for sent in doc.sentences])

def test_depparse_with_pretagged_doc():
    nlp = classla.Pipeline(**{'processors': 'depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en', 'depparse_pretagged': True})
    doc, metasentences = CoNLL.conll2dict(input_file=EN_DOC_CONLLU_PRETAGGED)
    doc = classla.Document(doc, metasentences=metasentences)
    processed_doc = nlp(doc)
    assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join([sent.dependencies_string() for sent in processed_doc.sentences])

def test_ner():
    nlp = classla.Pipeline(**{'processors': 'tokenize,ner', 'dir': TEST_MODELS_DIR, 'lang': 'en', 'logging_level': 'error'})
    doc = nlp(EN_DOC)
    assert EN_DOC_GOLD == '\n'.join([ent.pretty_print() for ent in doc.ents])

def test_full_lemmatizer():
    nlp = classla.Pipeline(**{'processors': 'tokenize,pos,lemma', 'models_dir': TEST_MODELS_DIR, 'lang': 'sl'})
    doc = nlp(SL_DOC)
    assert SL_DOC_LEMMATIZER_MODEL_GOLD == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])

def test_identity_lemmatizer():
    nlp = classla.Pipeline(**{'processors': 'tokenize,lemma', 'models_dir': TEST_MODELS_DIR, 'lang': 'sl', 'lemma_use_identity': True})
    doc = nlp(SL_DOC)
    assert SL_DOC_IDENTITY_GOLD == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])

def test_full_lemmatizer():
    nlp = classla.Pipeline(**{'processors': 'tokenize,pos,lemma', 'dir': TEST_MODELS_DIR, 'lang': 'en'})
    doc = nlp(EN_DOC)
    word_lemma_pairs = []
    for w in doc.iter_words():
        word_lemma_pairs += [f"{w.text} {w.lemma}"]
    assert EN_DOC_LEMMATIZER_MODEL_GOLD == "\n".join(word_lemma_pairs)

def test_tokenize_ssplit_robustness():
    nlp = classla.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en')
    doc = nlp(EN_DOC_WITH_EXTRA_WHITESPACE)
    assert EN_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char:token._end_char] == token.text
                for sent in doc.sentences for token in sent.tokens])

def test_ner():
    nlp = classla.Pipeline(**{'processors': 'tokenize,ner', 'dir': TEST_MODELS_DIR, 'lang': 'sl'})
    doc = nlp(SL_DOC)
    gold = SL_DOC_GOLD.split('\n')
    me = doc.to_conll().split('\n')
    assert len(gold) == len(me)
    for g, m in zip(gold, me):
        assert g == m

def classlaSerbian():
    color_print('\nClassla - srpski', color='blue', bold=True, underline=True)
    f = open("federer_srb2.txt", "r", encoding="utf8")
    text = f.read()
    nlp = classla.Pipeline('sr')
    doc = nlp(text)
    # column headers (Serbian): "Prepoznati entitet" = recognized entity, "Tip entiteta" = entity type
    tableEnt = PrettyTable(["Prepoznati entitet", "Tip entiteta"])
    for sentence in doc.sentences:
        for word in sentence.tokens:
            if word.ner != "O":
                tableEnt.add_row([word.text, word.ner])
    print(tableEnt)

def __init__(self, use_gpu=True):
    self.use_gpu = use_gpu
    # Set up classla pipeline
    self.classla_pipeline = classla.Pipeline('sl', pos_use_lexicon=True, use_gpu=use_gpu)
    self.eu_term_annotator = EUTermAnnotator()
    self.doc_classifier = DocClassifier()
    self.meta_fields = ['language', 'date', 'title', 'type', 'entype']

def test_identity_lemmatizer():
    nlp = classla.Pipeline(**{'processors': 'tokenize,lemma', 'dir': TEST_MODELS_DIR, 'lang': 'en', 'lemma_use_identity': True})
    doc = nlp(EN_DOC)
    word_lemma_pairs = []
    for w in doc.iter_words():
        word_lemma_pairs += [f"{w.text} {w.lemma}"]
    assert EN_DOC_IDENTITY_GOLD == "\n".join(word_lemma_pairs)

def slo_preprocessing(dataset, remove_stopwords=True, do_lemmatization=True):
    # do base processing
    dataset['preprocessed'] = dataset['Text'].apply(base_preprocessing)
    # create pipelines
    tokenizer = classla.Pipeline('sl', processors='tokenize', type='nonstandard', logging_level='WARN')
    lemmatizer = classla.Pipeline('sl', processors='tokenize, lemma', type='nonstandard', logging_level='WARN')
    # do tokenization
    documents = '\n'.join(dataset['preprocessed'].values)
    out_docs = tokenizer(documents)
    for i, sentence in enumerate(out_docs.sentences):
        # print("DOCUMENT")
        seq = []
        for word in sentence.words:
            if not remove_stopwords or word.text not in slo_stopwords:
                seq.append(word.text)
        dataset.at[i, 'preprocessed'] = ' '.join(seq)
    # do lemmatization
    if do_lemmatization:
        documents = '\n'.join(dataset['preprocessed'].values)
        out_docs = lemmatizer(documents)
        for i, sentence in enumerate(out_docs.sentences):
            dataset.at[i, 'preprocessed'] = ' '.join(word.lemma for word in sentence.words)
    return dataset

def test_jieba():
    nlp = classla.Pipeline(lang='zh', dir=TEST_MODELS_DIR, processors={'tokenize': 'jieba'}, package=None)
    doc = nlp(ZH_DOC)
    assert "JiebaTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert ZH_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char:token._end_char] == token.text
                for sent in doc.sentences for token in sent.tokens])

def test_mwt():
    pipeline = classla.Pipeline(processors='tokenize,mwt', dir=TEST_MODELS_DIR, lang='fr')
    doc = pipeline(FR_MWT_SENTENCE)
    token_to_words = "\n".join([
        f'token: {token.text.ljust(9)}\t\twords: [{", ".join([word.pretty_print() for word in token.words])}]'
        for sent in doc.sentences for token in sent.tokens
    ]).strip()
    word_to_token = "\n".join([
        f'word: {word.text.ljust(9)}\t\ttoken parent:{"-".join([str(x) for x in word.parent.id])}-{word.parent.text}'
        for sent in doc.sentences for word in sent.words
    ]).strip()
    assert token_to_words == FR_MWT_TOKEN_TO_WORDS_GOLD
    assert word_to_token == FR_MWT_WORD_TO_TOKEN_GOLD

def test_mwt():
    pipeline = classla.Pipeline(processors='tokenize,mwt', models_dir=TEST_MODELS_DIR, lang='fr')
    doc = pipeline(FR_MWT_SENTENCE)
    token_to_words = "\n".join([
        f'token: {token.text.ljust(9)}\t\twords: {token.words}'
        for sent in doc.sentences for token in sent.tokens
    ]).strip()
    word_to_token = "\n".join([
        f'word: {word.text.ljust(9)}\t\ttoken parent:{word.parent_token.index+"-"+word.parent_token.text}'
        for sent in doc.sentences for word in sent.words
    ]).strip()
    assert token_to_words == FR_MWT_TOKEN_TO_WORDS_GOLD
    assert word_to_token == FR_MWT_WORD_TO_TOKEN_GOLD

def test_spacy():
    nlp = classla.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en', tokenize_with_spacy=True)
    doc = nlp(EN_DOC)
    # make sure the loaded tokenizer is actually spacy
    assert "SpacyTokenizer" == nlp.processors['tokenize']._variant.__class__.__name__
    assert EN_DOC_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    assert all([doc.text[token._start_char:token._end_char] == token.text
                for sent in doc.sentences for token in sent.tokens])

def test_no_ssplit():
    nlp = classla.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'en', 'tokenize_no_ssplit': True})
    doc = nlp(EN_DOC_NO_SSPLIT)
    assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words] for s in doc.sentences]
    assert all([doc.text[token._start_char:token._end_char] == token.text
                for sent in doc.sentences for token in sent.tokens])

def annotate():
    global pipelineCache
    properties = request.args.get('properties', '')
    lang = request.args.get('pipelineLanguage', '')
    text = list(request.form.keys())[0]
    if lang not in pipelineCache:
        pipelineCache[lang] = classla.Pipeline(lang=lang, use_gpu=False)
    res = pipelineCache[lang](text)
    annotated_sentences = []
    for sentence in res.sentences:
        tokens = []
        deps = []
        for word in sentence.words:
            tokens.append({
                'index': word.id,
                'word': word.text,
                'lemma': word.lemma,
                'pos': word.xpos,
                'upos': word.upos,
                'feats': word.feats,
                'ner': word.parent.ner if word.parent.ner is None or word.parent.ner == 'O' else word.parent.ner[2:]
            })
            deps.append({
                'dep': word.deprel,
                'governor': word.head,
                'governorGloss': sentence.words[word.head - 1].text,
                'dependent': word.id,
                'dependentGloss': word.text
            })
        annotated_sentences.append({
            'basicDependencies': deps,
            'tokens': tokens
        })
    return json.dumps({'sentences': annotated_sentences})

def test_sl_pretokenized_conllu():
    classla.download('sl', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('sl', tokenize_pretokenized='conllu', dir=TEST_MODELS_DIR)
    # CoNLL-U columns are tab-separated
    conllu_pretokenized = """
# newpar id = 1
# sent_id = 1.1
# text = France Prešeren je rojen v Vrbi.
1	France	France	_	_	_	_	_	_	_
2	Prešeren	Prešeren	_	_	_	_	_	_	_
3	je	biti	_	_	_	_	_	_	_
4	rojen	rojen	_	_	_	_	_	_	_
5	v	v	_	_	_	_	_	_	_
6	Vrbi	Vrba	_	_	_	_	_	_	SpaceAfter=No
7	.	.	_	_	_	_	_	_	_
"""
    doc = nlp(conllu_pretokenized)
    assert doc.to_conll().strip() == SL_STANDARD_CONLL

def lemmatize_wordlist(input_path, output_path):
    """ Lemmatization of a given word list

    Args:
        input_path (string): input path to a word list
        output_path (string): output path to a word list
    """
    delete_if_exists(output_path)
    print('Lemmatizing ' + str(input_path) + ' in Croatian language.')
    with open(input_path, 'r', encoding='utf-8') as csv_read, \
         open(output_path, 'a', encoding='utf-8') as csv_write:
        csv_reader = csv.reader(csv_read, delimiter=' ')
        csv_writer = csv.writer(csv_write, delimiter=' ', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
        # Classla processor
        nlp = classla.Pipeline(lang='hr', processors='lemma, tokenize, pos', use_gpu=False)
        print('Lemmatization of ' + input_path + ' started...')
        for row in csv_reader:
            row_word = row[0]
            word_weight = row[1]
            expression = nlp(row_word)
            # Change the word into its lemmatized form
            lem_word = [word.lemma for sent in expression.sentences for word in sent.words]
            lem_word = ('').join(lem_word)
            csv_writer.writerow([lem_word, word_weight])
    print('Lemmatized file saved at: ' + output_path)

def main():
    xml_directory = sys.argv[1]
    out_directory = sys.argv[2]
    sentences = []
    for filename in glob.glob(xml_directory + '/xml/cet_*xml'):
        sentences.extend(get_phrases(filename))
    nlp = classla.Pipeline('zh', processors='tokenize')
    snippets = []
    for sentence in sentences:
        doc = nlp(sentence.text)
        text = " ".join(" ".join(token.text for token in sentence.tokens) for sentence in doc.sentences)
        snippets.append(sentence.sentiment + " " + text)
    print("Found {} phrases".format(len(snippets)))
    random.seed(1000)
    random.shuffle(snippets)
    process_utils.write_splits(out_directory,
                               snippets,
                               (process_utils.Split("train.txt", 0.8),
                                process_utils.Split("dev.txt", 0.1),
                                process_utils.Split("test.tmp", 0.1)))