def test_pretokenized():
    nlp = classla.Pipeline(**{'processors': 'tokenize', 'models_dir': '.', 'lang': 'en',
                                  'tokenize_pretokenized': True})
    doc = nlp(EN_DOC_PRETOKENIZED)
    assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    doc = nlp(EN_DOC_PRETOKENIZED_LIST)
    assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
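For reference, a minimal sketch of the two input forms that tokenize_pretokenized=True accepts (illustrative values, not the actual test fixtures defined in the test module): a plain string with whitespace-separated tokens and one sentence per line, or a list of sentences given as lists of tokens.

EN_DOC_PRETOKENIZED = 'Unban Mox Opal !\nUnban Mox Opal !\n'
EN_DOC_PRETOKENIZED_LIST = [['Unban', 'Mox', 'Opal', '!'],
                            ['Unban', 'Mox', 'Opal', '!']]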
Example #2
def test_pretokenized():
    nlp = classla.Pipeline(**{'processors': 'tokenize', 'dir': TEST_MODELS_DIR, 'lang': 'sl',
                                  'tokenize_pretokenized': True})
    doc = nlp(SL_DOC_PRETOKENIZED)
    assert SL_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
    doc = nlp(SL_DOC_PRETOKENIZED_LIST)
    assert SL_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join([sent.tokens_string() for sent in doc.sentences])
Example #3
def main():
    nlp = classla.Pipeline('en', processors='tokenize,pos,lemma,depparse')

    doc = nlp('Unban Mox Opal! Unban Mox Opal!')
    #print(doc.sentences[0].dependencies)
    print(doc)
    print(process_doc(doc, "{}=source >obj=zzz {}=target"))
Example #4
def test_tokenize():
    nlp = classla.Pipeline(processors='tokenize',
                           models_dir=TEST_MODELS_DIR,
                           lang='sl')
    doc = nlp(SL_DOC)
    assert SL_DOC_GOLD_TOKENS == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
Example #5
def test_missing_requirements():
    """
    Try to build several pipelines with bad configs and check thrown exceptions against gold exceptions.
    :return: None
    """
    # list of (bad configs, list of gold ProcessorRequirementsExceptions that should be thrown) pairs
    bad_config_lists = [
        # missing tokenize
        (
            # input config
            {
                'processors': 'pos,depparse',
                'models_dir': TEST_MODELS_DIR,
                'lang': 'en'
            },
            # 2 expected exceptions
            [{
                'processor_type': 'POSProcessor',
                'processors_list': ['pos', 'depparse'],
                'provided_reqs': set([]),
                'requires': set(['tokenize'])
            }, {
                'processor_type': 'DepparseProcessor',
                'processors_list': ['pos', 'depparse'],
                'provided_reqs': set([]),
                'requires': set(['tokenize', 'pos'])
            }]),
        # no pos when lemma_pos set to True
        (
            # input config
            {
                'processors': 'tokenize,mwt,lemma',
                'models_dir': TEST_MODELS_DIR,
                'lang': 'en',
                'lemma_pos': True
            },
            # 1 expected exception
            [{
                'processor_type': 'LemmaProcessor',
                'processors_list': ['tokenize', 'mwt', 'lemma'],
                'provided_reqs': set(['tokenize', 'mwt']),
                'requires': set(['tokenize', 'pos'])
            }])
    ]
    # try to build each bad config, catch exceptions, check against gold
    pipeline_fails = 0
    for bad_config, gold_exceptions in bad_config_lists:
        try:
            classla.Pipeline(**bad_config)
        except PipelineRequirementsException as e:
            pipeline_fails += 1
            assert isinstance(e, PipelineRequirementsException)
            assert len(e.processor_req_fails) == len(gold_exceptions)
            for processor_req_e, gold_exception in zip(e.processor_req_fails,
                                                       gold_exceptions):
                # compare the thrown ProcessorRequirementsExceptions against gold
                check_exception_vals(processor_req_e, gold_exception)
    # check pipeline building failed twice
    assert pipeline_fails == 2
Example #6
def test_ner():
    nlp = classla.Pipeline(**{
        'processors': 'tokenize,ner',
        'models_dir': TEST_MODELS_DIR,
        'lang': 'sl'
    })
    doc = nlp(SL_DOC)
    assert SL_DOC_GOLD == doc.conll_file.conll_as_string()
Example #7
    def _check_model(self, lang):
        """ Check if model exists, is loaded, and load it if needed. """
        if lang not in self.models:
            lang_code = self._lang2modelname(lang)
            self.models[lang] = classla.Pipeline(lang=lang_code,
                                                 models_dir=os.path.join(
                                                     FILE_PATH, 'classla',
                                                     'models'))
Example #8
def test_part_of_speech():
    nlp = classla.Pipeline(**{
        'processors': 'tokenize,pos',
        'dir': TEST_MODELS_DIR,
        'lang': 'sl'
    })
    doc = nlp(SL_DOC)
    assert SL_DOC_GOLD == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
Example #9
def test_parser():
    nlp = classla.Pipeline(
        **{
            'processors': 'tokenize,pos,lemma,depparse',
            'dir': TEST_MODELS_DIR,
            'lang': 'sl'
        })
    doc = nlp(SL_DOC)
    assert SL_DOC_GOLD == doc.to_conll()
Example #10
def test_parser():
    nlp = classla.Pipeline(
        **{
            'processors': 'tokenize,pos,lemma,depparse',
            'models_dir': TEST_MODELS_DIR,
            'lang': 'en'
        })
    doc = nlp(EN_DOC)
    assert EN_DOC_GOLD == '\n\n'.join(
        [sent.dependencies_string() for sent in doc.sentences])
Example #11
def test_depparse_with_pretagged_doc():
    nlp = classla.Pipeline(**{'processors': 'depparse', 'dir': TEST_MODELS_DIR, 'lang': 'en',
                                  'depparse_pretagged': True})

    doc, metasentences = CoNLL.conll2dict(input_file=EN_DOC_CONLLU_PRETAGGED)
    doc = classla.Document(doc, metasentences=metasentences)
    processed_doc = nlp(doc)

    assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join(
        [sent.dependencies_string() for sent in processed_doc.sentences])
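The input here is read from a CoNLL-U file. A sketch of the shape such a pretagged file takes (illustrative content, not the real EN_DOC_CONLLU_PRETAGGED fixture): the LEMMA, UPOS, XPOS and FEATS columns are already filled in, while HEAD and DEPREL are left for the parser.

# sent_id = 1
# text = Unban Mox Opal!
1	Unban	unban	VERB	VB	Mood=Imp|VerbForm=Fin	_	_	_	_
2	Mox	Mox	PROPN	NNP	Number=Sing	_	_	_	_
3	Opal	Opal	PROPN	NNP	Number=Sing	_	_	_	_
4	!	!	PUNCT	.	_	_	_	_	_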
Example #12
def test_ner():
    nlp = classla.Pipeline(
        **{
            'processors': 'tokenize,ner',
            'dir': TEST_MODELS_DIR,
            'lang': 'en',
            'logging_level': 'error'
        })
    doc = nlp(EN_DOC)
    assert EN_DOC_GOLD == '\n'.join([ent.pretty_print() for ent in doc.ents])
Example #13
def test_full_lemmatizer():
    nlp = classla.Pipeline(
        **{
            'processors': 'tokenize,pos,lemma',
            'models_dir': TEST_MODELS_DIR,
            'lang': 'sl'
        })
    doc = nlp(SL_DOC)
    assert SL_DOC_LEMMATIZER_MODEL_GOLD == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
Example #14
def test_identity_lemmatizer():
    nlp = classla.Pipeline(
        **{
            'processors': 'tokenize,lemma',
            'models_dir': TEST_MODELS_DIR,
            'lang': 'sl',
            'lemma_use_identity': True
        })
    doc = nlp(SL_DOC)
    assert SL_DOC_IDENTITY_GOLD == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
Example #15
def test_full_lemmatizer():
    nlp = classla.Pipeline(**{
        'processors': 'tokenize,pos,lemma',
        'dir': TEST_MODELS_DIR,
        'lang': 'en'
    })
    doc = nlp(EN_DOC)
    word_lemma_pairs = []
    for w in doc.iter_words():
        word_lemma_pairs += [f"{w.text} {w.lemma}"]
    assert EN_DOC_LEMMATIZER_MODEL_GOLD == "\n".join(word_lemma_pairs)
Example #16
def test_tokenize_ssplit_robustness():
    nlp = classla.Pipeline(processors='tokenize',
                           dir=TEST_MODELS_DIR,
                           lang='en')
    doc = nlp(EN_DOC_WITH_EXTRA_WHITESPACE)
    assert EN_DOC_GOLD_TOKENS == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
    assert all([
        doc.text[token._start_char:token._end_char] == token.text
        for sent in doc.sentences for token in sent.tokens
    ])
Example #17
def test_ner():
    nlp = classla.Pipeline(**{'processors': 'tokenize,ner', 'dir': TEST_MODELS_DIR, 'lang': 'sl'})
    doc = nlp(SL_DOC)

    gold = SL_DOC_GOLD.split('\n')
    me = doc.to_conll().split('\n')

    assert len(gold) == len(me)

    for g, m in zip(gold, me):
        assert g == m
Example #18
def classlaSerbian():
    color_print('\nClassla - srpski', color='blue', bold=True, underline=True)
    f = open("federer_srb2.txt", "r", encoding="utf8")
    text = f.read()
    nlp = classla.Pipeline('sr')
    doc = nlp(text)
    tableEnt = PrettyTable(["Prepoznati entitet", "Tip entiteta"])
    for sentence in doc.sentences:
        for word in sentence.tokens:
            if word.ner != "O":
                tableEnt.add_row([word.text, word.ner])
    print(tableEnt)
Example #19
    def __init__(self, use_gpu=True):
        self.use_gpu = use_gpu

        # Set up classla pipeline
        self.classla_pipeline = classla.Pipeline('sl',
                                                 pos_use_lexicon=True,
                                                 use_gpu=use_gpu)

        self.eu_term_annotator = EUTermAnnotator()
        self.doc_classifier = DocClassifier()

        self.meta_fields = ['language', 'date', 'title', 'type', 'entype']
Example #20
def test_identity_lemmatizer():
    nlp = classla.Pipeline(
        **{
            'processors': 'tokenize,lemma',
            'dir': TEST_MODELS_DIR,
            'lang': 'en',
            'lemma_use_identity': True
        })
    doc = nlp(EN_DOC)
    word_lemma_pairs = []
    for w in doc.iter_words():
        word_lemma_pairs += [f"{w.text} {w.lemma}"]
    assert EN_DOC_IDENTITY_GOLD == "\n".join(word_lemma_pairs)
Example #21
def slo_preprocessing(dataset, remove_stopwords=True, do_lemmatization=True):

    # do base proccesing
    dataset['preprocessed'] = dataset['Text'].apply(base_preprocessing)

    # create pipelines
    tokenizer = classla.Pipeline('sl',
                                 processors='tokenize',
                                 type='nonstandard',
                                 logging_level='WARN')
    lemmatizer = classla.Pipeline('sl',
                                  processors='tokenize, lemma',
                                  type='nonstandard',
                                  logging_level='WARN')

    # do tokenization
    documents = '\n'.join(dataset['preprocessed'].values)
    out_docs = tokenizer(documents)

    for i, sentence in enumerate(out_docs.sentences):
        #print("DOCUMENT")
        seq = []
        for word in sentence.words:
            if not remove_stopwords or word.text not in slo_stopwords:
                seq.append(word.text)

        dataset.at[i, 'preprocessed'] = ' '.join(seq)

    # do lemmatization
    if do_lemmatization:
        documents = '\n'.join(dataset['preprocessed'].values)
        out_docs = lemmatizer(documents)

        for i, sentence in enumerate(out_docs.sentences):
            dataset.at[i,
                       'preprocessed'] = ' '.join(word.lemma
                                                  for word in sentence.words)

    return dataset
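A hypothetical usage sketch (the 'Text' column name comes from the function above; base_preprocessing and slo_stopwords are assumed to be defined elsewhere in the same module, and the Slovenian nonstandard models are assumed to be downloaded already):

import pandas as pd

df = pd.DataFrame({'Text': ['Danes je res lep dan.', 'jutri gremo na morje :)']})
df = slo_preprocessing(df, remove_stopwords=True, do_lemmatization=True)
print(df['preprocessed'].tolist())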
Example #22
def test_jieba():
    nlp = classla.Pipeline(lang='zh',
                           dir=TEST_MODELS_DIR,
                           processors={'tokenize': 'jieba'},
                           package=None)
    doc = nlp(ZH_DOC)

    assert "JiebaTokenizer" == nlp.processors[
        'tokenize']._variant.__class__.__name__
    assert ZH_DOC_GOLD_TOKENS == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
    assert all([
        doc.text[token._start_char:token._end_char] == token.text
        for sent in doc.sentences for token in sent.tokens
    ])
Example #23
def test_mwt():
    pipeline = classla.Pipeline(processors='tokenize,mwt',
                                dir=TEST_MODELS_DIR,
                                lang='fr')
    doc = pipeline(FR_MWT_SENTENCE)
    token_to_words = "\n".join([
        f'token: {token.text.ljust(9)}\t\twords: [{", ".join([word.pretty_print() for word in token.words])}]'
        for sent in doc.sentences for token in sent.tokens
    ]).strip()
    word_to_token = "\n".join([
        f'word: {word.text.ljust(9)}\t\ttoken parent:{"-".join([str(x) for x in word.parent.id])}-{word.parent.text}'
        for sent in doc.sentences for word in sent.words
    ]).strip()
    assert token_to_words == FR_MWT_TOKEN_TO_WORDS_GOLD
    assert word_to_token == FR_MWT_WORD_TO_TOKEN_GOLD
Example #24
def test_mwt():
    pipeline = classla.Pipeline(processors='tokenize,mwt',
                                models_dir=TEST_MODELS_DIR,
                                lang='fr')
    doc = pipeline(FR_MWT_SENTENCE)
    token_to_words = "\n".join([
        f'token: {token.text.ljust(9)}\t\twords: {token.words}'
        for sent in doc.sentences for token in sent.tokens
    ]).strip()
    word_to_token = "\n".join([
        f'word: {word.text.ljust(9)}\t\ttoken parent:{word.parent_token.index+"-"+word.parent_token.text}'
        for sent in doc.sentences for word in sent.words
    ]).strip()
    assert token_to_words == FR_MWT_TOKEN_TO_WORDS_GOLD
    assert word_to_token == FR_MWT_WORD_TO_TOKEN_GOLD
Example #25
def test_spacy():
    nlp = classla.Pipeline(processors='tokenize',
                           dir=TEST_MODELS_DIR,
                           lang='en',
                           tokenize_with_spacy=True)
    doc = nlp(EN_DOC)

    # make sure the loaded tokenizer is actually spacy
    assert "SpacyTokenizer" == nlp.processors[
        'tokenize']._variant.__class__.__name__
    assert EN_DOC_GOLD_TOKENS == '\n\n'.join(
        [sent.tokens_string() for sent in doc.sentences])
    assert all([
        doc.text[token._start_char:token._end_char] == token.text
        for sent in doc.sentences for token in sent.tokens
    ])
Example #26
def test_no_ssplit():
    nlp = classla.Pipeline(
        **{
            'processors': 'tokenize',
            'dir': TEST_MODELS_DIR,
            'lang': 'en',
            'tokenize_no_ssplit': True
        })

    doc = nlp(EN_DOC_NO_SSPLIT)
    assert EN_DOC_NO_SSPLIT_SENTENCES == [[w.text for w in s.words]
                                          for s in doc.sentences]
    assert all([
        doc.text[token._start_char:token._end_char] == token.text
        for sent in doc.sentences for token in sent.tokens
    ])
Example #27
def annotate():
    global pipelineCache

    properties = request.args.get('properties', '')
    lang = request.args.get('pipelineLanguage', '')
    text = list(request.form.keys())[0]

    if lang not in pipelineCache:
        pipelineCache[lang] = classla.Pipeline(lang=lang, use_gpu=False)

    res = pipelineCache[lang](text)

    annotated_sentences = []
    for sentence in res.sentences:
        tokens = []
        deps = []
        for word in sentence.words:
            tokens.append({
                'index': word.id,
                'word': word.text,
                'lemma': word.lemma,
                'pos': word.xpos,
                'upos': word.upos,
                'feats': word.feats,
                'ner': word.parent.ner if word.parent.ner is None
                       or word.parent.ner == 'O' else word.parent.ner[2:]
            })
            deps.append({
                'dep': word.deprel,
                'governor': word.head,
                'governorGloss': sentence.words[word.head - 1].text,
                'dependent': word.id,
                'dependentGloss': word.text
            })
        annotated_sentences.append({
            'basicDependencies': deps,
            'tokens': tokens
        })

    return json.dumps({'sentences': annotated_sentences})
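A minimal sketch of the scaffolding this handler assumes (the route path, port, and everything other than pipelineCache are assumptions, not taken from the original source): a Flask app, a module-level pipeline cache, and the handler registered on a route that accepts form data.

import json

import classla
from flask import Flask, request

app = Flask(__name__)
pipelineCache = {}

app.add_url_rule('/annotate', 'annotate', annotate, methods=['POST'])

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=9000)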
Example #28
def test_sl_pretokenized_conllu():
    classla.download('sl', dir=TEST_MODELS_DIR)
    nlp = classla.Pipeline('sl', tokenize_pretokenized='conllu', dir=TEST_MODELS_DIR)
    conllu_pretokenized = """
# newpar id = 1
# sent_id = 1.1
# text = France Prešeren je rojen v Vrbi.
1	France	France	_	_	_	_	_	_	_
2	Prešeren	Prešeren	_	_	_	_	_	_	_
3	je	biti	_	_	_	_	_	_	_
4	rojen	rojen	_	_	_	_	_	_	_
5	v	v	_	_	_	_	_	_	_
6	Vrbi	Vrba	_	_	_	_	_	_	SpaceAfter=No
7	.	.	_	_	_	_	_	_	_

"""
    doc = nlp(conllu_pretokenized)
    assert doc.to_conll().strip() == SL_STANDARD_CONLL
Example #29
def lemmatize_wordlist(input_path, output_path):
    """
    Lemmatization of a given word list

    Args:
        input_path (string): input path to a word list
        output_path (string): output path to a word list
    """
    delete_if_exists(output_path)

    print('Lemmatizing ' + str(input_path) + ' in Croatian language.')

    with open(input_path, 'r', encoding = 'utf-8') as csv_read, \
        open(output_path, 'a', encoding = 'utf-8') as csv_write:

        csv_reader = csv.reader(csv_read, delimiter = ' ')
        csv_writer = csv.writer(csv_write,
                                delimiter=' ',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator='\n')

        # Classla processor
        nlp = classla.Pipeline(lang='hr', processors='lemma, tokenize, pos', use_gpu=False)

        print('Lemmatization of ' + input_path + ' started...')

        for row in csv_reader:

            row_word = row[0]
            word_weight = row[1]

            expression = nlp(row_word)

            # Change the word into its lemmatized form
            lem_word = [word.lemma for sent in expression.sentences for word in sent.words]
            lem_word = ''.join(lem_word)

            csv_writer.writerow([lem_word, word_weight])
        
    print('Lemmatized file saved at: ' + output_path)
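A hypothetical call (file names are placeholders; the input is expected to hold space-separated word/weight pairs, one per line, and delete_if_exists is assumed to be a small helper defined elsewhere in the module):

lemmatize_wordlist('frequency_list_hr.txt', 'frequency_list_hr_lemmatized.txt')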
Example #30
def main():
    xml_directory = sys.argv[1]
    out_directory = sys.argv[2]
    sentences = []
    for filename in glob.glob(xml_directory + '/xml/cet_*xml'):
        sentences.extend(get_phrases(filename))

    nlp = classla.Pipeline('zh', processors='tokenize')
    snippets = []
    for sentence in sentences:
        doc = nlp(sentence.text)
        text = " ".join(" ".join(token.text for token in sentence.tokens)
                        for sentence in doc.sentences)
        snippets.append(sentence.sentiment + " " + text)

    print("Found {} phrases".format(len(snippets)))
    random.seed(1000)
    random.shuffle(snippets)
    process_utils.write_splits(out_directory, snippets, (process_utils.Split(
        "train.txt", 0.8), process_utils.Split(
            "dev.txt", 0.1), process_utils.Split("test.tmp", 0.1)))