def __init__(
    self,
    vocab: Vocab,
    name: str = "morphologizer",
    *,
    overwrite_lemma: bool = False,
) -> None:
    super().__init__()
    self.name = name
    self.vocab = vocab
    self.voikko = libvoikko.Voikko("fi")
    self.lookups = Lookups()
    self.overwrite_lemma = overwrite_lemma
    self.aux_labels = [vocab.strings.add(x) for x in ["aux", "aux:pass"]]
    self.cop_labels = [vocab.strings.add(x) for x in ["cop", "cop:own"]]
    self.nsubj_labels = [vocab.strings.add(x) for x in ["nsubj", "nsubj:cop"]]
    self.ccomp_labels = [
        vocab.strings.add(x)
        for x in ["csubj", "csubj:cop", "xcomp", "xcomp:ds"]
    ]
    self.relcl_labels = [vocab.strings.add(x) for x in ["acl:relcl", "ccomp"]]
    self.foreign_tag = vocab.strings.add('Foreign')
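# --- Illustrative sketch, not from the source above ---
# The label lists in the constructor store string *hashes* returned by
# vocab.strings.add(), so later dependency checks can compare token.dep
# (an integer) against the list without re-hashing strings. A minimal,
# hypothetical demonstration of that pattern:
import spacy

nlp = spacy.blank("fi")
aux_labels = [nlp.vocab.strings.add(x) for x in ["aux", "aux:pass"]]

doc = nlp("Olen lukenut kirjan.")
for token in doc:
    # token.dep is the integer hash of the dependency label, so this is a
    # cheap integer membership test. (A blank pipeline assigns no
    # dependencies, so nothing prints here; it is the pattern that matters.)
    if token.dep in aux_labels:
        print(token.text, "is an auxiliary")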
def main():
    assert os.environ.get('LASER'), 'Please set the environment variable LASER'

    voikko = libvoikko.Voikko('fi')
    args = parse_args()
    hyperparameters = Hyperparameters(args.hyperparameters)
    if args.fast:
        hyperparameters.set_logreg()

    tasks = [
        TDTCategoryClassificationTask('TDT categories', 'data/UD_Finnish-TDT',
                                      use_dev_set=args.dev_set,
                                      verbose=args.verbose),
        OpusparcusTask('Opusparcus', 'data/opusparcus/opusparcus_v1',
                       use_dev_set=args.dev_set, verbose=args.verbose),
        YlilautaConsecutiveSentencesTask('Ylilauta', 'data/ylilauta',
                                         use_dev_set=args.dev_set,
                                         verbose=args.verbose),
        EduskuntaVKKClassificationTask('Eduskunta-VKK', 'data/eduskunta-vkk',
                                       use_dev_set=args.dev_set,
                                       verbose=args.verbose),
    ]
    models = [
        model_tfidf(voikko),
        model_w2v(),
        model_fasttext(),
        model_sif(),
        model_borep(),
        model_finbert(),
        model_laser(os.environ['LASER'], args.verbose),
    ]

    print(f'Running evaluation on {len(tasks)} tasks and {len(models)} models')

    scores = []
    for k in range(args.num_trials):
        if args.num_trials > 1:
            print(f'Trial {k+1}/{args.num_trials}')
        scores.append(evaluate_models(models, tasks, hyperparameters))

    save_scores(scores, args.resultdir)
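# --- Illustrative sketch, not from the source above ---
# A parse_args() consistent with the attribute accesses in main()
# (args.hyperparameters, args.fast, args.dev_set, args.verbose,
# args.num_trials, args.resultdir). The flag names and defaults are
# hypothetical; the real implementation may differ.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        description='Evaluate Finnish sentence embedding models')
    parser.add_argument('hyperparameters',
                        help='path to a hyperparameter JSON file')
    parser.add_argument('--fast', action='store_true',
                        help='use a faster logistic regression classifier')
    parser.add_argument('--dev-set', dest='dev_set', action='store_true',
                        help='evaluate on the development set')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--num-trials', dest='num_trials', type=int, default=1)
    parser.add_argument('--resultdir', default='results')
    return parser.parse_args()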
def __init__(self):
    self.name = 'Voikko'
    self.voikko = libvoikko.Voikko('fi')
    # Map Voikko word classes to Universal Dependencies POS tags.
    self.tag_map = {
        'nimisana': 'NOUN',
        'laatusana': 'ADJ',
        'nimisana_laatusana': 'ADJ',
        'teonsana': 'VERB',
        'seikkasana': 'ADV',
        'asemosana': 'PRON',
        'suhdesana': 'ADP',
        'huudahdussana': 'INTJ',
        'sidesana': 'CCONJ',
        'etunimi': 'PROPN',
        'sukunimi': 'PROPN',
        'paikannimi': 'PROPN',
        'nimi': 'PROPN',
        'kieltosana': 'AUX',
        'lyhenne': 'ADV',
        'lukusana': 'NUM',
        'etuliite': 'X',
    }
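# --- Illustrative sketch, not from the source above ---
# How a tag_map like the one above can translate Voikko's word classes
# into UPOS tags: analyze() returns a list of analyses, each a dict whose
# 'CLASS' key holds the Voikko word class (e.g. 'nimisana' for nouns).
# Falling back to 'X' for unmapped or unrecognized words is a hypothetical
# choice for this sketch, not necessarily what the class above does.
from voikko import libvoikko

v = libvoikko.Voikko("fi")
tag_map = {"nimisana": "NOUN", "teonsana": "VERB"}  # abbreviated

def upos(word):
    analyses = v.analyze(word)
    if not analyses:
        return "X"  # Voikko does not recognize the word
    return tag_map.get(analyses[0].get("CLASS"), "X")

print(upos("kissa"))   # expected: NOUN
print(upos("juosta"))  # expected: VERB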
from math import inf
from typing import Callable, List, Optional, Tuple

import cairocffi as cairo
import numpy as np
import pangocairocffi as pangocairo
import pangocffi as pango
from tqdm.cli import tqdm
from voikko import libvoikko

from .document import (Chapter, DocumentObj, Eval, Paragraph, Subenvironment,
                       Table, VSpace, fixMarkup, stripMarkup)
from .params import Parameters

# Voikko is optional: fall back to None if the Finnish dictionary is not
# installed. (Catch Exception rather than using a bare except clause,
# which would also swallow KeyboardInterrupt and SystemExit.)
try:
    voikko = libvoikko.Voikko("fi")
except Exception:
    voikko = None

debug = False


def irange(a, b, s=1) -> range:
    """Inclusive range: like range(), but includes the endpoint b."""
    return range(a, b + 1 if s > 0 else b - 1, s)


FixXY = Callable[[float, float], Tuple[float, float]]


class Line:
    outline: Optional[Tuple[int, str]]
def __init__(self):
    self.name = 'FinnPos'
    self.voikko = libvoikko.Voikko('fi')
def __init__(self, lookups, *args, **kwargs):
    super(FinnishLemmatizer, self).__init__(lookups, *args, **kwargs)
    self.voikko = libvoikko.Voikko("fi")
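# --- Illustrative sketch, not from the source above ---
# How Voikko's analyze() output can back a lemmatizer like the class
# above: each analysis is a dict whose 'BASEFORM' entry holds the lemma.
# The fallback behaviour here is a hypothetical choice, not the actual
# FinnishLemmatizer logic.
from voikko import libvoikko

v = libvoikko.Voikko("fi")

def voikko_lemma(word):
    analyses = v.analyze(word)
    if analyses:
        # Take the first analysis; a real lemmatizer would disambiguate
        # using POS and morphological context.
        return analyses[0].get("BASEFORM", word)
    return word  # unknown to Voikko: fall back to the surface form

print(voikko_lemma("kissoja"))  # expected: kissa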
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import re
from collections import defaultdict

from voikko import libvoikko as lv
from voikko.inflect_word import inflect_word

from . import patternparser as pp

DICTIONARY = defaultdict(list)
voikko = lv.Voikko("fi-x-morpho")


def tokenize(text):
    tokens = []
    for token in voikko.tokens(text):
        if token.tokenType == lv.Token.WHITESPACE:
            continue
        # For hyphenated compounds, analyze only the last part and keep
        # the prefix for reattaching to the base form later.
        if "-" in token.tokenText:
            index = token.tokenText.rindex("-") + 1
            lastPart = token.tokenText[index:]
            baseformPrefix = token.tokenText[:index].lower()
        else:
            lastPart = token.tokenText
            baseformPrefix = ""
        alternatives = []
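# --- Illustrative sketch, not from the source above ---
# What voikko.tokens() yields for a short input, as assumed by tokenize():
# Token objects carrying a numeric tokenType (lv.Token.WORD, PUNCTUATION,
# WHITESPACE, ...) and the raw tokenText. Hyphenated compounds arrive as
# single WORD tokens, which is why tokenize() splits on the last hyphen
# itself.
from voikko import libvoikko as lv

v = lv.Voikko("fi")
for token in v.tokens("linja-auto saapuu."):
    if token.tokenType != lv.Token.WHITESPACE:
        # Prints the numeric token type and the text, e.g. a WORD token
        # 'linja-auto', a WORD token 'saapuu', a PUNCTUATION token '.'.
        print(token.tokenType, repr(token.tokenText))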
def tune():
    voikko = libvoikko.Voikko('fi')
    tasks = [
        TDTCategoryClassificationTask('TDT categories', 'data/UD_Finnish-TDT',
                                      use_dev_set=True),
        OpusparcusTask('Opusparcus', 'data/opusparcus/opusparcus_v1',
                       use_dev_set=True),
        YlilautaConsecutiveSentencesTask('Ylilauta', 'data/ylilauta',
                                         use_dev_set=True),
        EduskuntaVKKClassificationTask('Eduskunta-VKK', 'data/eduskunta-vkk',
                                       use_dev_set=True),
    ]

    def model_w2v():
        return PooledWord2Vec('Pooled word2vec',
                              'pretrained/fin-word2vec/fin-word2vec.bin')

    def model_fasttext():
        return PooledFastText('Pooled FastText',
                              'pretrained/fasttext-fi/cc.fi.300.bin')

    def model_finbert(layers):
        return Bert('FinBERT', 'TurkuNLP/bert-base-finnish-cased-v1', layers)

    def model_tfidf(min_df):
        return TfidfVectors('TF-IDF', voikko, int(min_df))

    def model_sif():
        return SIF('SIF', 'data/finnish_vocab/finnish_vocab.txt.gz',
                   'pretrained/fin-word2vec/fin-word2vec.bin')

    def model_borep():
        return BOREP('BOREP', 'pretrained/fin-word2vec/fin-word2vec.bin', 4096)

    def model_laser():
        return Laser('LASER', os.path.join(os.getcwd(), 'LASER'))

    evaluations = itertools.chain(
        evaluations_for_model(
            model_w2v, tasks, {
                'hidden_dim1': hp.quniform('hidden_dim1', 10, 300, 10),
                'dropout_prop': hp.uniform('dropout_prop', 0.2, 0.8),
            }),
        evaluations_for_model(
            model_fasttext, tasks, {
                'hidden_dim1': hp.quniform('hidden_dim1', 10, 300, 10),
                'dropout_prop': hp.uniform('dropout_prop', 0.2, 0.8),
            }),
        evaluations_for_model(
            model_finbert, tasks, {
                'hidden_dim1': hp.quniform('hidden_dim1', 30, 768, 10),
                'dropout_prop': hp.uniform('dropout_prop', 0.2, 0.8),
                'embedding_layers': hp.choice(
                    'embedding_layers',
                    [[-1], [-2], [-3], [-4], [-1, -2, -3, -4]]),
            }),
        evaluations_for_model(
            model_tfidf, tasks, {
                'hidden_dim1': hp.quniform('hidden_dim1', 30, 1000, 10),
                'dropout_prop': hp.uniform('dropout_prop', 0.2, 0.8),
                'embedding_min_df': hp.quniform('embedding_min_df', 2, 8, 2),
            }),
        evaluations_for_model(
            model_sif, tasks, {
                'hidden_dim1': hp.quniform('hidden_dim1', 10, 300, 10),
                'dropout_prop': hp.uniform('dropout_prop', 0.2, 0.8),
            }),
        evaluations_for_model(
            model_borep, tasks, {
                'hidden_dim1': hp.quniform('hidden_dim1', 30, 300, 10),
                'dropout_prop': hp.uniform('dropout_prop', 0.2, 0.8),
            }),
        evaluations_for_model(
            model_laser, tasks, {
                'hidden_dim1': hp.quniform('hidden_dim1', 30, 300, 10),
                'dropout_prop': hp.uniform('dropout_prop', 0.2, 0.8),
            }))

    os.makedirs('results', exist_ok=True)
    best_params = {}
    for kv in evaluations:
        task = kv['task']
        embedding_model = None
        X_train = None
        y_train = None
        X_test = None
        y_test = None

        def objective(params):
            nonlocal embedding_model, X_train, y_train, X_test, y_test

            (embedding_params, classifier_params) = \
                split_embedding_and_classifier_params(params)
            if embedding_params or embedding_model is None:
                if embedding_params:
                    print('Reinitializing the embedding model '
                          'because parameters have changed')
                builder = kv['embedding_model_builder']
                embedding_model = builder(**embedding_params)
                X_train, y_train, X_test, y_test = \
                    task.prepare_data(embedding_model)

            print(f'{embedding_model.name}, {task.name}')
            print(params)

            clf = task.train_classifier(X_train, y_train, classifier_params)
            return -task.compute_optimization_score(clf, X_test, y_test)

        trials = Trials()
        fmin_res = fmin(fn=objective,
                        space=kv['space'],
                        algo=tpe.suggest,
                        max_evals=50,
                        trials=trials)
        best = space_eval(kv['space'], fmin_res)
        best_score = -np.min(trials.losses())

        print(f'best score for {embedding_model.name} '
              f'in task {task.name}: {best_score}')
        print('parameters:')
        print(best)

        best_params.setdefault(task.name, {})[embedding_model.name] = \
            serialize_results(best, best_score)

    with open('results/hyperparameters.json', 'w') as f:
        json.dump(best_params, f, indent=2)
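# --- Illustrative sketch, not from the source above ---
# A self-contained toy run of the same hyperopt pattern used in tune():
# define a search space, minimize with TPE, then recover the concrete
# best parameters with space_eval(). The quadratic objective is a
# stand-in for the real train-and-score objective above.
from hyperopt import Trials, fmin, hp, space_eval, tpe

space = {'x': hp.uniform('x', -5.0, 5.0)}
trials = Trials()
fmin_res = fmin(fn=lambda params: (params['x'] - 2.0) ** 2,
                space=space,
                algo=tpe.suggest,
                max_evals=50,
                trials=trials)
best = space_eval(space, fmin_res)
print(best)                  # close to {'x': 2.0}
print(min(trials.losses()))  # best (lowest) objective value seen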
def __init__(self,
             vocab: Vocab,
             name: str = "lemmatizer",
             overwrite: bool = False) -> None:
    super().__init__(vocab, model=None, name=name, mode="voikko",
                     overwrite=overwrite)
    self.voikko = libvoikko.Voikko("fi")
# This discards suffixes and some classes of complex compounds
# GPL3 Copyright Théo Friberg 2018

from voikko import libvoikko
from bs4 import BeautifulSoup
import sys

# Read the Kaino dataset
f = open(sys.argv[1])
parsed = BeautifulSoup(f.read(), 'lxml')
f.close()

# Initialise voikko
v = libvoikko.Voikko('fi')

# Accumulate words into a set
words = set()

for word in parsed.find_all('s'):  # The s-tag in Kaino denotes a word
    s = word.string
    if s.lower() != s or "-" in s:
        # Discard suffixes and certain compounds
        continue
    # Analyse the word using Voikko; skip if Voikko gets confused
    analysis_ = v.analyze(s)
    if len(analysis_) == 0:
        continue