def parse(self, text):
    # Lazy load model file to speed up startup
    if not self.model:
        self.model = self.load_model()

    text = text.strip()

    # Adding a period improves detection on especially short sentences
    period_added = False
    last_character = text.strip()[-1]
    if re.match(r"\w", last_character, flags=re.UNICODE):
        text += "."
        period_added = True

    pipeline = Pipeline(
        self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
    )
    error = ProcessingError()
    processed = pipeline.process(text, error)
    if error.occurred():
        raise ParserException(error.message)

    # Remove the period to make sure input corresponds to output
    if period_added:
        processed = "\n".join(processed.rstrip().split("\n")[:-1]) + "\n\n"

    return processed
def __init__(self, model):
    """Create the UDPipe tool object."""
    self.model = model
    path = require_file(model)
    self.tool = Model.load(path)
    if not self.tool:
        raise IOError("Cannot load model from file '%s'" % path)
    self.error = ProcessingError()
    self.conllu_reader = ConlluReader()
    self.tokenizer = self.tool.newTokenizer(Model.DEFAULT)
def udpipe(conllu_in, model_path):
    model = Model.load(model_path)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % model_path)
        sys.exit(1)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    error = ProcessingError()

    # Process data
    processed = pipeline.process(conllu_in, error)
    if error.occurred():
        sys.stderr.write("An error occurred when running run_udpipe: ")
        sys.stderr.write(error.message)
        sys.stderr.write("\n")
        sys.exit(1)
    return processed
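# Hypothetical usage of the udpipe() helper above; the model file name and the
# minimal pre-tokenized CoNLL-U fragment are illustrative placeholders, not
# part of the original code.
conllu_fragment = (
    "# text = Hello world\n"
    "1\tHello\t_\t_\t_\t_\t_\t_\t_\t_\n"
    "2\tworld\t_\t_\t_\t_\t_\t_\t_\t_\n"
    "\n"
)
reparsed = udpipe(conllu_fragment, "english-ewt-ud-2.5.udpipe")
print(reparsed)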
def run_udpipe(self, path_to_model, sents=None):
    if sents is None:
        sents = self.sents
    verticals = self._to_vertical(sents)
    model = Model.load(path_to_model)
    pipeline = Pipeline(model, "vertical", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    conllu = pipeline.process(verticals, error)
    return conllu
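# The _to_vertical() helper used above is not shown. UDPipe's "vertical" input
# format is one token per line with a blank line after each sentence, so a
# hypothetical stand-in (assuming `sents` is a list of lists of token strings)
# might look like this sketch:
def _to_vertical(sents):
    lines = []
    for sent in sents:
        lines.extend(sent)   # one token per line
        lines.append("")     # blank line ends the sentence
    return "\n".join(lines) + "\n"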
def load_udpipe(self, filename):
    """Load UDPipe model for statistical parsing.

    UDPipe can be used as an extra information source for OOV symbols
    or for all tokens. It works best with sentence-based analysis;
    token-based analysis does not keep track of context.

    @param filename path to UDPipe model
    """
    if not can_udpipe:
        print("importing udpipe failed, cannot load the UDPipe model")
        return
    self.udpiper = Model.load(filename)
    # use pipeline for now, ugly but workable
    self.udpipeline = Pipeline(self.udpiper, 'horizontal',
                               Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    self.uderror = ProcessingError()
    ## udpipe is loaded
    self.can_udpipe = True
def extract(self, sentence: str) -> List[Dict[str, Any]]:
    processed = self.pipeline.process(sentence, self._error)
    if self._error.occurred():
        print(f"=== Error occurred: {self._error.message}")
        self._error = ProcessingError()
        return None
    else:
        conll_example = [ud_parse for sent_id, ud_parse in load_conllu(processed)][0]
        ppatt = PredPatt(conll_example, opts=self._opts)
        result = []
        for predicate in ppatt.instances:
            structure = {
                "predicate": predicate.tokens,
                "arguments": [x.tokens for x in predicate.arguments],
            }
            result.append(structure)
        return result
def main(args):
    model = Model.load(args.model)
    if not model:
        raise ValueError("Invalid model: '%s'" % args.model)
    os.makedirs(args.out_dir, exist_ok=True)
    pipeline = Pipeline(model, "tokenize" if args.txt else "conllu",
                        Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    for pattern in args.filenames:
        for in_file in glob(pattern) or [pattern]:
            basename = os.path.basename(in_file)
            out_file = os.path.join(args.out_dir, os.path.splitext(basename)[0] + ".conllu")
            error = ProcessingError()
            with open(in_file, encoding="utf-8") as f:
                processed = pipeline.process(f.read(), error)
            if error.occurred():
                raise RuntimeError(error.message)
            with open(out_file, "w", encoding="utf-8") as f:
                f.write(processed)
            if not args.quiet:
                print("Wrote '%s'" % out_file)
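# The original script's CLI definition is not shown. A hypothetical argparse
# setup matching the attributes main() relies on (model, filenames, out_dir,
# txt, quiet) could look like this sketch:
import argparse

def _build_arg_parser():
    p = argparse.ArgumentParser(description="Run UDPipe over text or CoNLL-U files.")
    p.add_argument("model", help="path to a .udpipe model file")
    p.add_argument("filenames", nargs="+", help="input files or glob patterns")
    p.add_argument("-o", "--out-dir", default=".", help="directory for .conllu output")
    p.add_argument("--txt", action="store_true", help="treat inputs as raw text (tokenize)")
    p.add_argument("-q", "--quiet", action="store_true", help="suppress progress messages")
    return p

# main(_build_arg_parser().parse_args())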
def load_dictionaries(self, data_folder, models_folder):
    self.lemmatizer.load()

    # Shared dictionary for the generative grammars
    #self.gg_dictionaries.load(os.path.join(models_folder, 'generative_grammar_dictionaries.bin'))

    #word2lemmas_path = os.path.join(data_folder, 'ru_word2lemma.tsv.gz')
    #self.lexicon.load(word2lemmas_path)

    #word2tags_path = os.path.join(data_folder, 'chatbot_word2tags.dat')
    #self.postagger.load(word2tags_path)
    self.postagger.load()

    self.word2tags.load()
    self.flexer.load()
    self.chunker.load()

    # Load the UDPipe dependency parser and the Russian-language model
    model_file = os.path.join(models_folder, 'udpipe_syntagrus.model')
    self.udpipe_model = Model.load(model_file)
    self.udpipe_pipeline = Pipeline(self.udpipe_model, 'tokenize',
                                    Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    self.udpipe_error = ProcessingError()
def _runApp(self, dto, opNotDone):
    text = dto.getText()
    tokenizer = self._model.newTokenizer(self._model.DEFAULT)
    tokenizer.setText(text)
    error = ProcessingError()
    sentence = Sentence()
    sid = 0

    while tokenizer.nextSentence(sentence, error):
        self._model.tag(sentence, self._model.DEFAULT)
        self._model.parse(sentence, self._model.DEFAULT)

        # Teprolin tokenized sentence
        ttsent = []
        # Teprolin string sentence
        tssent = sentence.getText()

        for w in sentence.words:
            if w.id == 0:
                continue

            tt = TeproTok()

            tt.setId(w.id)
            tt.setWordForm(w.form)
            tt.setCTAG(w.upostag)
            tt.setMSD(w.xpostag)
            tt.setLemma(w.lemma)
            tt.setHead(w.head)
            tt.setDepRel(w.deprel)

            ttsent.append(tt)
        # end for w

        if not dto.isOpPerformed(TeproAlgo.getSentenceSplittingOperName()):
            dto.addSentenceString(tssent)
            dto.addSentenceTokens(ttsent)
        else:
            # Check and update annotations that only TTL
            # can produce or that are requested specifically from it.
            alignment = dto.alignSentences(ttsent, sid)

            for op in opNotDone:
                dto.copyTokenAnnotation(ttsent, sid, alignment, op)

        sentence = Sentence()
        sid += 1
    # end all split sentences.

    return dto
def get_tokens(input, lang="en"):
    from io import StringIO
    from ufal.udpipe import ProcessingError

    error = ProcessingError()
    pipeline = Understanding._get_udpipe_pipeline(lang)
    processed = pipeline.process(input, error)
    if error.occurred():
        raise Exception(error.message)

    sio = StringIO(processed)
    # pandas frame is a bit overkill I guess
    # import pandas as pd
    # the first few rows are comments
    # df = pd.read_csv(sio, sep="\t", skiprows=3)
    # return df
    import csv
    rd = csv.reader(sio, delimiter="\t")
    nodes = []
    for row in rd:
        if len(row) < 8:
            continue
        nodes.append(Token(row))
    return nodes
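# The Token class consumed above is not shown; a minimal hypothetical sketch
# mapping the standard CoNLL-U columns (ID, FORM, LEMMA, UPOS, XPOS, FEATS,
# HEAD, DEPREL, ...) onto attributes could look like this:
class Token:
    def __init__(self, row):
        (self.id, self.form, self.lemma, self.upos,
         self.xpos, self.feats, self.head, self.deprel) = row[:8]

    def __repr__(self):
        return "Token(%s/%s)" % (self.form, self.upos)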
def parse(sentence):
    input_format = 'tokenize'
    output_format = 'conllu'
    model_path = 'russian-syntagrus-ud-2.4-190531.udpipe'

    model = Model.load(model_path)
    pipeline = Pipeline(model, input_format, Pipeline.DEFAULT, Pipeline.DEFAULT, output_format)
    error = ProcessingError()

    # small preprocessing step: separate guillemets from the adjacent words
    sentence = re.sub('«', '« ', sentence)
    sentence = re.sub('»', '» ', sentence)

    parsed = pipeline.process(sentence, error)
    print(parsed)
    return parsed
def __init__(
    self,
    path_to_udpipe: str,
    resolve_relcl: bool = True,
    resolve_appos: bool = True,
    resolve_amod: bool = True,
    resolve_conj: bool = True,
    resolve_poss: bool = True,
    ud=dep_v2.VERSION,
):
    super().__init__()
    self.model = Model.load(path_to_udpipe)
    self.pipeline = Pipeline(
        self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
    )
    self._error = ProcessingError()
    self._opts = PredPattOpts(
        resolve_relcl=resolve_relcl,
        resolve_appos=resolve_appos,
        resolve_amod=resolve_amod,
        resolve_conj=resolve_conj,
        resolve_poss=resolve_poss,
        ud=ud,
    )
def get_pipeline():
    # Load model, handle errors
    global model
    sys.stderr.write('Loading model... ')
    model_filename = "./UDPipe-ud2-3/english-gum-ud-2.3.udpipe"
    model = Model.load(model_filename)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % model_filename)
        sys.exit(1)
    sys.stderr.write('Done.\n')

    # Create model pipeline
    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    return pipeline, error
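# Sketch of how the returned pipeline/error pair might be used. "horizontal"
# input means pre-tokenized text: one sentence per line, tokens separated by
# spaces. The sample sentence is illustrative.
pipeline, error = get_pipeline()
conllu_out = pipeline.process("I saw the cat .", error)
if error.occurred():
    sys.stderr.write(error.message + "\n")
else:
    sys.stdout.write(conllu_out)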
class PredPattArgumentExtractor(ArgumentExtractor):
    def __init__(
        self,
        path_to_udpipe: str,
        resolve_relcl: bool = True,
        resolve_appos: bool = True,
        resolve_amod: bool = True,
        resolve_conj: bool = True,
        resolve_poss: bool = True,
        ud=dep_v2.VERSION,
    ):
        super().__init__()
        self.model = Model.load(path_to_udpipe)
        self.pipeline = Pipeline(
            self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
        )
        self._error = ProcessingError()
        self._opts = PredPattOpts(
            resolve_relcl=resolve_relcl,
            resolve_appos=resolve_appos,
            resolve_amod=resolve_amod,
            resolve_conj=resolve_conj,
            resolve_poss=resolve_poss,
            ud=ud,
        )

    @lru_cache(maxsize=100000)
    def extract(self, sentence: str) -> List[Dict[str, Any]]:
        processed = self.pipeline.process(sentence, self._error)
        if self._error.occurred():
            print(f"=== Error occurred: {self._error.message}")
            self._error = ProcessingError()
            return None
        else:
            conll_example = [ud_parse for sent_id, ud_parse in load_conllu(processed)][0]
            ppatt = PredPatt(conll_example, opts=self._opts)
            result = []
            for predicate in ppatt.instances:
                structure = {
                    "predicate": predicate.tokens,
                    "arguments": [x.tokens for x in predicate.arguments],
                }
                result.append(structure)
            return result
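# Hypothetical usage of the extractor class above; the UDPipe model path and
# the sample sentence are placeholders, and the predpatt / ufal.udpipe imports
# from the snippet are assumed to be in scope.
extractor = PredPattArgumentExtractor("english-ewt-ud-2.5.udpipe")
for item in extractor.extract("Chris gave the book to Pat.") or []:
    print(item["predicate"], item["arguments"])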
class UDParser(object):
    models = {
        "en": "UniversalPetrarch/preprocessing/udpipe-1.2.0/model/english-ud-2.0-170801.udpipe",
        "es": "UniversalPetrarch/preprocessing/udpipe-1.2.0/model/spanish-ancora-ud-2.0-170801.udpipe",
        "ar": ""
    }
    pipeline = None
    error = ProcessingError()
    model = None

    def __init__(self, lang="en"):
        model_file = "/Users/sxs149331/PycharmProjects/UniversalPetrarch-master/" + self.models[lang]
        print(model_file)
        self.model = Model.load(model_file)
        if not self.model:
            sys.stderr.write("Model Loading Failed")
            sys.exit(1)
        sys.stderr.write('done\n')
        self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")

    def parse(self, text):
        #print self.pipeline
        processed = self.pipeline.process(text.strip(), self.error)
        if self.error.occurred():
            raise ValueError(self.error.message)

        lines = processed.split("\n")
        result = []
        for line in lines:
            if line.startswith("#"):
                continue
            result.append(line)
        return ("\n").join(result)
def preproc_item(text):
    if pd.isna(text):
        text = ''
    tokenizer.resetDocument()
    try:
        tokenizer.setText(text)
    except TypeError:
        print(row, text)
        raise
    sentence = Sentence()
    error = ProcessingError()
    text = ''
    while tokenizer.nextSentence(sentence, error):
        udpipe_model.tag(sentence, Pipeline.DEFAULT, error)
        #udpipe_model.parse(sentence, Pipeline.DEFAULT, error)
        text += OutputFormat.newConlluOutputFormat().writeSentence(sentence)
    return text
def transform_item(self, x):
    return self.parser_pipeline.process(x, ProcessingError())
class UDPipe:
    """Wrapper for UDPipe (more pythonic than ufal.udpipe)."""

    def __init__(self, model):
        """Create the UDPipe tool object."""
        self.model = model
        path = require_file(model)
        self.tool = Model.load(path)
        if not self.tool:
            raise IOError("Cannot load model from file '%s'" % path)
        self.error = ProcessingError()
        self.conllu_reader = ConlluReader()
        self.tokenizer = self.tool.newTokenizer(Model.DEFAULT)

    def tag_parse_tree(self, root):
        """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
        descendants = root.descendants
        if not descendants:
            return
        pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        in_data = " ".join([n.form for n in descendants])
        out_data = pipeline.process(in_data, self.error)
        if self.error.occurred():
            raise IOError("UDPipe error " + self.error.message)
        self.conllu_reader.files.filehandle = io.StringIO(out_data)
        parsed_root = self.conllu_reader.read_tree()
        nodes = [root] + descendants
        for parsed_node in parsed_root.descendants:
            node = nodes[parsed_node.ord]
            node.parent = nodes[parsed_node.parent.ord]
            for attr in 'upos xpos lemma feats'.split():
                setattr(node, attr, getattr(parsed_node, attr))

        # TODO: benchmark which solution is the fastest one. E.g. we could also do
        # for node, parsed_node in zip(root.descendants, parsed_root.descendants):
        #     parsed_node.misc = node.misc
        # pylint: disable=protected-access
        #root._children, root._descendants = parsed_root._children, parsed_root._descendants

    def tokenize_tag_parse_tree(self, root):
        """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`."""
        if root.children:
            raise ValueError('Tree already contained nodes before tokenization')

        # tokenization (I cannot turn off segmenter, so I need to join the segments)
        self.tokenizer.setText(root.text)
        u_sentence = Sentence()
        is_another = self.tokenizer.nextSentence(u_sentence)
        u_words = u_sentence.words
        n_words = u_words.size() - 1
        if is_another:
            u_sent_cont = Sentence()
            while self.tokenizer.nextSentence(u_sent_cont):
                n_cont = u_sent_cont.words.size() - 1
                for i in range(1, n_cont + 1):
                    u_w = u_sent_cont.words[i]
                    n_words += 1
                    u_w.id = n_words
                    u_words.append(u_w)

        # tagging and parsing
        self.tool.tag(u_sentence, Model.DEFAULT)
        self.tool.parse(u_sentence, Model.DEFAULT)

        # converting UDPipe nodes to Udapi nodes
        heads, nodes = [], [root]
        for i in range(1, u_words.size()):
            u_w = u_words[i]
            node = root.create_child(
                form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag,
                xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel,
            )
            node.misc = u_w.misc
            heads.append(u_w.head)
            nodes.append(node)
        for node in nodes[1:]:
            head = heads.pop(0)
            node.parent = nodes[head]
# Install this.
# !pip install ufal.udpipe

# Load this
from ufal.udpipe import Model, Pipeline, ProcessingError

# Download model: https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2898
# Load this
model = Model.load('hungarian-szeged-ud-2.3-181115.udpipe')

pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')   # Full pipeline
pipeline2 = Pipeline(model, 'vertical', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')  # No tokenization, vertical format as input...
pipeline3 = Pipeline(model, 'vertical', Pipeline.DEFAULT, Pipeline.NONE, 'conllu')     # No tokenization & no parsing = just POS tagging
pipeline4 = Pipeline(model, 'conllu', Pipeline.NONE, Pipeline.DEFAULT, 'conllu')       # Just parsing...
# Remark: pipeline4 runs even when no POS tagging is supplied... Maybe it does POS tagging in the background...

error = ProcessingError()  # For catching errors...

# Do the processing...
processed = pipeline.process('Az alma szép piros volt.', error)
assert not error.occurred(), 'An error happened, check the documentation!'

# Write the output in CoNLL-U
print(processed)
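# A small convenience wrapper in the same spirit as the snippet above: run a
# pipeline and raise on failure instead of asserting (illustrative sketch only).
def process_or_raise(pipe, text):
    err = ProcessingError()
    out = pipe.process(text, err)
    if err.occurred():
        raise RuntimeError(err.message)
    return out

# print(process_or_raise(pipeline, 'Az alma szép piros volt.'))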
class UDPipe: """Wrapper for UDPipe (more pythonic than ufal.udpipe).""" def __init__(self, model): """Create the UDPipe tool object.""" self.model = model path = require_file(model) self.tool = Model.load(path) if not self.tool: raise IOError("Cannot load model from file '%s'" % path) self.error = ProcessingError() self.conllu_reader = ConlluReader() self.tokenizer = self.tool.newTokenizer(Model.DEFAULT) def tag_parse_tree(self, root): """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).""" descendants = root.descendants if not descendants: return pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') in_data = " ".join([n.form for n in descendants]) out_data = pipeline.process(in_data, self.error) if self.error.occurred(): raise IOError("UDPipe error " + self.error.message) self.conllu_reader.files.filehandle = io.StringIO(out_data) parsed_root = self.conllu_reader.read_tree() nodes = [root] + descendants for parsed_node in parsed_root.descendants: node = nodes[parsed_node.ord] node.parent = nodes[parsed_node.parent.ord] for attr in 'upos xpos lemma feats deprel'.split(): setattr(node, attr, getattr(parsed_node, attr)) # TODO: benchmark which solution is the fastest one. E.g. we could also do # for node, parsed_node in zip(root.descendants, parsed_root.descendants): # parsed_node.misc = node.misc # pylint: disable=protected-access #root._children, root._descendants = parsed_root._children, parsed_root._descendants def tokenize_tag_parse_tree(self, root, resegment=False, tag=True, parse=True): """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`. If resegment=True, the returned list of Udapi trees may contain multiple trees. """ if root.children: raise ValueError( 'Tree already contained nodes before tokenization') # Tokenize and segment the text (segmentation cannot be turned off in older UDPipe versions). self.tokenizer.setText(root.text) is_another = True u_sentences = [] while is_another: u_sentence = Sentence() is_another = self.tokenizer.nextSentence(u_sentence) if is_another: u_sentences.append(u_sentence) # If resegmentation was not required, we need to join the segments. if not resegment and len(u_sentences) > 1: first_sent = u_sentences[0] n_words = first_sent.words.size() - 1 for other_sent in u_sentences[1:]: other_words = other_sent.words.size() - 1 for i in range(1, other_words + 1): u_w = other_sent.words[i] n_words += 1 u_w.id = n_words first_sent.words.append(u_w) u_sentences = [first_sent] # tagging and parsing if tag: for u_sentence in u_sentences: self.tool.tag(u_sentence, Model.DEFAULT) if parse: self.tool.parse(u_sentence, Model.DEFAULT) elif parse: raise ValueError( 'Combination parse=True tag=False is not allowed.') # converting UDPipe nodes to Udapi nodes new_root = root trees = [] for u_sentence in u_sentences: if not new_root: new_root = Root() heads, nodes = [], [new_root] u_words = u_sentence.words for i in range(1, u_words.size()): u_w = u_words[i] node = new_root.create_child( form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag, xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, misc=u_w.misc, ) if parse: heads.append(u_w.head) nodes.append(node) if parse: for node in nodes[1:]: head = heads.pop(0) node.parent = nodes[head] trees.append(new_root) new_root = None return trees
class TextUtils(object): def __init__(self): self.clause_splitter = rutokenizer.Segmenter() self.tokenizer = Tokenizer() self.tokenizer.load() #self.lexicon = Word2Lemmas() self.language_resources = LanguageResources() self.postagger = rupostagger.RuPosTagger() self.chunker = ruchunker.Chunker() self.word2tags = ruword2tags.RuWord2Tags() self.flexer = ruword2tags.RuFlexer() self.syntan = None self.gg_dictionaries = GenerativeGrammarDictionaries() #self.known_words = set() #self.lemmatizer = Mystem() self.lemmatizer = rulemma.Lemmatizer() self.word_embeddings = None def load_embeddings(self, w2v_dir, wc2v_dir): # Загрузка векторных словарей self.word_embeddings = WordEmbeddings() self.word_embeddings.load_models(w2v_dir, wc2v_dir) if wc2v_dir: p = os.path.join(wc2v_dir, 'wc2v.kv') self.word_embeddings.load_wc2v_model(p) p = os.path.join(w2v_dir, 'w2v.kv') self.word_embeddings.load_w2v_model(p) def load_dictionaries(self, data_folder, models_folder): self.lemmatizer.load() # Общий словарь для генеративных грамматик #self.gg_dictionaries.load(os.path.join(models_folder, 'generative_grammar_dictionaries.bin')) #word2lemmas_path = os.path.join(data_folder, 'ru_word2lemma.tsv.gz') #self.lexicon.load(word2lemmas_path) #word2tags_path = os.path.join(data_folder, 'chatbot_word2tags.dat') #self.postagger.load(word2tags_path) self.postagger.load() self.word2tags.load() self.flexer.load() self.chunker.load() # Грузим dependency parser UDPipe и русскоязычную модель model_file = os.path.join(models_folder, 'udpipe_syntagrus.model') self.udpipe_model = Model.load(model_file) self.udpipe_pipeline = Pipeline(self.udpipe_model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') self.udpipe_error = ProcessingError() #self.syntan = rusyntax2.Tagger(self.word2tags, w2v, self.postagger) #self.syntan.load() #rules_path = os.path.join(data_folder, 'rules.yaml') #with io.open(rules_path, 'r', encoding='utf-8') as f: #data = yaml.safe_load(f) #self.no_info_replicas = data['no_relevant_information'] #self.unknown_order = data['unknown_order'] #self.language_resources.key2phrase[u'yes'] = data[u'answers'][u'yes'] #self.language_resources.key2phrase[u'not'] = data[u'answers'][u'not'] # Список "хороших слов" для генеративной грамматики #with io.open(os.path.join(models_folder, 'dataset_words.txt'), 'r', encoding='utf-8') as rdr: # for line in rdr: # word = line.strip() # self.known_words.add(word) def apply_word_function(self, func, constants, words): part_of_speech = None tag = None if func == '$chooseAdjByGender': part_of_speech = 'ПРИЛАГАТЕЛЬНОЕ' tag = ('РОД', constants['gender']) elif func == '$chooseVByGender': part_of_speech = 'ГЛАГОЛ' tag = ('РОД', constants['gender']) elif func == '$chooseNByGender': part_of_speech = 'СУЩЕСТВИТЕЛЬНОЕ' tag = ('РОД', constants['gender']) else: raise NotImplementedError() tag2 = tag[0] + '=' + tag[1] for word in words: #tagsets = self.gg_dictionaries.grdict.get_word_tagsets2(word.lower(), part_of_speech) for tagset in self.word2tags[word.lower()]: if part_of_speech in tagset and tag2 in tagset: return word msg = 'Could not choose a word among {}'.format(' '.join(words)) raise RuntimeError(msg) def tag(self, words, with_lemmas=False): """ Частеречная разметка для цепочки слов words """ if with_lemmas: return self.lemmatizer.lemmatize(self.postagger.tag(words)) else: return self.postagger.tag(words) def canonize_text(self, s): """ Удаляем два и более пробелов подряд, заменяя на один """ s = re.sub("(\\s{2,})", ' ', s.strip()) return s def remove_terminators(self, s): """ 
Убираем финальные пунктуаторы ! ? .""" return s[:-1].strip() if s[-1] in u'?!.' else s def wordize_text(self, s): return u' '.join(self.tokenize(s)) def ngrams(self, s, n): #return [u''.join(z) for z in itertools.izip(*[s[i:] for i in range(n)])] return [u''.join(z) for z in zip(*[s[i:] for i in range(n)])] def words2str(self, words): return u' '.join( itertools.chain([BEG_WORD], filter(lambda z: len(z) > 0, words), [END_WORD])) def split_clauses(self, s): return list(self.clause_splitter.split(s)) def tokenize(self, s): return self.tokenizer.tokenize(s) def extract_lemma(self, token): return token[0] if token[1] == 'PRON' else token[2] def lemmatize(self, s): words = self.tokenizer.tokenize(s) #wx = u' '.join(words) #return [l for l in self.lemmatizer.lemmatize(wx) if len(l.strip()) > 0] tokens = self.lemmatizer.lemmatize(self.postagger.tag(words)) return [self.extract_lemma(t) for t in tokens] def lemmatize2(self, s): words = self.tokenizer.tokenize(s) return self.lemmatizer.lemmatize(self.postagger.tag(words)) def lpad_wordseq(self, words, n): """ Слева добавляем пустые слова """ return list( itertools.chain(itertools.repeat(PAD_WORD, n - len(words)), words)) def rpad_wordseq(self, words, n): """ Справа добавляем пустые слова """ return list( itertools.chain(words, itertools.repeat(PAD_WORD, n - len(words)))) #def get_lexicon(self): # return self.lexicon def is_question_word(self, word): return word in u'насколько где кто что почему откуда куда зачем чего кого кем чем кому чему ком чем как сколько ли когда докуда какой какая какое какие какого какую каких каким какими какому какой каков какова каковы'.split( ) def build_output_phrase(self, words): s = u' '.join(words) s = s.replace(u' ?', u'?').replace(u' !', u'!').replace(u' ,', u',').replace(u' :', u',') \ .replace(u' .', u'.').replace(u'( ', u'(').replace(u' )', u')') s = s[0].upper() + s[1:] return s def detect_person0(self, words): if any((word in (u'ты', u'тебя', u'тебе')) for word in words): return 2 if any((word in (u'я', u'мне', u'меня')) for word in words): return 1 return -1 def extract_chunks(self, sample): tokens = self.tokenizer.tokenize(sample) tagsets = list(self.postagger.tag(tokens)) lemmas = self.lemmatizer.lemmatize(tagsets) #edges = syntan.parse(tokens, tagsets) phrase_tokens = [] for word_index, (token, tagset, lemma) in enumerate(zip(tokens, tagsets, lemmas)): t = PhraseToken() t.word = token t.norm_word = token.lower() t.lemma = lemma[2] t.tagset = tagset[1] t.word_index = word_index phrase_tokens.append(t) chunks = self.chunker.parse(tokens) for chunk_index, chunk in enumerate(chunks): phrase_tokens[chunk.tokens[0].index].is_chunk_starter = True for token in chunk.tokens: phrase_tokens[token.index].chunk_index = chunk_index return chunks def word_similarity(self, word1, word2): return self.word_embeddings.word_similarity(word1, word2) def parse_syntax(self, text_str): processed = self.udpipe_pipeline.process(text_str, self.udpipe_error) if self.udpipe_error.occurred(): logging.error("An error occurred when running run_udpipe: %s", self.udpipe_error.message) return None parsed_data = pyconll.load_from_string(processed)[0] return parsed_data def get_udpipe_attr(self, token, tag_name): if tag_name in token.feats: v = list(token.feats[tag_name])[0] return v return '' def change_verb_gender(self, verb_inf, new_gender): """ Изменение формы глагола в прошедшем времени единственном числе """ required_tags = [('ВРЕМЯ', 'ПРОШЕДШЕЕ'), ('ЧИСЛО', 'ЕД')] if new_gender == 'Fem': required_tags.append(('РОД', 'ЖЕН')) else: 
required_tags.append(('РОД', 'МУЖ')) forms = list(self.flexer.find_forms_by_tags(verb_inf, required_tags)) if forms: return forms[0] else: return None def change_adj_gender(self, adj_lemma, new_gender, variant): if adj_lemma == 'должен': if new_gender == 'Fem': return 'должна' else: return 'должен' required_tags = [('ЧИСЛО', 'ЕД')] if variant == 'Short': required_tags.append(('КРАТКИЙ', '1')) else: required_tags.append(('КРАТКИЙ', '0')) required_tags.append(('ПАДЕЖ', 'ИМ')) if new_gender == 'Fem': required_tags.append(('РОД', 'ЖЕН')) else: required_tags.append(('РОД', 'МУЖ')) forms = list(self.flexer.find_forms_by_tags(adj_lemma, required_tags)) if forms: return forms[0] else: return None def is_premise_suitable_as_answer(self, premise_text): # Можно ли текст предпосылки использовать в качестве ответа tx = self.tokenize(premise_text) if len(tx) > 5: return False if ',' in tx or 'и' in tx or 'или' in tx: return False return True
class UDPipe: """Wrapper for UDPipe (more pythonic than ufal.udpipe).""" def __init__(self, model): """Create the UDPipe tool object.""" self.model = model path = require_file(model) self.tool = Model.load(path) if not self.tool: raise IOError("Cannot load model from file '%s'" % path) self.error = ProcessingError() self.conllu_reader = ConlluReader() self.tokenizer = self.tool.newTokenizer(Model.DEFAULT) def tag_parse_tree(self, root): """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized).""" pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') in_data = " ".join([n.form for n in root.descendants]) out_data = pipeline.process(in_data, self.error) if self.error.occurred(): raise IOError("UDPipe error " + self.error.message) self.conllu_reader.files.filehandle = io.StringIO(out_data) parsed_root = self.conllu_reader.read_tree() nodes = [root] + root.descendants for parsed_node in parsed_root.descendants: node = nodes[parsed_node.ord] node.parent = nodes[parsed_node.parent.ord] for attr in 'upos xpos lemma feats'.split(): setattr(node, attr, getattr(parsed_node, attr)) # TODO: benchmark which solution is the fastest one. E.g. we could also do # for node, parsed_node in zip(root.descendants, parsed_root.descendants): # parsed_node.misc = node.misc # pylint: disable=protected-access #root._children, root._descendants = parsed_root._children, parsed_root._descendants def tokenize_tag_parse_tree(self, root): """Tokenize, tag (+lemmatize, fill FEATS) and parse the text stored in `root.text`.""" if root.children: raise ValueError('Tree already contained nodes before tokenization') # tokenization (I cannot turn off segmenter, so I need to join the segments) self.tokenizer.setText(root.text) u_sentence = Sentence() is_another = self.tokenizer.nextSentence(u_sentence) u_words = u_sentence.words n_words = u_words.size() - 1 if is_another: u_sent_cont = Sentence() while self.tokenizer.nextSentence(u_sent_cont): n_cont = u_sent_cont.words.size() - 1 for i in range(1, n_cont + 1): u_w = u_sent_cont.words[i] n_words += 1 u_w.id = n_words u_words.append(u_w) # tagging and parsing self.tool.tag(u_sentence, Model.DEFAULT) self.tool.parse(u_sentence, Model.DEFAULT) # converting UDPipe nodes to Udapi nodes heads, nodes = [], [root] for i in range(1, u_words.size()): u_w = u_words[i] node = root.create_child( form=u_w.form, lemma=u_w.lemma, upos=u_w.upostag, xpos=u_w.xpostag, feats=u_w.feats, deprel=u_w.deprel, ) node.misc = u_w.misc heads.append(u_w.head) nodes.append(node) for node in nodes[1:]: head = heads.pop(0) node.parent = nodes[head]
class ProcessorUDPipe: """Wrapper around UDPipe - Trainable pipeline library. Performs: 1. Tokenization. 2. Tagging. 3. Lemmatizing. 4. Parsing. """ def __init__(self, model_path, tagger=True, parser=True, delay_init=False): self._model_path = model_path self._enable_tagger = tagger self._enable_parser = parser self.model = None if not delay_init: self.init() def init(self): if self.model is None: self.model = Model.load(self._model_path) if not self.model: sys.stderr.write('Cannot load model from file "%s"\n' % self._model_path) self.tagger = Pipeline.DEFAULT if self._enable_tagger else Pipeline.NONE self.parser = Pipeline.DEFAULT if self._enable_parser else Pipeline.NONE self.error = ProcessingError() self.converter_conll = ConverterConllUDV1() def __call__(self, *argv): """Performs tokenization, tagging, lemmatizing and parsing. Args: text(str): text. OR tokens(list): List of Token objects. sentences(list): List of Sentence objects. Returns: Dictionary that contains: 1. tokens - list of objects Token. 2. sentences - list of objects Sentence. 3. lemma - list of lists of strings that represent lemmas of words. 4. postag - list of lists of strings that represent postags of words. 5. morph - list of lists of strings that represent morphological features. 6. syntax_dep_tree - list of lists of objects WordSynt that represent a dependency tree. """ assert self.model if type(argv[0]) == str: self.TOKENIZER = 'generic_tokenizer' self.pipeline = Pipeline(self.model, self.TOKENIZER, self.tagger, self.parser, 'conllu') return self.process_text(argv[0]) self.TOKENIZER = 'horizontal' self.pipeline = Pipeline(self.model, self.TOKENIZER, self.tagger, self.parser, 'conllu') return self.process_tokenized(argv[0], argv[1]) def process_text(self, text): udpipe_result = self.pipeline.process(text, self.error) if self.error.occurred(): sys.stderr.write( 'An error occurred when calling ProcessorUDPipe: ') sys.stderr.write(self.error.message) return sys.stderr.write('\n') annotation = self.convert_conll(text, udpipe_result) return annotation def process_tokenized(self, tokens, sentences): raw_input = self.prepare_tokenized_input(tokens, sentences) annotation = self.process_text(raw_input) lemma_result = annotation['lemma'] postag_result = annotation['postag'] morph_result = annotation['morph'] synt_dep_tree_result = annotation['syntax_dep_tree'] return { 'lemma': lemma_result, 'postag': postag_result, 'morph': morph_result, 'syntax_dep_tree': synt_dep_tree_result } def prepare_tokenized_input(self, tokens, sentences): raw_input_s = '' for sent in sentences: line = ' '.join((e.text for e in CSentence(tokens, sent))) raw_input_s += line raw_input_s += '\n' return raw_input_s def convert_conll(self, text, udpipe_result): annotation = self.converter_conll(udpipe_result) if self.tagger == Pipeline.NONE: for key in ('lemma', 'postag'): annotation.pop(key, None) if self.parser == Pipeline.NONE: for key in ('syntax_dep_tree', 'postag'): annotation.pop(key, None) for sent_lemma in annotation['lemma']: for i in range(len(sent_lemma)): sent_lemma[i] = sent_lemma[i].lower() annotation['sentences'] = self.converter_conll.sentence_split( annotation['form']) annotation['tokens'] = self.converter_conll.get_tokens( text, annotation['form']) return annotation
class Omorfi: """ An object holding omorfi binariesfor all the functions of omorfi. The following functionalities use automata binaries that need to be loaded separately: * analysis * tokenisation * generation * lemmatisation * segmentation * lookup * guess There is python code to perform basic string munging controlled by following bool attributes: try_lowercase: to use `str.lower()` try_titlecase: to use `str[0].upper() + str[1:]` try_uppercase: to use `str.upper()` try_detitlecase: to use `str[0].lower + str[1:]` The annotations will be changed when transformation has been applied. """ ## magic number for penalty weights _penalty = 28021984 ## paths to search auto-detected models from _stdpaths = [ '/usr/local/share/hfst/fi/', '/usr/share/hfst/fi/', '/usr/local/share/omorfi/', '/usr/share/omorfi/', './', 'generated/', 'src/generated/', '../src/generated/' ] def __init__(self, verbosity=False): """Construct Omorfi with given verbosity for printouts.""" self._verbosity = verbosity ## analyser model self.analyser = None ## tokeniser self.tokeniser = None ## generator model self.generator = None ## lemmatising model self.lemmatiser = None ## hyphenating model self.hyphenator = None ## segmenting model self.segmenter = None ## label-segment model self.labelsegmenter = None ## acceptor self.acceptor = None ## guesser model self.guesser = None ## UDPipe model self.udpiper = None ## UDPipeline object :-( self.udpipeline = None ## UDError object :-( self.uderror = None ## database of lexical unigram probabilities self.lexlogprobs = dict() ## database of tag unigram probabilities self.taglogprobs = dict() ## whether to lowercase and re-analyse if needed self.try_lowercase = True ## whether to Titlecase and re-analyse if needed self.try_titlecase = True ## whether to dEtitlecase and re-analyse if needed self.try_detitlecase = True ## whether to UPPERCASE and re-analyse if needed self.try_uppercase = False ## whether accept model is loaded self.can_accept = False ## whether analyser model is loaded self.can_analyse = False ## whether tokenisr model is loaded self.can_tokenise = True ## whether generator model is loaded self.can_generate = False ## whether lemmatising model is loaded self.can_lemmatise = False ## whether hypenation model is loaded self.can_hyphenate = False ## whether segmentation model is loaded self.can_segment = False ## whether label segmentation model is loaded self.can_labelsegment = False ## whether guesser model is loaded self.can_guess = False ## whether UDPipe is loaded self.can_udpipe = False def load_hfst(self, f): """Load an automaton from file. @param f containing single hfst automaton binary. """ try: his = libhfst.HfstInputStream(f) return his.read() except libhfst.NotTransducerStreamException: raise IOError def load_labelsegmenter(self, f): """Load labeled segments model from a file. @param f containing single hfst automaton binary. @sa load_hfst(self, f) """ self.labelsegmenter = self.load_hfst(f) self.can_labelsegment = True def load_segmenter(self, f): """Load segmentation model from a file. @param f containing single hfst automaton binary. @sa load_hfst(self, f) """ self.segmenter = self.load_hfst(f) self.can_segment = True def load_analyser(self, f): """Load analysis model from a file. @param f containing single hfst automaton binary. @sa load_hfst(self, f) """ self.analyser = self.load_hfst(f) self.can_analyse = True self.can_accept = True self.can_lemmatise = True def load_generator(self, f): """Load generation model from a file. 
@param f containing single hfst automaton binary. @sa load_hfst(self, f) """ self.generator = self.load_hfst(f) self.can_generate = True def load_acceptor(self, f): """Load acceptor model from a file. @param f containing single hfst automaton binary. @sa load_hfst(self, f) """ self.acceptor = self.load_hfst(f) self.can_accept = True def load_tokeniser(self, f): """Load tokeniser model from a file. @param f containing single hfst automaton binary. @sa load_hfst(self, f) """ self.tokeniser = self.load_hfst(f) self.can_tokenise = True def load_lemmatiser(self, f): """Load lemmatiser model from a file. @param f containing single hfst automaton binary. @sa load_hfst(self, f) """ self.tokeniser = self.load_hfst(f) self.can_lemmatise = True def load_hyphenator(self, f): """Load hyphenator model from a file. @param f containing single hfst automaton binary. @sa load_hfst(self, f) """ self.hyphenator = self.load_hfst(f) self.can_hyphenate = True def load_guesser(self, f): """Load guesser model from a file. @param f containing single hfst automaton binary. @sa load_hfst(self, f) """ self.guesser = self.load_hfst(f) self.can_guess = True def load_filename(self, path, **include): """Load omorfi automaton from filename and guess its use. A file name should consist of three parts separated by full stop. The second part must be a keyword describing the use of the automaton, first part is parsed as an identifier typically starting with the word omorfi, followed by any extras, such as the tagset for analysis or generation. The named arguments can include a name of automaton type as name, and truth value as value, for types of automata allowed to load. By default, the names `analyse`, `generate` and `segment` are loaded. Names not included are defaulted to False. E.g., `omorfi.load_filename(fn, analyse=True)` will only load file named fn if it can be identified as omorfi analyser. This is best used in conjunction with omorfi.load_from_dir. 
""" if len(include) == 0: include['analyse'] = True include['generate'] = True include['segment'] = True include['accept'] = True for ttype in [ 'analyse', 'generate', 'accept', 'tokenise', 'lemmatise', 'hyphenate', 'segment', 'labelsegment', 'guesser', 'udpipe' ]: if ttype not in include: include[ttype] = False parts = path[path.rfind('/') + 1:path.rfind('.')].split('.') if len(parts) != 2: if self._verbosity: print('not loaded', path) elif not parts[0] == 'omorfi': if self._verbosity: print('not omorfi', path) elif parts[1] == 'analyse' and include['analyse']: if self._verbosity: print('analyser', parts[0]) self.load_analyser(path) elif parts[1] == 'generate' and include['generate']: if self._verbosity: print('generator', parts[0]) self.load_generator(path) elif parts[1] == 'accept' and include['accept']: if self._verbosity: print('acceptor', parts[0]) self.load_acceptor(path) elif parts[1] == 'tokenise' and include['tokenise']: if self._verbosity: print('tokeniser', parts[0]) self.load_tokeniser(path) elif parts[1] == 'lemmatise' and include['lemmatise']: if self._verbosity: print('lemmatiser', parts[0]) self.load_lemmatiser(path) elif parts[1] == 'hyphenate' and include['hyphenate']: if self._verbosity: print('hyphenator', parts[0]) self.load_hyphenator(path) elif parts[1] == 'segment' and include['segment']: if self._verbosity: print('segmenter', parts[0]) self.load_segmenter(path) elif parts[1] == 'guesser' and include['guesser']: if self._verbosity: print('guesser', parts[0]) self.load_guesser(path) elif parts[1] == 'labelsegment' and include['labelsegment']: if self._verbosity: print('labelsegmenter', parts[0]) self.load_labelsegmenter(path) elif self._verbosity: print('skipped', parts) def _maybe_str2token(self, s): if isinstance(s, str): return {"surf": s} elif isinstance(s, dict): return s else: return { "error": "not a string or dict", "location": "maybe_str2token", "data": s } def load_from_dir(self, path=None, **include): """Load omorfi automata from given or known locations. If path is given it should point to directory of automata, otherwise standard installation paths are tried. Currently standard linux install paths are all globbed in following order: * /usr/local/share/hfst/fi/*.hfst * /usr/share/hfst/fi/*.hfst * /usr/local/share/omorfi/*.hfst * /usr/share/omorfi/*.hfst * getenv('HOME') + /.hfst/fi/*.hfst * getenv('HOME') + /.omorfi/*.hfst Last two paths require getenv('HOME'). All automata matching glob *.hfst are loaded and stored in part of omorfi class appropriate for their usage. They keyword args can be used to limit loading of automata. The name is analyser type and value is True. """ homepaths = [] if getenv('HOME'): home = getenv('HOME') homepaths = [home + '/.hfst/fi/', home + '/.omorfi/'] loadable = [] if path: if self._verbosity: print('adding', path + '/*.hfst') loadable = glob(path + '/*.hfst') else: for sp in self._stdpaths + homepaths: if self._verbosity: print('adding', sp + '/*.hfst') loadable += glob(sp + '/*.hfst') for filename in loadable: try: self.load_filename(filename, **include) except: print("broken HFST", filename, file=stderr) def load_udpipe(self, filename): """Load UDPipe model for statistical parsing. UDPipe can be used as extra information source for OOV symbols or all tokens. It works best with sentence-based analysis, token based does not keep track of context. 
@param filename path to UDPipe model """ if not can_udpipe: print("importing udpipe failed, cannot load udpipe xxx") return self.udpiper = Model.load(filename) # use pipeline for now, ugly but workable self.udpipeline = Pipeline(self.udpiper, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') self.uderror = ProcessingError() ## udpipe is loaded self.can_udpipe = True def load_lexical_frequencies(self, lexfile): """Load a frequency list for lemmas. Experimental. Currently in uniq -c format, subject to change. @param filename path to file with frequencies. """ lextotal = 0 lexcounts = dict() for line in lexfile: fields = line.split('\t') lexcount = int(fields[0]) lexcounts[fields[1]] = lexcount lextotal += lexcount for lex, freq in lexcounts.items(): if freq != 0: self.lexlogprobs[lex] = log(freq / lextotal) else: # XXX: hack hack, should just use LM count stuff with # discounts self.lexlogprobs[lex] = log(1 / (lextotal + 1)) def load_omortag_frequencies(self, omorfile): """Load a frequenc list for tags. Experimental. Currently in uniq -c format. Subject to change. @param filename path to file with frequencies. """ omortotal = 0 omorcounts = dict() for line in omorfile: fields = line.split('\t') omorcount = int(fields[0]) omorcounts[fields[1]] = omorcount omortotal += omorcount for omor, freq in omorcounts.items(): if freq != 0: self.taglogprobs[omor] = log(freq / omortotal) else: # XXX: hack hack, should just use LM count stuff with # discounts self.taglogprobs[omor] = log(1 / (omortotal + 1)) def _find_retoken_recase(self, token): """Turns a string into a recased non-OOV token.""" if self.accept(token): return {"surf": token, "analsurf": token, "recase": "ORIGINALCASE"} if self.try_lowercase and self.accept(token.lower()): return { "surf": token, "analsurf": token.lower(), "recase": "LOWERCASED" } if self.try_uppercase and self.accept(token.upper()): return { "surf": token, analsurf: "token.upper()", "recase": "UPPERCASED" } if len(token) > 1: if self.try_titlecase and \ self.accept(token[0].upper() + token[1:].lower()): return { "surf": token, "analsurf": token[0].upper() + token[1:].lower(), "recase": "TITLECASED" } if self.try_detitlecase and \ self.accept(token[0].lower() + token[1:]): return { "surf": token, "analsurf": token[0].lower() + token[1:], "recase": "DETITLECASED" } return False def _find_retokens(self, token): """Turns a string into a list of likely tokens.""" retoken = self._find_retoken_recase(token) if retoken: return [retoken] # Word. if token[-1] in fin_punct_trailing: retoken = self._find_retoken_recase(token[:-1]) if retoken: retoken['SpaceAfter'] = "No" return [retoken, {"surf": token[-1], "SpaceBefore": "No"}] # -Word if token[0] in fin_punct_leading: retoken = self._find_retoken_recase(token[1:]) if retoken: retoken['SpaceBefore'] = 'No' return [{"surf": token[0], "SpaceAfter": "No"}, retoken] # "Word" if token[0] in fin_punct_leading and token[-1] in fin_punct_trailing: retoken = self._find_retoken_recase(token[1:-1]) if retoken: retoken['SpaceBefore'] = 'No' retoken['SpaceAfter'] = 'No' return [{ "surf": token[0], "SpaceAfter": "No" }, retoken, { "surf": token[-1], "SpaceBefore": "No" }] # word." 
or word", if len(token) > 2 and token[-1] in fin_punct_trailing and token[ -2] in fin_punct_trailing: retoken = self._find_retoken_recase(token[:-2]) if retoken: retoken["SpaceAfter"] = "No" return [ retoken, { "surf": token[-2], "SpaceBefore": "No", "SpaceAfter": "No" }, { "surf": token[-1], "SpaceBefore": "No" } ] # word.", if len(token) > 3 and token[-1] in fin_punct_trailing and token[ -2] in fin_punct_trailing and token[-3] in fin_punct_trailing: retoken = self._find_retoken_recase(token[:-3]) if retoken: retoken["SpaceAfter"] = "No" return [ retoken, { "surf": token[-3], "SpaceBefore": "No", "SpaceAfter": "No" }, { "surf": token[-2], "SpaceBefore": "No", "SpaceAfter": "No" }, { "surf": token[-1], "SpaceBefore": "No" } ] # "word." if len(token) > 3 and token[-1] in fin_punct_trailing and token[ -2] in fin_punct_trailing and token[0] in fin_punct_leading: retoken = self._find_retoken_recase(token[1:-2]) if retoken: retoken["SpaceAfter"] = "No" retoken["SpaceBefore"] = "No" return [{ "surf": token[0], "SpaceAfter": "No" }, retoken, { "surf": token[-2], "SpaceBefore": "No", "SpaceAfter": "No" }, { "surf": token[-1], "SpaceBefore": "No" }] # "word.", if len(token) > 4 and token[-1] in fin_punct_trailing and token[ -2] in fin_punct_trailing and token[ -3] in fin_punct_trailing and token[0] in fin_punct_leading: retoken = self._find_retoken_recase(token[1:-3]) if retoken: retoken["SpaceAfter"] = "No" retoken["SpaceBefore"] = "No" return [{ "surf": token[0], "SpaceAfter": "No" }, retoken, { "surf": token[-3], "SpaceBefore": "No", "SpaceAfter": "No" }, { "surf": token[-2], "SpaceBefore": "No", "SpaceAfter": "No" }, { "surf": token[-1], "SpaceBefore": "No" }] # ...non-word... pretokens = [] posttokens = [] while len(token) > 1 and token[-1] in fin_punct_trailing: posttokens = ([{ "surf": token[-1], "SpaceBefore": "No" }] + posttokens) token = token[:-1] while len(token) > 1 and token[0] in fin_punct_leading: pretokens += [{"surf": token[0], "SpaceAfter": "No"}] token = token[1:] lastresort = {"surf": token} if len(pretokens) > 0: lastresort['SpaceBefore'] = 'No' if len(posttokens) > 0: lastresort['SpaceAfter'] = 'No' return pretokens + [lastresort] + posttokens def _retokenise(self, tokens): """Takes list of string from and produces list of tokens. May change number of tokens. Should be used with result of split(). """ retokens = [] for token in tokens: for retoken in self._find_retokens(token): retokens.append(retoken) return retokens def fsa_tokenise(self, line): """Tokenise with FSA. @param line string to tokenise """ return None def python_tokenise(self, line): """Tokenise with python's basic string functions. @param line string to tokenise """ return self._retokenise(line.split()) def tokenise(self, line): """Perform tokenisation with loaded tokeniser if any, or `split()`. If tokeniser is available, it is applied to input line and if result is achieved, it is split to tokens according to tokenisation strategy and returned as a list. If no tokeniser are present, or none give results, the line will be tokenised using python's basic string functions. If analyser is present, tokeniser will try harder to get some analyses for each token using hard-coded list of extra splits. """ tokens = None if self.tokeniser: tokens = self.fsa_tokenise(line) if not tokens: tokens = self.python_tokenise(line) return tokens def _analyse_str(self, s): """Legacy function if you really need to analyse a string. Turn it into a token and analyse a token instead. This is not even the standard string mangling. 
Do not touch.""" token = {"surf": s} res = self._analyse_token(token) if len(s) > 2 and s[0].islower() and self.try_titlecase: tcs = s[0].upper() + s[1:].lower() if s != tcs: tctoken = {"surf": s, "analsurf": tcs, "recase": 'Titlecased'} tcres = self._analyse_token(tctoken) for r in tcres: r['anal'] += '[CASECHANGE=TITLECASED]' res = res + tcres if len(token) > 2 and s[0].isupper() and self.try_detitlecase: dts = s[0].lower() + s[1:] if dts != s: dttoken = { "surf": s, "analsurf": dts, "recase": "dETITLECASED" } dtres = self._analyse_token(dttoken) for r in dtres: r['anal'] += '[CASECHANGE=DETITLECASED]' res = res + dtres if not s.isupper() and self.try_uppercase: ups = s.upper() if s != ups: uptoken = {"surf": s, "analsurf": ups, "recase": "UPPERCASED"} upres = self._analyse_token(uptoken) for r in upres: r['anal'] += '[CASECHANGE=UPPERCASED]' res = res + upres if not s.islower() and self.try_lowercase: lows = s.lower() if s != lows: lowtoken = { "surf": s, "analsurf": lows, "recase": "lowercased" } lowres = self._analyse_token(lowtoken) for r in lowres: r['anal'] += '[CASECHANGE=LOWERCASED]' res += lowres return res def _analyse_token(self, token): rv = [] if "analsurf_override" in token: # begin of sentence, etc. recasing extra res = self.analyser.lookup(token["analsurf_override"]) for r in res: rvtoken = token.copy() rvtoken['anal'] = r[0] + '[WEIGHT=%f]' % (r[1]) rvtoken['weight'] = r[1] rv.append(rvtoken) if "analsurf" in token: # surface from already determined res = self.analyser.lookup(token["analsurf"]) for r in res: rvtoken = token.copy() rvtoken['anal'] = r[0] + '[WEIGHT=%f]' % (r[1]) rvtoken["weight"] = r[1] rv.append(rvtoken) else: # use real surface case res = self.analyser.lookup(token["surf"]) for r in res: rvtoken = token.copy() rvtoken['analsurf'] = token["surf"] rvtoken['anal'] = r[0] + '[WEIGHT=%f]' % (r[1]) rvtoken["weight"] = r[1] rv.append(rvtoken) if not "analsurf_override" in token and not "analsurf" in token: # also guess other cases s = token['surf'] if len(s) > 2 and s[0].islower() and self.try_titlecase: tcs = s[0].upper() + s[1:].lower() if tcs != s: tcres = self.analyser.lookup(tcs) for r in tcres: tctoken = token.copy() tctoken['recase'] = 'Titlecased' tctoken['analsurf'] = tcs tctoken['anal'] = r[0] + \ '[CASECHANGE=TITLECASED][WEIGHT=%f]' % (r[1] + self._penalty) tctoken["weight"] = r[1] + self._penalty rv.append(tctoken) if len(token) > 2 and s[0].isupper() and self.try_detitlecase: dts = s[0].lower() + s[1:] if dts != s: dtres = self.analyser.lookup(dts) for r in dtres: dttoken = token.copy() dttoken['recase'] = 'dETITLECASED' dttoken['analsurf'] = dts dttoken['anal'] = r[0] + \ "[CASECHANGE=DETITLECASED][WEIGHT=%f]" % (r[1] + self._penalty) dttoken["weight"] = r[1] + self._penalty rv.append(dttoken) if not s.isupper() and self.try_uppercase: ups = s.upper() if ups != s: upres = self.analyser.lookup(ups) for r in upres: uptoken = token.copy() uptoken['recase'] = 'UPPERCASED' uptoken['analsurf'] = ups uptoken['anal'] = r[0] + \ "[CASECHANGE=UPPERCASED][WEIGHT=%f]" % (r[1] + self._penalty) uptoken["weight"] = r[1] + self._penalty rv.append(uptoken) if not s.islower() and self.try_lowercase: lows = s.lower() if lows != s: lowres = self.analyser.lookup(lows) for r in lowres: lowtoken = token.copy() lowtoken['recase'] = 'lowercased' lowtoken['analsurf'] = lows lowtoken['anal'] = r[0] +\ "[CASECHANGE=LOWERCASED][WEIGHT=%f]" %(r[1] + self._penalty) lowtoken["weight"] = r[1] + self._penalty rv.append(lowtoken) return rv def analyse(self, token): """Perform a 
simple morphological analysis lookup. If try_titlecase does not evaluate to False, the analysis will also be performed with first letter uppercased and rest lowercased. If try_uppercase evaluates to not False, the analysis will also be performed on all uppercase variant. If try_lowercase evaluates to not False, the analysis will also be performed on all lowercase variant. The analyses with case mangling will have an additional element to them identifying the casing. """ anals = None if isinstance(token, str): anals = self._analyse_str(token) elif isinstance(token, dict): anals = self._analyse_token(token) else: anals = [{ "error": "token is not str or dict", "token": token, "location": "analyse()" }] if not anals: if isinstance(token, str): anal = {"anal": '[WORD_ID=%s][GUESS=UNKNOWN][WEIGHT=inf]' \ % (token), "weight": float('inf'), "OOV": "Yes", "guess": "None"} elif isinstance(token, dict): anal = token.copy() anal["anal"] = '[WORD_ID=%s][GUESS=UNKNOWN][WEIGHT=inf]' \ % (token['surf']) anal["weight"] = float('inf') anal["OOV"] = "Yes" anal["guess"] = "None" else: anal = { "error": "token is not str or dict", "token": token, "location": "analyse()" } anals = [anal] return anals def analyse_sentence(self, s): """Analyse a full sentence with tokenisation and guessing. for details of tokenisation, see @c tokenise(self, s). for details of analysis, see @c analyse(self, token). If further models like udpipe are loaded, may fill in gaps with that. """ tokens = self.tokenise(s) if not tokens: tokens = [{ "error": "cannot tokenise sentence", "sentence": s, "location": "analyse_sentence" }] analysis_lists = [] i = 0 for token in tokens: i += 1 analysis_lists[i] += [self.analyse(token)] if self.can_udpipe: # N.B: I used the vertical input here udinput = '\n'.join([token["surf"] for token in tokens]) uds = self.udpipe(udinput) if len(uds) == len(analysis_lists): for i in range(len(uds)): analsysis_lists[i] += [uds[i]] return None def _guess_str(self, s): token = {"surf": s} return self._guess_token(token) def _guess_token(self, token): res = self.guesser.lookup(token['surf']) guesses = [] for r in res: guesstoken = token.copy() guesstoken['anal'] = r[0] + '[GUESS=FSA][WEIGHT=%f]' % (r[1]) guesstoken['weight'] = float(r[1]) guesstoken['guess'] = 'FSA' guesses += [guesstoken] return guesses def _guess_heuristic(self, token): '''Heuristic guessing function written fully in python. This should always be the most simple basic backoff, e.g. noun singular nominative for everything. ''' guesstoken = token.copy() # woo advanced heuristics!! if token['surf'][0].isupper() and len(token['surf']) > 1: guesstoken['anal'] = '[WORD_ID=' + token['surf'] +\ "][UPOS=PROPN][NUM=SG][CASE=NOM][GUESS=HEUR]" +\ "[WEIGHT=%f]" %( self._penalty ) guesstoken['weight'] = self._penalty guesstoken['guess'] = 'PYTHON0ISUPPER' else: guesstoken['anal'] = '[WORD_ID=' + token['surf'] +\ "][UPOS=NOUN][NUM=SG][CASE=NOM][GUESS=HEUR]" +\ "[WEIGHT=%f]" %( self._penalty ) guesstoken['weight'] = self._penalty guesstoken['guess'] = 'PYTHONELSE' return [guesstoken] def guess(self, token): '''Speculate morphological analyses of OOV token. This method may use multiple information sources, but not the actual analyser. Therefore a typical use of this is after the analyse(token) function has failed. Note that some information sources perform badly when guessing without context, for these the analyse_sentence(sent) is the only option. 
''' realtoken = self._maybe_str2token(token) guesses = self._guess_heuristic(realtoken) if self.can_udpipe: guesses += [self._udpipe(realtoken['surf'])] if self.can_guess: guesses += self._guess_token(realtoken) return guesses def _lemmatise(self, token): res = self.lemmatiser.lookup(token['surf']) lemmas = [] for r in res: lemmatoken = token.copy() lemmatoken['lemma'] = r[0] lemmatoken['lemmaweight'] = float(r[1]) lemmas += [lemmatoken] return lemmas def lemmatise(self, token): '''Lemmatise a token, returning a dictionary ID. Like morphological analysis, can return more than one results, which are possible (combinations of) lexeme ids. If the token is not in the dictionary, the surface form is returned as most likely "lemma". ''' realtoken = self._maybe_str2token(token) lemmas = None lemmas = self._lemmatise(realtoken) if not lemmas or len(lemmas) < 1: lemmatoken = realtoken.copy() lemmatoken['lemma'] = lemmatoken['surf'] lemmatoken['lemmaweight'] = float('inf') lemmas = [lemmatoken] return lemmas def _segment(self, token): res = self.segmenter.lookup(token['surf']) segmenteds = [] for r in res: segmenttoken = token.copy() segmenttoken['segments'] = r[0] segmenttoken['segmentweight'] = float(r[1]) segmenteds += [segmenttoken] return segmenteds def segment(self, token): '''Segment token into morphs, words and other string pieces. The segments come separated by some internal markers for different segment boundaries. ''' realtoken = self._maybe_str2token(token) segments = None segments = self._segment(realtoken) if not segments or len(segments) < 1: segmenttoken = realtoken.copy() segmenttoken['segments'] = segmenttoken['surf'] segments = [segmenttoken] return segments def _labelsegment(self, token): res = self.labelsegmenter.lookup(token['surf']) lss = [] for r in res: lstoken = token.copy() lstoken['labelsegments'] = r[0] lstoken['lsweight'] = float(r[1]) lss += [lstoken] return lss def labelsegment(self, token): '''Segment token into labelled morphs, words and other string pieces. The segments are suffixed with their morphologically relevant informations, e.g. lexical classes for root lexemes and inflectional features for inflectional segments. This functionality is experimental due to hacky way it was patched together. ''' realtoken = self._maybe_str2token(token) labelsegments = None labelsegments = self._labelsegment(realtoken) if not labelsegments or len(labelsegments) < 1: lstoken = realtoken.copy() lstoken['labelsegments'] = lstoken['surf'] lstoken['lsweight'] = float('inf') labelsegments = [lstoken] return labelsegments def _accept(self, token): """Look up token from acceptor model.""" if self.acceptor: res = self.acceptor.lookup(token['surf']) elif self.analyser: res = self.analyser.lookup(token['surf']) else: res = None return res def accept(self, token): '''Check if the token is in the dictionary or not. Returns False for OOVs, True otherwise. Note, that this is not necessarily more efficient than analyse(token) ''' realtoken = self._maybe_str2token(token) accept = False accepts = None accepts = self._accept(realtoken) if accepts and len(accepts) > 0: accept = True else: accept = False return accept def _generate(self, token): res = self.generator.lookup(token['anal']) generations = [] for r in res: g = token.copy() g['surf'] = r[0] g['genweight'] = r[1] generations += [g] return generations def generate(self, omorstring): '''Generate surface forms corresponding given token description. Currently only supports very direct omor style analysis string generation. 
For round-tripping and API consistency you can also feed a token dict here.
        '''
        gentoken = {}
        if isinstance(omorstring, str):
            gentoken['anal'] = omorstring
        elif isinstance(omorstring, dict):
            # for round-tripping
            gentoken = omorstring
        else:
            gentoken = {
                'error': 'token not dict or string',
                'location': 'generate()'
            }
        generated = None
        if self.can_generate:
            generated = self._generate(gentoken)
        if not generated:
            gentoken['surf'] = gentoken['anal']
            gentoken['genweight'] = float('inf')
            generated = [gentoken]
        return generated

    def _udpipe(self, udinput):
        conllus = self.udpipeline.process(udinput, self.uderror)
        if self.uderror.occurred():
            return None
        tokens = []
        for conllu in conllus.split('\n'):
            if conllu.startswith('#'):
                continue
            elif conllu.strip() == '':
                continue
            tokens += [self._conllu2token(conllu)]
        return tokens

    def _conllu2token(self, conllu):
        fields = conllu.split()
        if len(fields) != 10:
            print("conllu2token conllu fail", fields)
        upos = fields[3]
        wordid = fields[2]
        surf = fields[1]
        ufeats = fields[5]
        misc = fields[9]
        analysis = '[WORD_ID=%s][UPOS=%s]%s[GUESS=UDPIPE]' % (
            wordid, upos, self._ufeats2omor(ufeats))
        token = {
            'anal': analysis,
            'misc': misc,
            'upos': upos,
            'surf': surf,
            'ufeats': ufeats
        }
        return token

    def _ufeats2omor(self, ufeats):
        return '[' + ufeats.replace('|', '][') + ']'
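# A minimal, self-contained sketch of what the helpers above produce for one
# CoNLL-U line. The sample line and its values are made up for illustration;
# the conversion mirrors _ufeats2omor/_conllu2token above.

def ufeats2omor(ufeats):
    # 'Case=Ine|Number=Plur' -> '[Case=Ine][Number=Plur]'
    return '[' + ufeats.replace('|', '][') + ']'

sample = "1\ttaloissa\ttalo\tNOUN\tN\tCase=Ine|Number=Plur\t0\troot\t_\t_"
fields = sample.split('\t')
analysis = '[WORD_ID=%s][UPOS=%s]%s[GUESS=UDPIPE]' % (
    fields[2], fields[3], ufeats2omor(fields[5]))
print(analysis)
# [WORD_ID=talo][UPOS=NOUN][Case=Ine][Number=Plur][GUESS=UDPIPE]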
sys.stdout = codecs.getwriter(encoding)(sys.stdout)

if len(sys.argv) < 4:
    sys.stderr.write(
        'Usage: %s input_format(tokenize|conllu|horizontal|vertical) output_format(conllu) model_file\n'
        % sys.argv[0])
    sys.exit(1)

sys.stderr.write('Loading model: ')
model = Model.load(sys.argv[3])
if not model:
    sys.stderr.write("Cannot load model from file '%s'\n" % sys.argv[3])
    sys.exit(1)
sys.stderr.write('done\n')

pipeline = Pipeline(model, sys.argv[1], Pipeline.DEFAULT, Pipeline.DEFAULT, sys.argv[2])
error = ProcessingError()

# Read whole input
text = ''.join(sys.stdin.readlines())

# Process data
processed = pipeline.process(text, error)
if error.occurred():
    sys.stderr.write("An error occurred when running run_udpipe: ")
    sys.stderr.write(error.message)
    sys.stderr.write("\n")
    sys.exit(1)

sys.stdout.write(processed)
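# The script above reads all of stdin before processing. For large inputs the
# pipeline can be fed paragraph by paragraph instead; a sketch assuming the
# same `pipeline` and `error` objects created above (Pipeline.process is
# stateless per call, so chunking does not change the output format).

import sys

def process_paragraphs(pipeline, error, stream=sys.stdin):
    buf = []
    for line in stream:
        if line.strip():
            buf.append(line)
            continue
        if buf:
            out = pipeline.process(''.join(buf), error)
            if error.occurred():
                raise RuntimeError(error.message)
            sys.stdout.write(out)
            buf = []
    if buf:
        out = pipeline.process(''.join(buf), error)
        if error.occurred():
            raise RuntimeError(error.message)
        sys.stdout.write(out)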
class Omorfi: """ An object holding omorfi binaries for all the functions of omorfi. The following functionalities use automata binaries that need to be loaded separately: * analysis * tokenisation * generation * lemmatisation * segmentation * lookup * guess There is python code to perform basic string munging controlled by following bool attributes: try_lowercase: to use `str.lower()` try_titlecase: to use `str[0].upper() + str[1:]` try_uppercase: to use `str.upper()` try_detitlecase: to use `str[0].lower + str[1:]` The annotations will be changed when transformation has been applied. """ #: magic number for penalty weights _penalty = 28021984 def __init__(self, verbosity=False): """Construct Omorfi with given verbosity for printouts.""" self._verbosity = verbosity ## analyser model self.analyser = None ## tokeniser self.tokeniser = None ## generator model self.generator = None ## lemmatising model self.lemmatiser = None ## hyphenating model self.hyphenator = None ## segmenting model self.segmenter = None ## label-segment model self.labelsegmenter = None ## acceptor self.acceptor = None ## guesser model self.guesser = None ## UDPipe model self.udpiper = None ## UDPipeline object :-( self.udpipeline = None ## UDError object :-( self.uderror = None ## database of lexical unigram probabilities self.lexlogprobs = dict() ## database of tag unigram probabilities self.taglogprobs = dict() ## whether to lowercase and re-analyse if needed self.try_lowercase = True ## whether to Titlecase and re-analyse if needed self.try_titlecase = True ## whether to dEtitlecase and re-analyse if needed self.try_detitlecase = True ## whether to dEtitlecase and re-analyse if needed self.try_detitle_firstinsent = True ## whether to UPPERCASE and re-analyse if needed self.try_uppercase = False ## whether accept model is loaded self.can_accept = False ## whether analyser model is loaded self.can_analyse = False ## whether tokenisr model is loaded self.can_tokenise = True ## whether generator model is loaded self.can_generate = False ## whether lemmatising model is loaded self.can_lemmatise = False ## whether hypenation model is loaded self.can_hyphenate = False ## whether segmentation model is loaded self.can_segment = False ## whether label segmentation model is loaded self.can_labelsegment = False ## whether guesser model is loaded self.can_guess = False ## whether UDPipe is loaded self.can_udpipe = False def _load_hfst(self, f): """Load an automaton from file. Args: f: containing single hfst automaton binary. Throws: FileNotFoundError if file is not found """ try: his = libhfst.HfstInputStream(f) return his.read() except libhfst.NotTransducerStreamException: raise IOError(2, f) from None def load_labelsegmenter(self, f): """Load labeled segments model from a file. Args: f: containing single hfst automaton binary. """ self.labelsegmenter = self._load_hfst(f) self.can_labelsegment = True def load_segmenter(self, f): """Load segmentation model from a file. Args: f: containing single hfst automaton binary. """ self.segmenter = self._load_hfst(f) self.can_segment = True def load_analyser(self, f): """Load analysis model from a file. Args f: containing single hfst automaton binary. """ self.analyser = self._load_hfst(f) self.can_analyse = True self.can_accept = True self.can_lemmatise = True def load_generator(self, f): """Load generation model from a file. Args: f: containing single hfst automaton binary. 
""" self.generator = self._load_hfst(f) self.can_generate = True def load_acceptor(self, f): """Load acceptor model from a file. Args: f: containing single hfst automaton binary. """ self.acceptor = self._load_hfst(f) self.can_accept = True def load_tokeniser(self, f): """Load tokeniser model from a file. Args: f: containing single hfst automaton binary. """ self.tokeniser = self._load_hfst(f) self.can_tokenise = True def load_lemmatiser(self, f): """Load lemmatiser model from a file. Args: f: containing single hfst automaton binary. """ self.tokeniser = self._load_hfst(f) self.can_lemmatise = True def load_hyphenator(self, f): """Load hyphenator model from a file. Args: f: containing single hfst automaton binary. """ self.hyphenator = self._load_hfst(f) self.can_hyphenate = True def load_guesser(self, f): """Load guesser model from a file. Args: f: containing single hfst automaton binary. """ self.guesser = self._load_hfst(f) self.can_guess = True def load_udpipe(self, filename: str): """Load UDPipe model for statistical parsing. UDPipe can be used as extra information source for OOV symbols or all tokens. It works best with sentence-based analysis, token based does not keep track of context. @param filename path to UDPipe model """ if not can_udpipe: print("importing udpipe failed, cannot load udpipe xxx") return self.udpiper = Model.load(filename) # use pipeline for now, ugly but workable self.udpipeline = Pipeline(self.udpiper, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') self.uderror = ProcessingError() ## udpipe is loaded self.can_udpipe = True def load_lexical_frequencies(self, lexfile): """Load a frequency list for lemmas. Experimental. Currently in uniq -c format, subject to change. Args: lexfile: file with frequencies. """ lextotal = 0 lexcounts = dict() for line in lexfile: fields = line.split('\t') lexcount = int(fields[0]) lexcounts[fields[1]] = lexcount lextotal += lexcount for lex, freq in lexcounts.items(): if freq != 0: self.lexlogprobs[lex] = log(freq / lextotal) else: # XXX: hack hack, should just use LM count stuff with # discounts self.lexlogprobs[lex] = log(1 / (lextotal + 1)) def load_omortag_frequencies(self, omorfile): """Load a frequenc list for tags. Experimental. Currently in uniq -c format. Subject to change. Args: omorfile: path to file with frequencies. """ omortotal = 0 omorcounts = dict() for line in omorfile: fields = line.split('\t') omorcount = int(fields[0]) omorcounts[fields[1]] = omorcount omortotal += omorcount for omor, freq in omorcounts.items(): if freq != 0: self.taglogprobs[omor] = log(freq / omortotal) else: # XXX: hack hack, should just use LM count stuff with # discounts self.taglogprobs[omor] = log(1 / (omortotal + 1)) def _find_retoken_recase(self, token: Token): """Checks if token is acceptable when case is ignored. Used case-ignorations depend on the settings. Args: token: token to recase Returns: recased token or False if no retoken is possible """ if self.accept(token): return token if len(token.surf) > 1: # we test len to just use 1: slice... 
if self.try_titlecase and not token.surf[0].isupper(): if self.accept( Token(token.surf[0].upper() + token.surf[1:].lower())): return token if self.try_detitlecase and not token.surf[0].islower(): if self.accept(Token(token.surf[0].lower() + token.surf[1:])): return token if self.try_lowercase: if self.accept(Token(token.surf.lower())): return token if self.try_uppercase: if self.accept(Token(token.surf.upper())): return token return False def _find_retokens(self, token: Token): """Finds list of acceptable sub-tokens from a token. Tries to strip punct tokens from left and right. Args: token: token to retokenise Returns: list of tokens giving best retokenisation """ retoken = self._find_retoken_recase(token) if retoken: return [retoken] pretokens = [] posttokens = [] for i in range(4): for j in range(4): if len(token.surf) > (i + j): if j == 0: resurf = token.surf[i:] else: resurf = token.surf[i:-j] presurfs = token.surf[:i] postsurfs = token.surf[-j:] pretrailpuncts = True for c in presurfs: if c in fin_punct_leading: pretoken = Token(c) pretoken.spaceafter = 'No' pretokens.append(pretoken) else: pretrailpuncts = False break for c in postsurfs: if c in fin_punct_trailing: posttoken = Token(c) posttoken.spacebefore = 'No' posttokens.append(posttoken) else: pretrailpuncts = False break if not pretrailpuncts: continue retoken = Token(resurf) reretoken = self._find_retoken_recase(retoken) if reretoken: return pretokens + [reretoken] + posttokens # no acceptable substring inside, just strip puncts return [token] def _retokenise(self, tokens: list): """Takes list of string from and produces list of tokens. May change number of tokens. Should be used with result of split(). Args: tokens: list of tokens to retokenise Returns: list of tokens representing best tokenisations of tokens """ retokens = [] for s in tokens: token = Token(s) for retoken in self._find_retokens(token): retokens.append(retoken) return retokens def fsa_tokenise(self, line: str): """Tokenise with FSA. Args: line: string to tokenise Todo: Not implemented (needs pmatch python support) """ return None def python_tokenise(self, line: str): """Tokenise with python's basic string functions. Args: line: string to tokenise """ return self._retokenise(line.split()) def tokenise(self, line: str): """Perform tokenisation with loaded tokeniser if any, or `split()`. If tokeniser is available, it is applied to input line and if result is achieved, it is split to tokens according to tokenisation strategy and returned as a list. If no tokeniser are present, or none give results, the line will be tokenised using python's basic string functions. If analyser is present, tokeniser will try harder to get some analyses for each token using hard-coded list of extra splits. Args: line: a string to be tokenised, should contain a line of text or a sentence Returns: A list of tokens based on the line. List may include boundary non-tokens if e.g. sentence boundaries are recognised. For empty line a paragraph break non-token may be returned. """ tokens = None if self.tokeniser: tokens = self.fsa_tokenise(line) if not tokens: tokens = self.python_tokenise(line) return tokens def _analyse(self, token: Token): '''Analyse token using HFST and perform recasings. 
Args: token: token to analyse''' # use real surface case newanals = list() res = self.analyser.lookup(token.surf) for r in res: omor = r[0] + '[WEIGHT=%f]' % (r[1]) weight = r[1] newanals.append(Analysis.fromomor(omor, weight)) if token.pos == 1 and token.surf[0].isupper()\ and len(token.surf) > 1: res = self.analyser.lookup(token.surf[0].lower() + token.surf[1:]) for r in res: omor = r[0] + '[WEIGHT=%f]' % (r[1]) weight = r[1] newanals.append(Analysis.fromomor(omor, weight)) for a in newanals: token.analyses.append(a) return newanals def analyse(self, token: Token): """Perform a simple morphological analysis lookup. The analysis will be performed for re-cased variants based on the state of the member variables. The re-cased analyses will have more penalty weight and additional analyses indicating the changes. Side-Effects: The analyses are stored in the token, and only the new analyses are returned. Args: token: token to be analysed. Returns: An HFST structure of raw analyses, or None if there are no matches in the dictionary. """ if isinstance(token, str): token = Token(token) anals = self._analyse(token) if not anals: omor = '[WORD_ID=' + token.surf + '][UPOS=X]' +\ '[GUESS=UNKNOWN][WEIGHT=inf]' weight = float('inf') anal = Analysis.fromomor(omor, weight) anal.manglers.append("GUESSER=NONE") token.analyses.append(anal) return [anal] return anals def analyse_sentence(self, s): """Analyse a full sentence with tokenisation and guessing. for details of tokenisation, see @c tokenise(self, s). for details of analysis, see @c analyse(self, token). If further models like udpipe are loaded, may fill in gaps with that. """ tokens = self.tokenise_sentence(s) if not tokens: errortoken = Token() errortoken.nontoken = "error" errortoken.error = "cannot tokenise sentence" errortoken.comment = { "sentence": s, "location": "analyse_sentence" } return [errortoken] analysis_lists = [] i = 0 for token in tokens: i += 1 analysis_lists[i] += [self.analyse(token)] if self.can_udpipe: # N.B: I used the vertical input here udinput = '\n'.join([token.surf for token in tokens]) uds = self._udpipe(udinput) if len(uds) == len(analysis_lists): for i, ud in enumerate(uds): analysis_lists[i] += [ud] return None def _guess_token(self, token: Token): '''Guess token reading using language models. Args: token: token to guess''' res = self.guesser.lookup(token.surf) for r in res: anal = r[0] + '[GUESS=FSA][WEIGHT=%f]' % (r[1]) weight = float(r[1]) guess = Analysis.fromomor(anal, weight) guess.manglers.append("GUESSER=FSA") token.analyses.append(guess) return res def _guess_heuristic(self, token: Token): '''Heuristic guessing function written fully in python. This is kind of last resort, but has some basic heuristics that may be always useful. Args: token: token to guess Returns: list: new analyses guessed ''' # woo advanced heuristics!! 
newanals = list() s = token.surf trieds = {s} if len(s) > 2 and (s[0].islower() or s[1].isupper()) and \ self.try_titlecase: tcs = s[0].upper() + s[1:].lower() if tcs not in trieds: tcres = self.analyser.lookup(tcs) for r in tcres: mangler = 'Titlecased' omor = r[0] + \ '[CASECHANGE=TITLECASED]' + \ '[WEIGHT=%f]' % (r[1] + self._penalty) weight = r[1] + self._penalty anal = Analysis.fromomor(omor, weight) anal.manglers.append(mangler) anal.analsurf = tcs newanals.append(anal) trieds.add(tcs) if len(s) > 2 and s[0].isupper() and self.try_detitlecase: dts = s[0].lower() + s[1:] if dts not in trieds: dtres = self.analyser.lookup(dts) for r in dtres: mangler = 'dETITLECASED' omor = r[0] + \ "[CASECHANGE=DETITLECASED]" + \ "[WEIGHT=%f]" % (r[1] + self._penalty) weight = r[1] if token.pos != 1: weight += self._penalty anal = Analysis.fromomor(omor, weight) anal.manglers.append(mangler) anal.analsurf = dts newanals.append(anal) trieds.add(dts) if not s.isupper() and self.try_uppercase: ups = s.upper() if ups not in trieds: upres = self.analyser.lookup(ups) for r in upres: mangler = 'UPPERCASED' omor = r[0] + \ "[CASECHANGE=UPPERCASED]" + \ "[WEIGHT=%f]" % (r[1] + self._penalty) weight = r[1] + self._penalty anal = Analysis.fromomor(omor, weight) anal.manglers.append(mangler) anal.analsurf = ups newanals.append(anal) trieds.add(ups) if not s.islower() and self.try_lowercase: lows = s.lower() if lows not in trieds: lowres = self.analyser.lookup(lows) for r in lowres: mangler = 'lowercased' omor = r[0] +\ "[CASECHANGE=LOWERCASED]" + \ "[WEIGHT=%f]" % (r[1] + self._penalty) weight = r[1] + self._penalty anal = Analysis.fromomor(omor, weight) anal.manglers.append(mangler) anal.analsurf = lows newanals.append(anal) trieds.add(lows) if not newanals: if len(token.surf) == 1: omor = '[WORD_ID=' + token.surf +\ "][UPOS=SYM][GUESS=HEUR]" +\ "[WEIGHT=%f]" % (self._penalty) weight = self._penalty guess = Analysis.fromomor(omor, weight) guess.manglers.append('GUESSER=PYTHON_LEN1') newanals.append(guess) elif token.surf[0].isupper() and len(token.surf) > 1: omor = '[WORD_ID=' + token.surf +\ "][UPOS=PROPN][NUM=SG][CASE=NOM][GUESS=HEUR]" +\ "[WEIGHT=%f]" % (self._penalty) weight = self._penalty guess = Analysis.fromomor(omor, weight) guess.manglers.append('GUESSER=PYTHON_0ISUPPER') newanals.append(guess) else: omor = '[WORD_ID=' + token.surf +\ "][UPOS=NOUN][NUM=SG][CASE=NOM][GUESS=HEUR]" +\ "[WEIGHT=%f]" % (self._penalty) weight = self._penalty guess = Analysis.fromomor(omor, weight) guess.manglers.append('GUESSER=PYTHON_ELSE') newanals.append(guess) for anal in newanals: token.analyses.append(anal) return newanals def guess(self, token: Token): '''Speculate morphological analyses of OOV token. This method may use multiple information sources, but not the actual analyser. Therefore a typical use of this is after the analyse(token) function has failed. Note that some information sources perform badly when guessing without context, for these the analyse_sentence(sent) is the only option. Side-effect: This operation stores guesses in token for future use as well as returning them. Args: token: token to analyse with guessers. Returns: New guesses as a list of Analysis objects. 
''' guesses = self._guess_heuristic(token) if self.can_udpipe: guesses += [self._udpipe(token.surf)] if self.can_guess: guesses += self._guess_token(token) return guesses def _lemmatise(self, token): res = self.lemmatiser.lookup(token.surf) newlemmas = list() for r in res: lemma = r[0] weight = float(r[1]) anal = Analysis() anal.raw = lemma anal.rawtype = "lemma" anal.weight = weight newlemmas.append(anal) for lemma in newlemmas: token.lemmatisations.append(lemma) return newlemmas def lemmatise(self, token: Token): '''Lemmatise token, splitting it into valid word id's from lexical db. Side-effect: This operation stores lemmas in the token for future use and only returns HFST structures. Use Token's method's to retrieve tokens in pythonic structures. Args: token: token to lemmatise Returns: New lemmas in analysis list ''' lemmas = None lemmas = self._lemmatise(token) if not lemmas or len(lemmas) < 1: lemma = token.surf weight = float('inf') guess = Analysis() guess.raw = lemma guess.rawtype = "lemma" guess.ewight = weight guess.manglers.append("GUESSER=SURFISLEMMA") token.lemmatisations.append(guess) lemmas = [guess] return lemmas def _segment(self, token: Token): '''Intenal segmenting using HFST automaton. Args: token: token to segment.''' res = self.segmenter.lookup(token.surf) newsegs = list() for r in res: segments = r[0] weight = float(r[1]) anal = Analysis() anal.raw = segments anal.weight = weight anal.rawtype = "segments" newsegs.append(anal) for ns in newsegs: token.segmentations.append(ns) return newsegs def segment(self, token: Token): '''Segment token into morphs, words and other string pieces. Side-effect: this operation stores segments in the token for future use and only returns the HFST structures. To get pythonic data use Token's methods afterwards. Args: token: token to segment Returns: New segmentations in analysis list ''' segments = None segments = self._segment(token) if not segments or len(segments) < 1: segments = token.surf weight = float('inf') guess = Analysis() guess.raw = segments guess.weight = weight guess.rawtype = "segments" guess.manglers.append("GUESSER=SURFISSEGMENT") token.segmentations.append(guess) segments = [guess] return segments def _labelsegment(self, token: Token): '''Internal implementation of segment label lookup with FSA. Args: token: token to analyse Returns: list of new labelsegment analyses.''' res = self.labelsegmenter.lookup(token.surf) newlabels = list() for r in res: labelsegments = r[0] weight = float(r[1]) anal = Analysis() anal.raw = labelsegments anal.weight = weight anal.rawtype = "labelsegments" newlabels.append(anal) for ls in newlabels: token.labelsegmentations.append(ls) return newlabels def labelsegment(self, token: Token): '''Segment token into labelled morphs, words and other string pieces. The segments are suffixed with their morphologically relevant informations, e.g. lexical classes for root lexemes and inflectional features for inflectional segments. This functionality is experimental due to hacky way it was patched together. Side-effect: Note that this operation stores the labelsegments in the token for future use, and only returns raw HFST structures. To get pythonic you can use Token's methods afterwards. Args: token: token to segment with labels Returns: New labeled segemntations in analysis list. 
''' labelsegments = None labelsegments = self._labelsegment(token) if not labelsegments or len(labelsegments) < 1: labelsegments = token.surf + "|UNK" lsweight = float('inf') guess = Analysis() guess.raw = labelsegments guess.weight = lsweight guess.rawtype = "labelsegments" guess.manglers.append("GUESSER=SURFISLABELS") token.labelsegmentations.append(guess) labelsegments = [guess] return labelsegments def _accept(self, token: Token): """Look up token from acceptor model. Args: token: token to accept Returns: analyses of token""" if self.acceptor: res = self.acceptor.lookup(token.surf) elif self.analyser: res = self.analyser.lookup(token.surf) else: res = None return res def accept(self, token): '''Check if the token is in the dictionary or not. Returns: False for OOVs, True otherwise. Note, that this is not necessarily more efficient than bool(analyse(token)) ''' return bool(self._accept(token)) def _generate(self, s: str): '''Generate surface forms from string using FSA model. Args: s: string matching raw omor analysis Returns: string containing surface forms ''' res = self.generator.lookup(s) generations = [] for r in res: generations += [r[0]] return "/".join(generations) def generate(self, omorstring: str): '''Generate surface forms corresponding given token description. Currently only supports very direct omor style analysis string generation. Args: omorstring: Omorfi analysis string to generate Returns A surface string word-form, or the omorstring argument if generation fails. Or None if generator is not loaded. ''' generated = None if self.can_generate: generated = self._generate(omorstring) if not generated: return omorstring return generated def _udpipe(self, udinput: str): """Pipes input to udpipe model. Args: udinput: input for udpipe Returns: tokens with udpipe analyses """ conllus = self.udpipeline.process(udinput, self.uderror) if self.uderror.occurred(): return None tokens = [] for conllu in conllus.split('\n'): if conllu.startswith('#'): continue elif conllu.strip() == '': continue tokens += [Token.fromconllu(conllu)] return tokens def tokenise_sentence(self, sentence: str): '''tokenise a sentence. To be used when text is already sentence-splitted. If the text is plain text with sentence boundaries within lines, use Args: sentence: a string containing one sentence Returns: list of tokens in sentence ''' if not sentence or sentence == '': token = Token() token.nontoken = "separator" token.comment = '' return [token] tokens = self.tokenise(sentence) pos = 1 for token in tokens: token.pos = pos pos += 1 return tokens def tokenise_plaintext(self, f): '''tokenise a whole text. Args: f: filelike object with iterable strings Returns: list of tokens ''' tokens = list() for line in f: tokens = self.tokenise(line.strip()) pos = 1 for token in tokens: token.pos = pos pos += 1 sep = Token() sep.nontoken = "separator" tokens.append(sep) return tokens eoft = Token() eoft.nontoken = "eof" tokens.append(eoft) return tokens def tokenise_conllu(self, f): '''tokenise a conllu sentence or comment. Should be used a file-like iterable that has CONLL-U sentence or comment or empty block coming up. 
Args: f: filelike object with iterable strings Returns: list of tokens ''' tokens = list() for line in f: fields = line.strip().split('\t') token = Token() if len(fields) != 10: if line.startswith('#'): token.nontoken = "comment" token.comment = line.strip() tokens.append(token) return tokens elif line.strip() == '': token.nontoken = "separator" token.comment = '' tokens.append(token) return tokens else: token.nontoken = "error" token.error = line.strip() tokens = [token] return tokens token._conllu = fields try: index = int(fields[0]) except ValueError: if '-' in fields[0]: # MWE continue elif '.' in fields[0]: # a ghost continue else: print("Cannot figure out token index", fields[0], file=stderr) exit(1) token.pos = index token.surf = fields[1] if fields[9] != '_': miscs = fields[9].split('|') for misc in miscs: k, v = misc.split('=') if k == 'SpaceAfter': token.spaceafter = v elif k in [ 'Alt', 'FTB-PronType', 'FTB-Rel', 'Missed-Rel', 'FTB-rel', 'Join', 'Missed-SUBCAT', 'FTB-Sub', 'Prefix', 'FTB1-InfForm', 'Missed-POSITION' ]: # FTB stuff pass else: print("Unknown MISC", k, file=stderr) exit(1) tokens.append(token) eoft = Token() eoft.nontoken = "eof" tokens.append(eoft) return tokens def tokenise_vislcg(self, f): '''Tokenises a sentence from VISL-CG format data. Returns a list of tokens when it hits first non-token block, including a token representing this non-token block. Args: f: filelike object to itrate strings of vislcg data Returns: list of tokens ''' tokens = list() pos = 1 for line in f: token = Token() line = line.strip() if not line or line == '': token.nontoken = "separator" token.comment = '' tokens.append(token) return tokens elif line.startswith("#") or line.startswith("<"): # # comment, or # <TAG> </TAG> token.nontoken = "comment" token.comment = line.strip() tokens.append(token) return tokens elif line.startswith('"<') and line.endswith('>"'): # "<surf>" token = Token() token.surf = line[2:-2] tokens.append(token) pos += 1 elif line.startswith('\t"'): # \t"lemma" ANAL ANAL ANAL fields = line.strip().split() token.lemma = fields[0].strip('"') elif line.startswith(';\t"'): # ;\t"lemma" ANAL ANAL ANAL KEYWORD:rulename token.nontoken = "gold" token.comment = line.strip() else: token.nontoken = "error" token.error = 'vislcg: ' + line.strip() eoft = Token() eoft.nontoken = "eof" tokens.append(eoft) return tokens
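# A short usage sketch for the Omorfi class above: load an analyser automaton,
# tokenise one sentence and analyse each token. The .hfst path is hypothetical;
# load_analyser, tokenise_sentence and analyse are the methods defined above,
# and analyse() is assumed to store its results on token.analyses as documented.

omorfi = Omorfi(verbosity=True)
omorfi.load_analyser('/usr/share/omorfi/omorfi.analyse.hfst')  # assumed install path

for token in omorfi.tokenise_sentence('Kissa istui matolla.'):
    omorfi.analyse(token)
    print(token.surf, len(token.analyses), 'analyses')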
class UDpipeparser():
    def __init__(self, modeldir):
        self.modeldir = os.path.abspath(modeldir)
        self.model = Model.load(self.modeldir)
        self.error = ProcessingError()
        if not self.model:
            sys.stderr.write("Udpipe language model loading failed: " + self.modeldir)
            sys.exit(1)
        self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                                 Pipeline.DEFAULT, "conllu")

    def udpipe_parse_sent(self, text):
        logger = logging.getLogger('petr_log')
        processed = self.pipeline.process(text, self.error)
        if self.error.occurred():
            raise ValueError(self.error.message)
        parsed = []
        for line in processed.split("\n"):
            if line.startswith("#"):
                continue
            # transliterate only lines that do not contain a VERB token
            if "VERB\t" not in line:
                line = unidecode.unidecode(line)
            parsed.append(line)
        return "\n".join(parsed)

    def udpipe_parse_events(self, events):
        logger = logging.getLogger('petr_log')
        total = len(list(events.keys()))
        logger.info('Starting parse of {} stories.'.format(total))
        for i, key in enumerate(events.keys()):
            if (i / float(total)) * 100 in [10.0, 25.0, 50, 75.0]:
                print('Parse is {}% complete...'.format((i / float(total)) * 100))
            for sent in events[key]['sents']:
                logger.info('Udpipe parsing {}_{}...'.format(key, sent))
                sent_dict = events[key]['sents'][sent]
                processed = self.pipeline.process(sent_dict['content'], self.error)
                if self.error.occurred():
                    raise ValueError(self.error.message)
                parsed = []
                for line in processed.split("\n"):
                    if line.startswith("#"):
                        continue
                    parsed.append(line)
                sent_dict['parsed'] = utilities._format_ud_parsed_str("\n".join(parsed))
        logger.info('Done with UDpipe parse.')
        return events
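# A hedged usage sketch for UDpipeparser above. The .udpipe model path is
# hypothetical, and the module's own imports (os, sys, logging, unidecode,
# utilities) are assumed to be available. udpipe_parse_sent() returns the
# CoNLL-U token lines with the comment lines stripped, as implemented above.

import logging
logging.basicConfig(level=logging.INFO)

parser = UDpipeparser('/path/to/english-ewt.udpipe')  # hypothetical model path
print(parser.udpipe_parse_sent('The committee criticized the proposal.'))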
class Omorfi: """ An object holding omorfi binariesfor all the functions of omorfi. The following functionalities use automata binaries that need to be loaded separately: * analysis * tokenisation * generation * lemmatisation * segmentation * lookup * guess There is python code to perform basic string munging controlled by following bool attributes: try_lowercase: to use `str.lower()` try_titlecase: to use `str[0].upper() + str[1:]` try_uppercase: to use `str.upper()` try_detitlecase: to use `str[0].lower + str[1:]` The annotations will be changed when transformation has been applied. """ analyser = None tokeniser = None generator = None lemmatiser = None hyphenator = None segmenter = None labelsegmenter = None acceptor = None guesser = None udpiper = None udpipeline = None uderror = None try_lowercase = True try_titlecase = True try_detitlecase = True try_uppercase = False can_analyse = False can_tokenise = True can_generate = False can_lemmatise = False can_hyphenate = False can_segment = False can_labelsegment = False can_guess = False can_udpipe = False _verbosity = False _stdpaths = [ '/usr/local/share/hfst/fi/', '/usr/share/hfst/fi/', '/usr/local/share/omorfi/', '/usr/share/omorfi/', './', 'generated/', 'src/generated/', '../src/generated/' ] def __init__(self, verbosity=False): """Construct Omorfi with given verbosity for printouts.""" self._verbosity = verbosity def load_filename(self, path, **include): """Load omorfi automaton from filename and guess its use. A file name should consist of three parts separated by full stop. The second part must be a keyword describing the use of the automaton, first part is parsed as an identifier typically starting with the word omorfi, followed by any extras, such as the tagset for analysis or generation. The named arguments can include a name of automaton type as name, and truth value as value, for types of automata allowed to load. By default, the names `analyse`, `generate` and `segment` are loaded. Names not included are defaulted to False. E.g., `omorfi.load_filename(fn, analyse=True)` will only load file named fn if it can be identified as omorfi analyser. This is best used in conjunction with omorfi.load_from_dir. 
""" if len(include) == 0: include['analyse'] = True include['generate'] = True include['segment'] = True include['accept'] = True for ttype in [ 'analyse', 'generate', 'accept', 'tokenise', 'lemmatise', 'hyphenate', 'segment', 'labelsegment', 'guesser', 'udpipe' ]: if ttype not in include: include[ttype] = False his = None if self._verbosity: print('Opening file', path) if access(path, F_OK): his = libhfst.HfstInputStream(path) else: # FIXME: should fail if self._verbosity: print('No access to ', path, file=stderr) pass parts = path[path.rfind('/') + 1:path.rfind('.')].split('.') if len(parts) != 2: if self._verbosity: print('not loaded', path) elif not parts[0] == 'omorfi': if self._verbosity: print('not omorfi', path) elif parts[1] == 'analyse' and include['analyse']: if self._verbosity: print('analyser', parts[0]) self.analyser = his.read() self.can_analyse = True self.can_accept = True self.can_lemmatise = True elif parts[1] == 'generate' and include['generate']: if self._verbosity: print('generator', parts[0]) self.generator = his.read() self.can_generate = True elif parts[1] == 'accept' and include['accept']: if self._verbosity: print('acceptor', parts[0]) self.acceptor = his.read() self.can_accept = True elif parts[1] == 'tokenise' and include['tokenise']: if self._verbosity: print('tokeniser', parts[0]) self.tokeniser = his.read() self.can_tokenise = True elif parts[1] == 'lemmatise' and include['lemmatise']: if self._verbosity: print('lemmatiser', parts[0]) self.lemmatiser = his.read() self.can_lemmatise = True elif parts[1] == 'hyphenate' and include['hyphenate']: if self._verbosity: print('hyphenator', parts[0]) self.hyphenator = his.read() self.can_hyphenate = True elif parts[1] == 'segment' and include['segment']: if self._verbosity: print('segmenter', parts[0]) self.segmenter = his.read() self.can_segment = True elif parts[1] == 'guesser' and include['guesser']: if self._verbosity: print('guesser', parts[0]) self.guesser = his.read() self.can_guess = True elif parts[1] == 'labelsegment' and include['labelsegment']: if self._verbosity: print('labelsegmenter', parts[0]) self.labelsegmenter = his.read() self.can_segment = True elif self._verbosity: print('skipped', parts) def _maybe_str2token(self, s): if isinstance(s, str): return (s, "") else: return s def load_from_dir(self, path=None, **include): """Load omorfi automata from given or known locations. If path is given it should point to directory of automata, otherwise standard installation paths are tried. Currently standard linux install paths are all globbed in following order: * /usr/local/share/hfst/fi/*.hfst * /usr/share/hfst/fi/*.hfst * /usr/local/share/omorfi/*.hfst * /usr/share/omorfi/*.hfst * getenv('HOME') + /.hfst/fi/*.hfst * getenv('HOME') + /.omorfi/*.hfst Last two paths require getenv('HOME'). All automata matching glob *.hfst are loaded and stored in part of omorfi class appropriate for their usage. They keyword args can be used to limit loading of automata. The name is analyser type and value is True. 
""" homepaths = [] if getenv('HOME'): home = getenv('HOME') homepaths = [home + '/.hfst/fi/', home + '/.omorfi/'] loadable = [] if path: if self._verbosity: print('adding', path + '/*.hfst') loadable = glob(path + '/*.hfst') else: for sp in self._stdpaths + homepaths: if self._verbosity: print('adding', sp + '/*.hfst') loadable += glob(sp + '/*.hfst') for filename in loadable: try: self.load_filename(filename, **include) except: print("broken HFST", filename, file=stderr) def load_udpipe(self, filename): if not can_udpipe: print("importing udpipe failed, cannot load udpipe xxx") return self.udpiper = Model.load(filename) # use pipeline for now, ugly but workable self.udpipeline = Pipeline(self.udpiper, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') self.uderror = ProcessingError() self.can_udpipe = True def _find_retoken_recase(self, token): if self.accept(token): return (token, "ORIGINALCASE") if self.try_lowercase and self.accept(token.lower()): return (token.lower(), "LOWERCASED=" + token) if self.try_uppercase and self.accept(token.upper()): return (token.upper(), "UPPERCASED=" + token) if len(token) > 1: if self.try_titlecase and self.accept(token[0].upper() + token[1:]): return (token[0].upper() + token[1:], "TITLECASED=" + token) if self.try_detitlecase and self.accept(token[0].lower() + token[1:]): return (token[0].lower() + token[1:], "DETITLECASED=" + token) return False def _find_retokens(self, token): retoken = self._find_retoken_recase(token) if retoken: return [retoken] # Word. if token[-1] in fin_punct_trailing: retoken = self._find_retoken_recase(token[:-1]) if retoken: return [(retoken[0], retoken[1] + "|SpaceAfter=No"), (token[-1], "SpaceBefore=No")] # -Word if token[0] in fin_punct_leading: retoken = self._find_retoken_recase(token[1:]) if retoken: return [(token[0], "SpaceAfter=No"), (retoken[0], retoken[1] + "|SpaceBefore=No")] # "Word" if token[0] in fin_punct_leading and token[-1] in fin_punct_trailing: retoken = self._find_retoken_recase(token[1:-1]) if retoken: return [(token[0], "SpaceAfter=No"), (retoken[0], retoken[1] + "|SpaceBefore=No|SpaceAfter=No"), (token[-1], "SpaceBefore=No")] # word." or word", if len(token) > 2 and token[-1] in fin_punct_trailing and token[ -2] in fin_punct_trailing: retoken = self._find_retoken_recase(token[:-2]) if retoken: return [(retoken[0], retoken[1] + "|SpaceAfter=No"), (token[-2], "SpaceBefore=No|SpaceAfter=No"), (token[-1], "SpaceBefore=No")] # word.", if len(token) > 3 and token[-1] in fin_punct_trailing and token[ -2] in fin_punct_trailing and token[-3] in fin_punct_trailing: retoken = self._find_retoken_recase(token[:-3]) if retoken: return [(retoken[0], retoken[1] + "|SpaceAfter=No"), (token[-3], "SpaceBefore=No|SpaceAfter=No"), (token[-2], "SpaceBefore=No|SpaceAfter=No"), (token[-1], "SpaceBefore=No")] # "word." 
if len(token) > 3 and token[-1] in fin_punct_trailing and token[ -2] in fin_punct_trailing and token[0] in fin_punct_leading: retoken = self._find_retoken_recase(token[1:-2]) if retoken: return [(token[0], "SpaceAfter=No"), (retoken[0], retoken[1] + "|SpaceBefore=No|SpaceAfter=No"), (token[-2], "SpaceBefore=No|SpaceAfter=No"), (token[-1], "SpaceBefore=No")] # "word.", if len(token) > 4 and token[-1] in fin_punct_trailing and token[ -2] in fin_punct_trailing and token[ -3] in fin_punct_trailing and token[0] in fin_punct_leading: retoken = self._find_retoken_recase(token[1:-3]) if retoken: return [(token[0], "SpaceAfter=No"), (retoken[0], retoken[1] + "|SpaceBefore=No|SpaceAfter=No"), (token[-3], "SpaceBefore=No|SpaceAfter=No"), (token[-2], "SpaceBefore=No|SpaceAfter=No"), (token[-1], "SpaceBefore=No")] # ...non-word... pretokens = [] posttokens = [] while len(token) > 1 and token[-1] in fin_punct_trailing: posttokens += [(token[-1], "SpaceBefore=No")] token = token[:-1] while len(token) > 1 and token[0] in fin_punct_leading: pretokens += [(token[0], "SpaceAfter=No")] token = token[1:] return pretokens + \ [(token, "SpaceBefore=No|SpaceAfter=No")] + \ posttokens def _retokenise(self, tokens): retokens = [] for token in tokens: for retoken in self._find_retokens(token): retokens.append(retoken) return retokens def _tokenise(self, line): return None def tokenise(self, line): """Perform tokenisation with loaded tokeniser if any, or `split()`. If tokeniser is available, it is applied to input line and if result is achieved, it is split to tokens according to tokenisation strategy and returned as a list. If no tokeniser are present, or none give results, the line will be tokenised using python's basic string functions. If analyser is present, tokeniser will try harder to get some analyses for each token using hard-coded list of extra splits. """ tokens = None if self.tokeniser: tokens = self._tokenise(line) if not tokens: tokens = self._retokenise(line.split()) return tokens def _analyse_str(self, s): token = (s, "") res = self._analyse_token(token) if len(s) > 2 and s[0].islower() and self.try_titlecase: tcs = s[0].upper() + s[1:] if s != tcs: tctoken = (tcs, 'TitleCased=' + s) tcres = self._analyse_token(tctoken) for r in tcres: r = (r[0] + '[CASECHANGE=TITLECASED]', r[1]) res = res + tcres if len(token) > 2 and token[0].isupper() and self.try_detitlecase: dts = s[0].lower() + s[1:] if dts != s: dttoken = (dts, "DetitleCased=" + s) dtres = self._analyse_token(dttoken) for r in dtres: r = (r[0] + '[CASECHANGE=DETITLECASED]', r[1]) res = res + dtres if not s.isupper() and self.try_uppercase: ups = s.upper() if s != ups: uptoken = (ups, "UpperCased=" + s) upres = self._analyse_token(uptoken) for r in upres: r = (r[0] + '[CASECHANGE=UPPERCASED]', r[1]) res = res + upres if not s.islower() and self.try_lowercase: lows = s.lower() if s != lows: lowtoken = (lows, "LowerCased=" + s) lowres = self._analyse_token(lowtoken) for r in lowres: r = (r[0] + '[CASECHANGE=LOWERCASED]', r[1]) res += lowres return res def _analyse_token(self, token): res = self.analyser.lookup(token[0]) for r in res: r = (r[0] + '[WEIGHT=%f]' % (r[1]), r[1], token[1]) return res def analyse(self, token): """Perform a simple morphological analysis lookup. If try_titlecase does not evaluate to False, the analysis will also be performed with first letter uppercased and rest lowercased. If try_uppercase evaluates to not False, the analysis will also be performed on all uppercase variant. 
If try_lowercase evaluates to not False, the analysis will also be performed on all lowercase variant. The analyses with case mangling will have an additional element to them identifying the casing. """ anals = None if isinstance(token, str): anals = self._analyse_str(token) else: anals = self._analyse_token(token) if not anals: if isinstance(token, str): anal = ('[WORD_ID=%s][GUESS=UNKNOWN][WEIGHT=inf]' % (token), float('inf'), "Unknown") else: anal = ('[WORD_ID=%s][GUESS=UNKNOWN][WEIGHT=inf]' % (token[0]), float('inf'), "Unknown") anals = [anal] return anals def analyse_sentence(self, s): """Analyse a full sentence with tokenisation and guessing. for details of tokenisation, see @c tokenise(self, s). for details of analysis, see @c analyse(self, token). If further models like udpipe are loaded, may fill in gaps with that. """ tokens = self.tokenise(s) if not tokens: tokens = [(s, "ERRORS=analyse_sentence_1")] analyses = [] for token in tokens: analyses += [self.analyse(token)] if self.can_udpipe: udinput = '\n'.join([token[0] for token in tokens]) uds = self.udpipe(udinput) if len(uds) == len(analyses): for i in range(len(uds)): analsyses[i] += [uds[i]] return None def _guess_str(self, s): token = (s, "") return self._guess_token(token) def _guess_token(self, token): res = self.guesser.lookup(token[0]) for r in res: r = (r[0] + '[GUESS=FSA][WEIGHT=%f]' % (r[1]), r[1], token[1]) return res def _guess_heuristic(self, token): guess = (token[0], float('inf'), token[1]) if token[0][0].isupper() and len(token[0]) > 1: guess = (token[0] + "[UPOS=PROPN][NUM=SG][CASE=NOM][GUESS=HEUR]" + "[WEIGHT=28021984]", 28021984, token[1]) else: guess = (token[0] + "[UPOS=NOUN][NUM=SG][CASE=NOM][GUESS=HEUR]" + "[WEIGHT=28021984]", 28021984, token[1]) return [guess] def guess(self, token): if not self.can_guess: if self.can_udpipe: return self._udpipe(token[0]) else: return self._guess_heuristic(self._maybe_str2token(token)) guesses = None if isinstance(token, str): guesses = self._guess_str(token) else: guesses = self._guess_token(token) return guesses def _lemmatise(self, token): res = self.lemmatiser.lookup(token) return res def lemmatise(self, token): lemmas = None lemmas = self._lemmatise(token) if not lemmas: lemma = (token, float('inf')) lemmas = [lemma] return lemmas def _segment(self, token): res = self.segmenter.lookup(token) return res def segment(self, token): segments = None segments = self._segment(token) if not segments: segment = (token, float('inf')) segments = [segment] return segments def _labelsegment(self, token): res = self.labelsegmenter.lookup(token) return res def labelsegment(self, token): labelsegments = None labelsegments = self._labelsegment(token) if not labelsegments: labelsegment = (token, float('inf')) labelsegments = [labelsegment] return labelsegments def _accept(self, token): res = self.acceptor.lookup(token) return res def accept(self, token): accept = False accepts = None accepts = self._accept(token) if accepts: accept = True return accept def _generate(self, omorstring): res = self.generator.lookup(omorstring) return res def generate(self, omorstring): generated = None if self.can_generate: generated = self._generate(omorstring) if not generated: generated = [(omorstring, float('inf'))] return generated def _udpipe(self, udinput): conllus = self.udpipeline.process(udinput, self.uderror) if self.uderror.occurred(): return None tokens = [] for conllu in conllus.split('\n'): if conllu.startswith('#'): continue elif conllu.strip() == '': continue tokens += 
[self._conllu2token(conllu)]
        return tokens

    def _conllu2token(self, conllu):
        fields = conllu.split()
        if len(fields) != 10:
            print("conllu2token conllu fail", fields)
        upos = fields[3]
        wordid = fields[2]
        surf = fields[1]
        ufeats = fields[5]
        misc = fields[9]
        analysis = '[WORD_ID=%s][UPOS=%s]%s[GUESS=UDPIPE]' % (
            wordid, upos, self._ufeats2omor(ufeats))
        return (analysis, float('inf'), misc)

    def _ufeats2omor(self, ufeats):
        return '[' + ufeats.replace('|', '][') + ']'
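# With no automata loaded, guess() in the class above falls back to the
# pure-python heuristic, so the shape of its (analysis, weight, comment)
# tuples can be inspected without any model files (assuming the module's own
# imports, e.g. libhfst, are available).

omorfi = Omorfi()
print(omorfi.guess('Matti'))
# roughly: [('Matti[UPOS=PROPN][NUM=SG][CASE=NOM][GUESS=HEUR][WEIGHT=28021984]',
#            28021984, '')]
print(omorfi.guess('kissalle'))
# roughly: [('kissalle[UPOS=NOUN][NUM=SG][CASE=NOM][GUESS=HEUR][WEIGHT=28021984]',
#            28021984, '')]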
from ufal.udpipe import Model, Pipeline, ProcessingError
import os

m = Model.load("GE20_STR-v2.udpipe")
pipeline = Pipeline(m, 'conllu', Pipeline.NONE, Pipeline.DEFAULT, 'conllu')
error = ProcessingError()


def parse_text(text, pipeline, error):
    text = text.replace(' ', '\\s')
    parsed_text = pipeline.process(text, error)
    if error.occurred():
        print('Error:', error.message)
    parsed_text = parsed_text.replace('\\s', ' ')
    return parsed_text


def basic_to_ext(basic_synt_text, ext_text):
    sentences = basic_synt_text.split('\n\n')[:-1]
    ext_sentences = ext_text.split('\n\n')[:-1]
    new_sents = []
    for i in range(len(sentences)):
        lines = sentences[i].split('\n')
        ext_lines = ext_sentences[i].split('\n')
        new_lines = []
        for j in range(len(lines)):
            if lines[j][0] == '#':
                new_lines.append(lines[j])
                continue
            parts = lines[j].split('\t')
            ext_parts = ext_lines[j].split('\t')
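# The escaping trick in parse_text above (spaces -> '\\s' before processing,
# back again afterwards) can be exercised on its own; a small sketch with a
# made-up fragment standing in for real CoNLL-U input.

raw = "1\tNew York\tNew York\tPROPN"
escaped = raw.replace(' ', '\\s')       # what the pipeline actually sees
restored = escaped.replace('\\s', ' ')  # what parse_text returns
assert restored == raw
print(escaped)  # 1	New\sYork	New\sYork	PROPN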
class SRE(object): def __init__(self, udmodel, wordModel): self.__udmodel__ = Model.load(udmodel) if self.__udmodel__ is None: raise ValueError('Unknown UDPipe model') self.__pipeline__ = Pipeline( self.__udmodel__, 'horizontal', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu') self.__uderror__ = ProcessingError() self.__srem__ = WordModel(wordModel) self.__result__ = Result() def __checkerThemes__(self, themes): if isinstance(themes, list): for theme in themes: if not isinstance(theme, str): raise TypeError('Themes list must contain only str objects') else: raise TypeError('Themes must be list of str') def __evalTreeSentence__(self, themes, sentenceRoot): templateParsers = [_class.__name__ for _class in AbstractTemplateParser.__subclasses__()] for className in templateParsers: _class = globals()[className] templateParser = _class(themes, self.__srem__) templateValue = _class.getTemplateName() parseSentenceResult = templateParser.parse(sentenceRoot) self.__result__.add({templateValue: parseSentenceResult}) conjRoots = [_ for _ in sentenceRoot.children if _.token['deprel'] == 'conj'] if conjRoots: for conjRoot in conjRoots: self.__evalTreeSentence__(themes, conjRoot) def analyze(self, themes, filename, encoding='utf8'): self.__checkerThemes__(themes) print('Updating model with text... ', end='') self.__srem__.trainFile(filename, encoding=encoding) print('[OK]') print('Parsing sentences... ', end='') with open(filename, 'r', encoding=encoding) as file: for index, line in enumerate(file, start=1): processed_conllu = self.__pipeline__.process(line, self.__uderror__) if self.__uderror__.occurred(): raise RuntimeError('UDPipe error: ' + self.__uderror__.message) sentence_root = parse_tree(processed_conllu)[0] self.__evalTreeSentence__(themes, sentence_root) print('[OK]') def getFullResult(self, outputMode=Output.TO_FILE, filename='result.txt'): print('Getting full result... ') out = Output(outputMode, filename) out.out(self.__result__.getData()) def getResultByType(self, type, concept=None, outputMode=Output.TO_FILE, filename='result.txt'): templateParsers = [_class.__name__ for _class in AbstractTemplateParser.__subclasses__()] availableTypes = [globals()[_].getTemplateName() for _ in templateParsers] if type in availableTypes: print('Getting result by type ' + type + '... ') out = Output(outputMode, filename) out.out(self.__result__.getDataByType(type, concept)) else: raise AttributeError('Unknown semantic type. Available types: ' + ', '.join(availableTypes))
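# A hedged usage sketch for the SRE class above. Both model paths are
# hypothetical, and WordModel, Output, Result and the AbstractTemplateParser
# subclasses are assumed to be importable exactly as the class expects.

sre = SRE(udmodel='russian-syntagrus-ud.udpipe',  # hypothetical UDPipe model
          wordModel='word2vec.bin')               # hypothetical word model
sre.analyze(['economy'], 'corpus.txt', encoding='utf8')  # one theme, one text file
sre.getFullResult(outputMode=Output.TO_FILE, filename='result.txt')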