def parse(self, text):
    # Lazy load model file to speed up startup
    if not self.model:
        self.model = self.load_model(self.language)

    text = text.strip()

    # Adding a period improves detection on especially short sentences
    period_added = False
    last_character = text[-1]
    if re.match(r"\w", last_character, flags=re.UNICODE):
        text += "."
        period_added = True

    pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    processed = pipeline.process(text, error)
    if error.occurred():
        raise ParserException(error.message)

    # Remove the period to make sure input corresponds to output
    if period_added:
        processed = "\n".join(processed.rstrip().split("\n")[:-1]) + "\n\n"

    return processed
def run(model_file, text_file):
    print('Loading model...')
    model = Model.load(model_file)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')

    print('Reading corpus...')
    with open(text_file) as f:
        text = f.read()

    print('Analyzing text...')
    processed = pipeline.process(text)

    print('Extracting phrases...')
    phrases = []
    sent = []
    # The appended '#' acts as a sentinel comment line so the final sentence
    # is also flushed by the startswith('#') branch.
    for line in tqdm((processed + '#').splitlines()):
        if line.startswith('#') and len(sent):
            preps = get_preps(sent)
            for prep, dep_id in preps.values():
                pphrase = get_phrase(prep, dep_id, sent)
                phrases.append(pphrase)
            sent.clear()
        elif len(line) > 1:
            try:
                sent.append(Token(line.split('\t')))
            except ValueError:
                continue

    print('Done!')
    return phrases
def tag_ud(text='Текст нужно передать функции в виде строки!', modelfile='udpipe_syntagrus.model'):
    model = Model.load(modelfile)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')
    # Process the text; the result comes back in CoNLL-U format
    processed = pipeline.process(text)
    # Skip comment lines carrying metadata
    output = [l for l in processed.split('\n') if not l.startswith('#')]
    # Extract the lemma and POS tag of each word from the processed text
    tagged = [w.split('\t')[2].lower() + '_' + w.split('\t')[3] for w in output if w]
    # tagged_propn = []
    # propn = []
    # for t in tagged:
    #     if t.endswith('PROPN'):
    #         if propn:
    #             propn.append(t)
    #         else:
    #             propn = [t]
    #     else:
    #         if len(propn) > 1:
    #             for x in propn:
    #                 # name = '::'.join([x.split('_')[0] for x in propn]) + '_PROPN'
    #                 tagged_propn.append(x)
    #         elif len(propn) == 1:
    #             tagged_propn.append(propn[0])
    #         tagged_propn.append(t)
    #         propn = []
    return tagged
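# Illustrative only (exact lemmas depend on the model): with a Russian
# SynTagRus UDPipe model, tag_ud returns lemma_UPOS strings of the kind used
# by RusVectores word vectors, e.g. something like:
#
#   >>> tag_ud('Мама мыла раму.')
#   ['мама_NOUN', 'мыть_VERB', 'рама_NOUN', '._PUNCT']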
def udpipeS(pathmodel, sourcepath, pathdestination):
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')

    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()

    i = 1
    for filename in os.listdir(sourcepath):
        f = open(pathdestination + filename[:-3] + "conllu", "a")
        f.truncate(0)
        text = io.open(sourcepath + filename, "r", encoding="utf-8")
        string = "".join(text.readlines())
        # Process data
        processed = pipeline.process(string, error)
        if error.occurred():
            sys.stderr.write("An error occurred when running run_udpipe: ")
            sys.stderr.write(error.message)
            sys.stderr.write("\n")
            sys.exit(1)
        f.write(processed)
        f.close()
        print("File no.", i, "processed of", len(os.listdir(sourcepath)))
        i += 1
def udpipeG(pathmodel):
    path = "/home/guido/Progetto Unitexto/textdata/cleanedTxt/"
    model = Model.load(pathmodel)
    if not model:
        sys.stderr.write("Cannot load model from file '%s'\n" % pathmodel)
        sys.exit(1)
    sys.stderr.write('done\n')

    pipeline = Pipeline(model, "horizontal", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()

    # corp = io.open("/home/guido/Progetto Unitexto/textdata/corpus.txt", "r", encoding="utf-8")
    # Read whole input
    # string = "".join(corp.readlines())
    # Process data
    # processed = pipeline.process(string, error)

    f = open("/home/guido/Progetto Unitexto/textdata/corpus.conllu", "a")
    f.truncate(0)
    i = 1
    for filename in os.listdir(path):
        text = io.open(path + filename, "r", encoding="utf-8")
        string = "".join(text.readlines())
        # Process data
        processed = pipeline.process(string, error)
        if error.occurred():
            sys.stderr.write("An error occurred when running run_udpipe: ")
            sys.stderr.write(error.message)
            sys.stderr.write("\n")
            sys.exit(1)
        f.write(processed)
        print("File no.", i, "processed of", len(os.listdir(path)))
        i += 1
def __call__(self, *argv):
    """Performs tokenization, tagging, lemmatizing and parsing.

    Args:
        text(str): text.
        OR
        tokens(list): List of Token objects.
        sentences(list): List of Sentence objects.

    Returns:
        Dictionary that contains:
        1. tokens - list of Token objects.
        2. sentences - list of Sentence objects.
        3. lemma - list of lists of strings that represent lemmas of words.
        4. postag - list of lists of strings that represent POS tags of words.
        5. morph - list of lists of strings that represent morphological features.
        6. syntax_dep_tree - list of lists of WordSynt objects that represent a dependency tree.
    """
    assert self.model
    if isinstance(argv[0], str):
        self.TOKENIZER = 'generic_tokenizer'
        self.pipeline = Pipeline(self.model, self.TOKENIZER, self.tagger,
                                 self.parser, 'conllu')
        return self.process_text(argv[0])

    self.TOKENIZER = 'horizontal'
    self.pipeline = Pipeline(self.model, self.TOKENIZER, self.tagger,
                             self.parser, 'conllu')
    return self.process_tokenized(argv[0], argv[1])
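# Hypothetical usage sketch for __call__ above; the class name and model path
# are assumptions, not part of the snippet. A plain string takes the
# 'generic_tokenizer' branch, while pre-tokenized input (Token and Sentence
# lists) goes through the 'horizontal' reader:
#
#   proc = ProcessorUDPipe('russian-syntagrus-ud.udpipe')  # hypothetical ctor
#   annotation = proc('Мама мыла раму.')
#   print(annotation['lemma'], annotation['postag'])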
def udpipe(sentences, model_name, verbose=False):
    """
    Parse text to Universal Dependencies using UDPipe.
    :param sentences: iterable of iterables of strings (one string per line)
    :param model_name: filename containing the UDPipe model to load
    :param verbose: print extra information
    :return: iterable of lines containing parsed output
    """
    from ufal.udpipe import Model, Pipeline, ProcessingError
    model = Model.load(model_name)
    if not model:
        raise ValueError("Invalid model: '%s'" % model_name)
    pipeline = Pipeline(model, "conllu", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    lines1, lines2 = tee(l for s in sentences for l in s)
    text = "\n".join(lines1)
    error = ProcessingError()
    num_tokens = sum(1 for l in lines2 if l)
    with ioutil.external_write_mode():
        print("Running %s on %d tokens... " % (model_name, num_tokens),
              end="", flush=True)
    start = time()
    processed = pipeline.process(text, error)
    duration = time() - start
    with ioutil.external_write_mode():
        print("Done (%.3fs, %.0f tokens/s)" %
              (duration, num_tokens / duration if duration else 0))
        if verbose:
            print(processed)
    if error.occurred():
        raise RuntimeError(error.message)
    return processed.splitlines()
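# Minimal sketch of calling the helper above (the model filename is an
# assumption). Because the pipeline's input format is "conllu", each inner
# iterable must already contain CoNLL-U lines; UDPipe then re-tags and
# re-parses them rather than tokenizing raw text:
#
#   with open('input.conllu') as f:
#       lines = f.read().splitlines()
#   parsed = udpipe([lines], 'english-ewt-ud-2.5.udpipe', verbose=True)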
def load_file(self, name, filename, lang):
    if filename not in tronco_special_files:
        filename_dir = os.path.join(root_path, "corpora", name, filename)
        if lang not in self.models:
            self.models[lang] = Model.load(
                os.path.join(root_path, "udpipe", udpipe_models[lang]['path']))
        pipeline = Pipeline(self.models[lang], "tokenize", Pipeline.DEFAULT,
                            Pipeline.DEFAULT, "conllu")
        with open(filename_dir) as f:
            try:
                text = f.read().splitlines()
            except:
                return False
        raw_text = []
        metadata = {'filename': filename}
        # Split '# key = value' comment lines into metadata; keep the rest as raw text
        for x in text:
            if x.strip().startswith("# ") and " = " in x:
                key, value = x.split(" = ", 1)
                metadata[key.split("# ", 1)[1]] = value
            else:
                raw_text.append(x)
        if name not in self.files:
            self.files[name] = {}
        self.files[name][filename] = pipeline.process(
            "\n".join(raw_text)).replace("# newdoc\n", "").replace("# newpar\n", "")
        if name not in self.metadata:
            self.metadata[name] = {}
        self.metadata[name][filename] = metadata
def parse(self, text):
    # Lazy load model file to speed up startup
    if not self.model:
        self.model = self.load_model()

    text = text.strip()

    # Adding a period improves detection on especially short sentences
    period_added = False
    last_character = text[-1]
    if re.match(r"\w", last_character, flags=re.UNICODE):
        text += "."
        period_added = True

    pipeline = Pipeline(
        self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
    )
    error = ProcessingError()
    processed = pipeline.process(text, error)
    if error.occurred():
        raise ParserException(error.message)

    # Remove the period to make sure input corresponds to output
    if period_added:
        processed = "\n".join(processed.rstrip().split("\n")[:-1]) + "\n\n"

    return processed
def process(self, text: 'str') -> 'Scene':
    """
    Processes the description and builds a scene based on it.

    Parameters
    ----------
    text : str
        The description of the scene.

    Returns
    -------
    Scene
        The scene described by the text.
    """
    text_preprocessed = self._preprocess(text)
    pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    processed = pipeline.process(text_preprocessed, error)
    # Surface UDPipe errors instead of silently ignoring them
    if error.occurred():
        raise ValueError(error.message)
    parsed = conllu.parse(processed)
    scene = self._traverse_tree(parsed)
    return scene
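# For reference, conllu.parse (the `conllu` package on PyPI) turns the
# CoNLL-U string into a list of TokenList objects, one per sentence; each
# token behaves like a dict, so a _traverse_tree implementation can read
# fields such as tok['form'], tok['lemma'] and tok['head']:
#
#   sentences = conllu.parse(processed)
#   for tok in sentences[0]:
#       print(tok['form'], tok['lemma'])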
def make_conll_with_udpipe(text):
    # Path to the model goes here
    model_path = os.path.join(os.getcwd(), 'udparsers',
                              'russian-syntagrus-ud-2.5-191206.udpipe')
    model = Model.load(model_path)
    pipeline = Pipeline(model, 'tokenizer=ranges', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')
    return pipeline.process(text)
def make_conll_with_udpipe(text, language='german'):
    if language == 'german':
        model_path = path.join('..', '..', 'udpipe', 'german-ud-2.0-170801.udpipe')
    else:
        raise ValueError('No UDPipe model configured for language: %s' % language)
    model = Model.load(model_path)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')
    return pipeline.process(text)
def __init__(self):
    print('Loading model: ')
    model_path = r"D:\py_projects\IWonnaBook\udpipe_syntagrus.model"
    self.model = Model.load(model_path)
    if not self.model:
        print('Model failed to load :(')
        sys.exit(1)
    self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                             Pipeline.DEFAULT, "conllu")
    print('done\n')
def wordToInf(self, text):
    process_pipeline = Pipeline(self.modelForInfinitive, 'tokenize',
                                Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    # Index 4 is the first token row, after the newdoc/newpar/sent_id/text
    # comment lines of the CoNLL-U output
    wordInfo = process_pipeline.process(text).split('\n')[4].split('\t')
    if wordInfo[3] == 'NUM':
        # Mask numerals, preserving the length of the lemma
        return '_NUM_' + 'x' * len(wordInfo[2])
    else:
        return wordInfo[2]
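# CoNLL-U token rows have 10 tab-separated columns:
#   ID  FORM  LEMMA  UPOS  XPOS  FEATS  HEAD  DEPREL  DEPS  MISC
# so wordInfo[2] is the lemma and wordInfo[3] the universal POS tag of the
# first token.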
def __init__(self, seed=42): self.morph = pymorphy2.MorphAnalyzer() self.model = Model.load("data/udpipe_syntagrus.model".encode()) self.process_pipeline = Pipeline(self.model, 'tokenize'.encode(), Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu'.encode()) self.seed = seed self.init_seed() self.paronyms = self.get_paronyms() self.freq_bigrams = self.open_freq_grams()
def get(modelAdd, text):
    from ufal.udpipe import Model, Pipeline, ProcessingError
    error = ProcessingError()
    model = Model.load(modelAdd)
    pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    parsedArticle = pipeline.process(text, error)
    return parsedArticle
def __init__(self, lang="en"): model_file = "/Users/sxs149331/PycharmProjects/UniversalPetrarch-master/" + self.models[ lang] print model_file self.model = Model.load(model_file) if not self.model: sys.stderr.write("Model Loading Failed") sys.exit(1) sys.stderr.write('done\n') self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu")
def extract_sentences(input_file: str, output_file: str, logger) -> None:
    logger.info("==== Now performing sentence extraction from paragraphs file ====")

    # UDPipe initialization
    lang_model = 'lang_models/czech-ud-2.0-170801.udpipe'
    model = Model.load(lang_model)
    if not model:
        logger.error('Could not load UDPipe language model: ' + lang_model)
    ud_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                           Pipeline.DEFAULT, '')
    ud_error = ProcessingError()

    sentences_file = open(output_file, "w")
    # Reopen paragraphs for reading
    paragraphs_file = open(input_file, "r")

    sentences_count = 0
    for p_line in paragraphs_file:
        page_first_sentence = ""
        page_first_paragraph = p_line.split('\t', 1)  # use the variable as a temporary list
        # If there is paragraph content
        if len(page_first_paragraph) == 2:
            page_uri = page_first_paragraph[0]
            page_first_paragraph = page_first_paragraph[1]
            # Extract first sentence from paragraph using UDPipe:
            ud_output = ud_pipeline.process(page_first_paragraph, ud_error)
            if ud_error.occurred():
                logger.error('Error occurred while extracting sentence using UDPipe: '
                             + ud_error.message)
                page_first_sentence = ""
            else:
                ud_output = ud_output.split('\n')
                if len(ud_output) >= 4:
                    page_first_sentence = ud_output[3][9:]  # assumption about the output format
                else:
                    page_first_sentence = ""
            # Write sentence to the file
            sentences_file.write(page_uri + '\t' + page_first_sentence + '\n')
            sentences_count += 1
            if sentences_count % 2000 == 0:
                logger.info("Extracted {} sentences.".format(sentences_count))

    logger.info("Finished extraction of {} sentences.".format(sentences_count))
    paragraphs_file.close()
    sentences_file.close()
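# Note on the format assumption above: assuming CoNLL-U-style output, the
# first sentence is preceded by comment lines roughly like
#   # newdoc
#   # newpar
#   # sent_id = 1
#   # text = <first sentence>
# which is why index 3 is expected to hold '# text = ...' and the
# 9-character prefix "# text = " is sliced off. This depends on the UDPipe
# version and output format, so it is a fragile assumption, not a guarantee.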
def load_udpipe(self, filename):
    if not can_udpipe:
        print("importing udpipe failed, cannot load udpipe")
        return
    self.udpiper = Model.load(filename)
    # Use the pipeline for now; ugly but workable
    self.udpipeline = Pipeline(self.udpiper, 'horizontal',
                               Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    self.uderror = ProcessingError()
    self.can_udpipe = True
def __init__(self, udpipe_path, keep_pos=1, keep_punct=0, keep_stops=1):
    self.stop_pos = {'AUX', 'NUM', 'DET', 'PRON', 'ADP', 'SCONJ', 'CCONJ',
                     'INTJ', 'PART', 'X'}
    self.udpipe_model = Model.load(udpipe_path)
    self.pipeline = Pipeline(self.udpipe_model, 'tokenize', Pipeline.DEFAULT,
                             Pipeline.DEFAULT, 'conllu')
    self.keep_pos = keep_pos
    self.keep_punct = keep_punct
    self.keep_stops = keep_stops
def __init__(self, speller: Optional[Speller] = None):
    self.udpipe_model = Model.load(
        sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
    self.process_pipeline = Pipeline(self.udpipe_model, sber_encode('tokenize'),
                                     Pipeline.DEFAULT, Pipeline.DEFAULT,
                                     sber_encode('conllu'))
    if speller is None:
        speller = Speller()
    self.speller: Speller = speller
def run_udpipe(self, path_to_model, sents=None):
    if sents is None:
        sents = self.sents
    verticals = self._to_vertical(sents)
    model = Model.load(path_to_model)
    pipeline = Pipeline(model, "vertical", Pipeline.DEFAULT,
                        Pipeline.DEFAULT, "conllu")
    error = ProcessingError()
    conllu = pipeline.process(verticals, error)
    # Surface UDPipe errors instead of silently ignoring them
    if error.occurred():
        raise RuntimeError(error.message)
    return conllu
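# For context: UDPipe's 'vertical' input format is one token per line, with a
# blank line terminating each sentence, e.g. what a _to_vertical helper is
# presumably producing here:
#
#   Мама
#   мыла
#   раму
#   .
#   (blank line)
#
# Tokenization is taken as given, so the pipeline only tags and parses.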
def __init__(self, seed=42):
    self.morph = pymorphy2.MorphAnalyzer()
    self.model = Model.load(
        sber_encode(sber_path('/var/sberbank/udpipe_syntagrus.model')))
    self.process_pipeline = Pipeline(self.model, sber_encode('tokenize'),
                                     Pipeline.DEFAULT, Pipeline.DEFAULT,
                                     sber_encode('conllu'))
    self.seed = seed
    self.init_seed()
    self.paronyms = self.get_paronyms()
    self.freq_bigrams = self.open_freq_grams()
def __init__(self):
    ud_model_path = '../models/udpipe_syntagrus.model'
    self.ud_model = Model.load(ud_model_path)
    w2v_model_path = '../models/ruscorpora_upos_skipgram_300_5_2018.vec.gz'
    self.w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path)
    self.process_pipeline = Pipeline(self.ud_model, 'tokenize',
                                     Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
def __init__(self, modeldir):
    self.modeldir = os.path.abspath(modeldir)
    self.model = Model.load(self.modeldir)
    self.error = ProcessingError()
    if not self.model:
        sys.stderr.write("Udpipe language model loading failed: " + self.modeldir)
        sys.exit(1)
    self.pipeline = Pipeline(self.model, "tokenize", Pipeline.DEFAULT,
                             Pipeline.DEFAULT, "conllu")
def __init__(self, udmodel, wordModel):
    self.__udmodel__ = Model.load(udmodel)
    if self.__udmodel__ is None:
        raise ValueError('Unknown UDPipe model')
    self.__pipeline__ = Pipeline(self.__udmodel__, 'horizontal',
                                 Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    self.__uderror__ = ProcessingError()
    self.__srem__ = WordModel(wordModel)
    self.__result__ = Result()
def process_sentence(self, sen, field_names=None):
    pipeline = Pipeline(self._model, self._inp_format, self._pos_settings,
                        self._parse_settings, 'conllu')
    error = ProcessingError()  # For catching errors...
    inp_sen = ''.join(self._encode_sentence(sen, field_names))
    # Do the processing... + Write the output in CoNLL-U
    processed = pipeline.process(inp_sen, error)
    if error.occurred():
        raise UDPipeError(error.message)
    ret_sen = self._decode_sentence(processed, sen, field_names)
    return ret_sen
def parse(sentence):
    input_format = 'tokenize'
    output_format = 'conllu'
    model_path = 'russian-syntagrus-ud-2.4-190531.udpipe'
    model = Model.load(model_path)
    pipeline = Pipeline(model, input_format, Pipeline.DEFAULT,
                        Pipeline.DEFAULT, output_format)
    error = ProcessingError()
    # Small preprocessing step: detach Russian guillemets from adjacent tokens
    sentence = re.sub('«', '« ', sentence)
    sentence = re.sub('»', '» ', sentence)
    parsed = pipeline.process(sentence, error)
    print(parsed)
    return parsed
def tag_parse_tree(self, root):
    """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
    pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')
    in_data = " ".join([n.form for n in root.descendants])
    out_data = pipeline.process(in_data, self.error)
    if self.error.occurred():
        raise IOError("UDPipe error " + self.error.message)
    self.conllu_reader.files.filehandle = io.StringIO(out_data)
    parsed_root = self.conllu_reader.read_tree()
    nodes = [root] + root.descendants
    for parsed_node in parsed_root.descendants:
        node = nodes[parsed_node.ord]
        node.parent = nodes[parsed_node.parent.ord]
        for attr in 'upos xpos lemma feats'.split():
            setattr(node, attr, getattr(parsed_node, attr))
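# The 'horizontal' input format above is one sentence per line with tokens
# separated by spaces, so the tree's existing tokenization is preserved and
# UDPipe only re-tags and re-parses. The parsed attributes are then copied
# back onto the original nodes via their 1-based `ord` index, with nodes[0]
# being the technical root.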
def load_nli_data(path, snli=False, udpipe_path=None, seq_length=50, r=10, cache_file=''):
    """
    Load MultiNLI or SNLI data.
    If the 'snli' parameter is set to True, a genre label of snli will be assigned to the data.
    """
    global is_snli, pipeline, error, pr_seq_length, pr_r
    is_snli = snli
    pr_r = r
    pr_seq_length = seq_length
    pipeline = None
    print(path)
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            data = [w for w in pickle.load(f) if w is not None]
    else:
        if udpipe_path:
            model = Model.load(udpipe_path)
            pipeline = Pipeline(model, 'horizontal', Pipeline.DEFAULT,
                                Pipeline.DEFAULT, 'conllu')
            error = ProcessingError()
        with open(path) as f:
            pool = Pool(32)
            data = pool.map_async(process_line, list(f), chunksize=1)
            while not data.ready():
                print('{} lines left'.format(data._number_left))
                time.sleep(10)
            data = [w for w in data.get() if w is not None]
            pool.close()
        random.seed(1)
        random.shuffle(data)
        with open(cache_file, 'wb') as f:
            pickle.dump(data, f)
    return data
def tag_parse_tree(self, root):
    """Tag (+lemmatize, fill FEATS) and parse a tree (already tokenized)."""
    descendants = root.descendants
    if not descendants:
        return
    pipeline = Pipeline(self.tool, 'horizontal', Pipeline.DEFAULT,
                        Pipeline.DEFAULT, 'conllu')
    in_data = " ".join([n.form for n in descendants])
    out_data = pipeline.process(in_data, self.error)
    if self.error.occurred():
        raise IOError("UDPipe error " + self.error.message)
    self.conllu_reader.files.filehandle = io.StringIO(out_data)
    parsed_root = self.conllu_reader.read_tree()
    nodes = [root] + descendants
    for parsed_node in parsed_root.descendants:
        node = nodes[parsed_node.ord]
        node.parent = nodes[parsed_node.parent.ord]
        for attr in 'upos xpos lemma feats'.split():
            setattr(node, attr, getattr(parsed_node, attr))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--from_dir_prefix', type=str)
    parser.add_argument('-t', '--to_dir_prefix', default='/tmp/workers', type=str)
    parser.add_argument('-v', '--to_vocab_file', default='/tmp/workers/vocab.txt', type=str)
    parser.add_argument('-u', '--path2udp_model',
                        default='./russian-syntagrus-ud-2.0-170801.udpipe', type=str)
    parser.add_argument('-n', '--cpu_n', default=5, type=int)
    parser.add_argument('-T', '--timeout_duration', default=40 * 60, type=int)  # timeout_duration = 40 min
    args = parser.parse_args()

    os.makedirs(args.to_dir_prefix, exist_ok=True)
    model = Model.load(args.path2udp_model)
    udpipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT,
                          Pipeline.DEFAULT, 'horizontal')
    files = list(glob(args.from_dir_prefix))
    word_counts = multipd.timeouted_run_pool(files, udpipeline, args.to_dir_prefix,
                                             cpu_n=args.cpu_n,
                                             timeout_duration=args.timeout_duration)
    word_count = sum(word_counts, collections.Counter())

    vocab = ['%s\n' % word for word, _ in word_count.most_common()]
    vocab.append(word_count.most_common()[-1][0])
    open(args.to_vocab_file, 'wt').writelines(vocab)
    freqs = ['{}\t{}\n'.format(word, freq)
             for word, freq in word_count.most_common()[:-1]]
    open(args.to_vocab_file + '.freqs', 'wt').writelines(freqs)