def main():
    args = parse_args()
    random.seed(args.seed)

    args = vars(args)

    print("[Launching identity lemmatizer...]")

    if args['mode'] == 'train':
        print("[No training is required; will only generate evaluation output...]")

    document = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(document, args['batch_size'], args, evaluation=True, conll_only=True)
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # use identity mapping for prediction
    preds = batch.doc.get([TEXT])

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))

def makeDoc(doc):
    # Convert a parse from one of several NLP libraries into a flat CoNLL-U string,
    # then rebuild it as a list of lightweight spaCy-like token objects.
    s = str(type(doc))
    if s.find("spacy") == 8:
        return doc
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        d = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8 or s.find("stanfordnlp") == 8:
        d = doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        d = doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        d = sentence2conllu(doc, False).serialize()
    elif s.find("list") == 8:
        d = "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    else:
        d = str(doc)
    DOC = []
    m = []
    misc = ""
    for t in d.split("\n"):
        x = t.split("\t")
        if len(x) != 10:
            continue
        try:
            i, j = int(x[0]), int(x[6])
        except:
            # multi-word token ranges such as "3-4" are recorded for later
            try:
                i = x[0].index("-")
                j = int(x[0][0:i])
                k = int(x[0][i + 1:])
                m.append((len(DOC), j, k, x[1]))
                continue
            except:
                continue
        s = type("", (object,), {"i": i})
        s.orth_ = x[1]
        s.pos_ = x[3]
        s.head = j
        s.dep_ = x[7]
        s.whitespace_ = (x[9].find("SpaceAfter=No") < 0)
        if s.whitespace_:
            # if this token's start_char equals the previous token's end_char,
            # the previous token is not followed by whitespace
            i = x[9].find("start_char=")
            if i >= 0:
                j = x[9].find("|", i)
                k = x[9][i + 5:] if j < 0 else x[9][i + 5:j]
                if misc.find("end" + k) >= 0:
                    DOC[-1].whitespace_ = False
        DOC.append(s)
        misc = x[9]
    for i, j, k, f in reversed(m):
        offset = i - DOC[i].i
        DOC[k + offset].contract = (f, [i + offset for i in range(j, k + 1)])
    for i, t in enumerate(DOC):
        # resolve numeric heads into token references; the root points to itself
        if t.head == 0:
            t.head = t
        else:
            t.head = DOC[i + t.head - t.i]
    return DOC

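# Illustrative usage sketch (added, not part of the original source): it assumes stanza
# and its English models are installed; the sentence text is arbitrary. makeDoc returns
# objects exposing a small spaCy-like subset: orth_, pos_, head, dep_, whitespace_.
def _example_makeDoc():
    import stanza
    nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
    tokens = makeDoc(nlp("It was a dark and stormy night."))
    for t in tokens:
        print(t.orth_, t.pos_, t.dep_, t.head.orth_)
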
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)

    # skip eval if dev data does not exist
    if len(batch) == 0:
        print("Skip evaluation because no dev data is available...")
        print("Lemma score:")
        print("{} ".format(args['lang']))
        sys.exit(0)

    dict_preds = trainer.predict_dict(batch.doc.get([TEXT, UPOS]))

    if loaded_args.get('dict_only', False):
        preds = dict_preds
    else:
        print("Running the seq2seq model...")
        preds = []
        edits = []
        for i, b in enumerate(batch):
            ps, es = trainer.predict(b, args['beam_size'])
            preds += ps
            if es is not None:
                edits += es
        preds = trainer.postprocess(batch.doc.get([TEXT]), preds, edits=edits)

        if loaded_args.get('ensemble_dict', False):
            print("[Ensembling dict with seq2seq lemmatizer...]")
            preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        print("Lemma score:")
        print("{} {:.2f}".format(args['lang'], score * 100))

def get_connlu_sentence(self, sentence: str) -> str:
    processed_sentence = self._preprocess(sentence)
    doc_response = self._stanford_annotator._annotator(processed_sentence)
    fp, tmp = tempfile.mkstemp()
    CoNLL.write_doc2conll(doc_response, tmp)
    with open(tmp, encoding='utf-8') as f:
        conll_string = f.read()
    return conll_string

def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    save_name = args['save_name'] if args['save_name'] else '{}_mwt_expander.pt'.format(args['shorthand'])
    model_file = os.path.join(args['save_dir'], save_name)

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]
    logger.debug('max_dec_len: %d' % loaded_args['max_dec_len'])

    # load data
    logger.debug("Loading data with batch size {}...".format(args['batch_size']))
    doc = CoNLL.conll2doc(input_file=args['eval_file'])
    batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)

    if len(batch) > 0:
        dict_preds = trainer.predict_dict(batch.doc.get_mwt_expansions(evaluation=True))
        # decide trainer type and run eval
        if loaded_args['dict_only']:
            preds = dict_preds
        else:
            logger.info("Running the seq2seq model...")
            preds = []
            for i, b in enumerate(batch):
                preds += trainer.predict(b)

            if loaded_args.get('ensemble_dict', False):
                preds = trainer.ensemble(batch.doc.get_mwt_expansions(evaluation=True), preds)
    else:
        # skip eval if dev data does not exist
        preds = []

    # write to file and score
    doc = copy.deepcopy(batch.doc)
    doc.set_mwt_expansions(preds)
    CoNLL.write_doc2conll(doc, system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        logger.info("MWT expansion score: {} {:.2f}".format(args['shorthand'], score * 100))

def annotate(self, text: str):
    doc_response = self._annotator(text)
    fp, tmp = tempfile.mkstemp()
    CoNLL.write_doc2conll(doc_response, tmp)
    with open(tmp, encoding='utf-8') as f:
        conll_string = f.read()
    return [
        self._sentence_to_df(sentence)
        for sentence in conll_string.split("\n\n")
        if len(sentence) > 0
    ]

def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_tagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
    pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain, vocab=vocab,
                       evaluation=True, sort_during_eval=True)

    if len(batch) > 0:
        print("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        print("Tagger score:")
        print("{} {:.2f}".format(args['shorthand'], score * 100))

def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = model_file_name(args)

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load model
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    doc = CoNLL.conll2doc(input_file=args['eval_file'])
    batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain, vocab=vocab,
                       evaluation=True, sort_during_eval=True)

    if len(batch) > 0:
        logger.info("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
    CoNLL.write_doc2conll(batch.doc, system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        logger.info("Parser score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score * 100))

def __init__(self, UniDic, UDPipe):
    self.UniDic = UniDic
    if UniDic != None:
        d = os.path.join(DOWNLOAD_DIR, UniDic)
        r = os.path.join(PACKAGE_DIR, "mecabrc")
        if os.path.isdir(d):
            try:
                from MeCab import Tagger
            except:
                from fugashi import GenericTagger as Tagger
            self.mecab = Tagger("-r " + r + " -d " + d).parse
        elif UniDic == "unidic-lite":
            try:
                from MeCab import Tagger
            except:
                from fugashi import GenericTagger as Tagger
            import unidic_lite
            self.mecab = Tagger("-r " + r + " -d " + unidic_lite.DICDIR).parse
        elif UniDic == "ipadic":
            try:
                from MeCab import Tagger
            except:
                from fugashi import GenericTagger as Tagger
            try:
                import ipadic
                self.mecab = Tagger(ipadic.MECAB_ARGS).parse
            except:
                self.mecab = Tagger().parse
        else:
            d = {
                "gendai": "dic1", "spoken": "dic2", "qkana": "dic3",
                "kindai": "dic4", "kinsei": "dic5", "kyogen": "dic6",
                "wakan": "dic7", "wabun": "dic8", "manyo": "dic9"
            }
            self.dictkey = d[UniDic]
            self.mecab = self.ChamameWebAPI
    self.udpipe = self.UDPipeWebAPI
    if UDPipe == None:
        self.model = "japanese-gsd"
    else:
        self.model = UDPipe
    m = os.path.join(DOWNLOAD_DIR, self.model + ".udpipe")
    if os.path.isfile(m):
        import ufal.udpipe
        self.model = ufal.udpipe.Model.load(m)
        if UniDic == None:
            self.udpipe = ufal.udpipe.Pipeline(self.model, "tokenizer=presegmented", "", "", "").process
        else:
            self.udpipe = ufal.udpipe.Pipeline(self.model, "conllu", "none", "", "").process
    elif self.model.startswith("stanza_"):
        import stanza
        if UniDic == None:
            self.model = stanza.Pipeline(self.model[7:], verbose=False)
            from stanza.utils.conll import CoNLL
            self.udpipe = lambda text: CoNLL.conll_as_string(CoNLL.convert_dict(self.model(text).to_dict()))
        else:
            self.model = stanza.Pipeline(self.model[7:], processors="depparse", depparse_pretagged=True, verbose=False)
            self.udpipe = self.StanzaAPI

def StanzaAPI(self, conllu):
    d = []
    e = []
    for s in conllu.split("\n"):
        if s == "" or s.startswith("#"):
            if e != []:
                d.append(list(e))
                e = []
        else:
            t = s.split("\t")
            e.append({"id": int(t[0]), "text": t[1], "lemma": t[2], "upos": t[3], "xpos": t[4], "misc": t[9]})
    from stanza.models.common.doc import Document
    from stanza.utils.conll import CoNLL
    return CoNLL.conll_as_string(CoNLL.convert_dict(self.model(Document(d)).to_dict()))

def test_unusual_misc():
    """
    The above RUSSIAN_SAMPLE resulted in a blank misc field in one particular
    implementation of the conll code (the below test would fail)
    """
    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    sentences = CoNLL.doc2conll(doc)
    assert len(sentences) == 1
    assert len(sentences[0]) == 14

    for word in sentences[0]:
        pieces = word.split("\t")
        assert len(pieces) == 1 or len(pieces) == 10
        if len(pieces) == 10:
            assert all(piece for piece in pieces)

def print_conll_sen(sen, sent_id=None, swaps=()):
    out = f'# sent_id = {sent_id}\n# text = {sen.text}\n'
    for fields in CoNLL.convert_dict([sen.to_dict()])[0]:
        for i, j in swaps:
            fields[i], fields[j] = fields[j], fields[i]
        out += "\t".join(fields) + '\n'
    return out

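# Illustrative usage sketch (added, not from the original source): `sen` is assumed to
# be a stanza Sentence; swapping columns 2 and 3 (LEMMA and UPOS) is arbitrary and only
# demonstrates the `swaps` parameter.
def _example_print_conll_sen():
    import stanza
    nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
    sent = nlp("The quick brown fox jumps over the lazy dog.").sentences[0]
    print(print_conll_sen(sent, sent_id="example-1", swaps=((2, 3),)))
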
def preprocess_to_stream(corpus_filename, lang):
    """Pre-process (tokenize, segment) the specified raw text corpus using
    Stanford's Stanza library.

    Args:
        corpus_filename: Filename of the raw text corpus to pre-process.
        lang: Language of Stanza model to use for pre-processing.

    Returns:
        A stream containing the pre-processed text in CoNLL format.
    """
    stanza_pipeline = stanza.Pipeline(lang=lang, processors='tokenize,mwt', use_gpu=False)
    with open(corpus_filename, "r") as corpus_file:
        doc = stanza_pipeline(corpus_file.read())

    conll = CoNLL.convert_dict(doc.to_dict())
    conll_stream = StringIO()
    for sent in conll:
        for token in sent:
            print("\t".join(token), file=conll_stream)
        print(file=conll_stream)

    conll_stream.seek(0)
    return conll_stream

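# Illustrative usage sketch (added, not from the original source): "corpus.txt" is a
# hypothetical filename; the returned StringIO can be iterated like a file of
# CoNLL-U token lines.
def _example_preprocess_to_stream():
    stream = preprocess_to_stream("corpus.txt", lang="en")
    for line in stream:
        print(line, end="")
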
def check_mwt(filename):
    """
    Checks whether or not there are MWTs in the given conll file
    """
    doc = CoNLL.conll2doc(filename)
    data = doc.get_mwt_expansions(False)
    return len(data) > 0

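# Illustrative usage sketch (added, not from the original source): the treebank path is
# hypothetical; the check decides whether an MWT expander would be needed.
def _example_check_mwt():
    if check_mwt("data/fr_gsd-ud-dev.conllu"):
        print("Treebank contains multi-word tokens")
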
def get_factory(sh, fn):
    print('Resolving vocab option for {}...'.format(sh))
    train_file = 'data/pos/{}.train.in.conllu'.format(sh)
    if not os.path.exists(train_file):
        raise UserWarning(
            'Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tstanza/utils/datasets/prepare_pos_treebank.py {}'.format(fn, fn))
        # without the training file, there's not much we can do
        key = 'WordVocab(data, shorthand, idx=2)'
        return key

    doc = CoNLL.conll2doc(input_file=train_file)
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')
    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        for sep in ['', '-', '+', '|', ',', ':']:  # separators
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            length = sum(len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    return key

def main(source, target, language):
    source = Path(source)
    target = Path(target)

    # https://stanfordnlp.github.io/stanza/neural_pipeline.html
    # https://stanfordnlp.github.io/stanza/depparse.html
    nlp = stanza.Pipeline(
        lang=language,
        processors="tokenize,mwt,pos,lemma,depparse",
    )

    # read text file content
    text = source.read_text()

    # process text with Stanza
    doc = nlp(text)

    # write processed document to CoNLL file
    CoNLL.write_doc2conll(doc, target)

def parse_conllu(text, english_lines):
    # stanfordnlp.download('zh')  # Download the English models
    nlp = stanza.Pipeline(
        "zh",
        processors='tokenize,pos,lemma,depparse',
        tokenize_pretokenized=True,
        tokenize_no_ssplit=True,
        # tokenize_model_path="/Users/loganpeng/Dropbox/Dissertation/code/stanza-train/stanza/saved_models/tokenize/zh_ontonotes_tokenizer.pt",
        pos_pretrain_path="/Users/loganpeng/Dropbox/Dissertation/code/stanza-train/stanza/saved_models/pos/zh_ontonotes.pretrain.pt",
        pos_model_path="/Users/loganpeng/Dropbox/Dissertation/code/stanza-train/stanza/saved_models/pos/zh_ontonotes_tagger.pt",
        depparse_pretrain_path="/Users/loganpeng/Dropbox/Dissertation/code/stanza-train/stanza/saved_models/depparse/zh_ontonotes.pretrain.pt",
        depparse_model_path="/Users/loganpeng/Dropbox/Dissertation/code/stanza-train/stanza/saved_models/depparse/zh_ontonotes_parser.pt"
    )
    # nlp = stanza.Pipeline("zh", processors='tokenize,pos,lemma,depparse', tokenize_pretokenized=True, tokenize_no_ssplit=True)
    ### nlp = stanza.Pipeline(processors='tokenize,pos,lemma,depparse', lang='zh', tokenize_pretokenized=True, use_gpu=True, pos_batch_size=3000)

    conllus = []
    chapter_initials = []
    text_fields = []
    for sid, sentence in enumerate(text):
        # if sid > 10:
        #     continue

        # Get Chapter initial sentences
        chapter_initials.append(True if re.match("^[IVX。 ]+$", sentence) else False)

        # Stanza parse UD
        doc = nlp(sentence)  # Run the pipeline on input text
        dicts = doc.to_dict()
        conllu = CoNLL.convert_dict(dicts)
        text_fields.append("".join([x[1] for x in conllu[0]]))
        for tokid in range(len(conllu[0])):
            conllu[0][tokid][3] = xpos2upos_dict[conllu[0][tokid][3]]  # change col3 to upos
            conllu[0][tokid][9] = "_"  # remove start_char and end_char
        conllu = "\n".join(['\t'.join(x) for x in conllu[0]])
        conllus.append(conllu)
        print('o Done parsing sentence %d/%d' % (sid, len(text)), end="\r")

    # assert sum(chapter_initials) == 27

    # Write conllu string
    chapter_no = 1
    conllu_string = ""
    for sent_id in range(len(conllus)):
        if chapter_initials[sent_id]:
            conllu_string += "# newdoc_id = lpp_1943_zh_ch-%.2d\n" % chapter_no
            chapter_no += 1
        conllu_string += "# sent_id = lpp_1943_zh-%d\n# text = %s\n# en_sent_id = lpp_1943.%d\n# en_text = %s\n" % (
            sent_id + 1, text_fields[sent_id], sent_id + 1, english_lines[sent_id]) + conllus[sent_id] + "\n\n"
    return conllu_string

def main(args):
    sents = load_conllu(args.input)
    stanza_nlp = stanza.Pipeline(
        lang=args.lg,
        dir=args.model,
        package=args.treebank,
        tokenize_no_ssplit=True,
        use_gpu=args.cuda,
    )
    doc = stanza_nlp(sents)
    doc_dict = doc.to_dict()
    conll = CoNLL.convert_dict(doc_dict)
    doc_conll_str = CoNLL.conll_as_string(conll)
    with open(args.output, "w") as wf:
        wf.write(doc_conll_str)

def tokenize_with_stanza(homebrew):
    import stanza
    import sqlite3
    import os
    import re
    from stanza.utils.conll import CoNLL

    # stanza.download("en")  # Must be run once on the first execution (requires an internet connection)
    nlp = stanza.Pipeline("en")
    db = sqlite3.connect("Monster.db")  # Connects to the DB
    c = db.cursor()
    folder = "./tagged_stanza"  # Creates a folder to store the TSVs
    if not os.path.exists(folder):
        os.makedirs(folder)
    if homebrew == True:  # Only matters for the filename
        fname_part = "/homebrew_"
        c.execute("SELECT DISTINCT Beschreibung FROM Homebrew")
    else:
        fname_part = "/5eTools_"
        # Selects only the relevant entries from the table
        c.execute(
            "SELECT Beschreibung, ID FROM Monster WHERE NOT Beschreibung = 'No information available.' AND NOT Beschreibung = ''")
    monstertable = c.fetchall()
    # Drops lines that obviously do not contain a description
    pattern = r'\d+d\d+|initiative|challenge rating|CR\d+|advantage|disadvantage|saving throw| DC |lair action|Enter a description for your Monster here'
    ID = 0
    for monster in monstertable:  # Loops over the results of fetchall
        if homebrew:
            if len(monster[0]) > 4:
                # print(len(monster[0]))
                description = monster[0][2:-2]
            else:
                continue
        else:
            description = monster[0]
            ID = monster[1]
        forstanza = ""
        lines = description.split(".")
        for y in lines:
            if not re.search(pattern, y, re.IGNORECASE) and not y == "":
                temp = y.replace(u'\\xa0', ' ').encode('utf-8').decode('utf-8', errors='replace')
                # temp = "".join(y.split())
                forstanza = forstanza + temp
        file = folder + fname_part + str(ID) + ".tsv"
        with open(file, "w", encoding="UTF-8") as f:
            if len(forstanza) > 0 and not forstanza == " ":
                print(forstanza)
                doc = nlp(forstanza)
                conll = CoNLL.convert_dict(doc.to_dict())
                for sentence in conll:
                    for token in sentence:
                        print("\t".join(token), file=f)
        ID = ID + 1

def process_line(line, nlp):
    try:
        doc = nlp(line)
        conll = CoNLL.convert_dict(doc.to_dict())
        return conll[0]
    except Exception as e:
        # print(e)
        return []

def get_depd_tree(
    doc: str,
    lg: str,
    stanza_model_path: Path,
    tokenize: bool = True,
    ssplit: bool = False,
    cuda: bool = False,
    verbose: bool = False,
) -> str:
    logging.info(f"generating SUD parse for the input document")
    model_dir = str(stanza_model_path)
    if tokenize and ssplit:
        stanza_nlp = stanza.Pipeline(lang=lg, dir=model_dir, use_gpu=cuda, verbose=verbose)
    elif tokenize:
        stanza_nlp = stanza.Pipeline(
            lang=lg,
            dir=model_dir,
            tokenize_no_ssplit=True,
            use_gpu=cuda,
            verbose=verbose,
        )
    else:
        stanza_nlp = stanza.Pipeline(
            lang=lg,
            dir=model_dir,
            tokenize_pretokenized=True,
            use_gpu=cuda,
            verbose=verbose,
        )
    stanza_doc = stanza_nlp(doc)
    doc_dict = stanza_doc.to_dict()
    conll = CoNLL.convert_dict(doc_dict)
    doc_conll_str = CoNLL.conll_as_string(conll)
    return doc_conll_str

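# Illustrative usage sketch (added, not from the original source): the language code
# and model directory are placeholders; with tokenize=True and ssplit=True the raw
# string is tokenized and sentence-split before parsing.
def _example_get_depd_tree():
    from pathlib import Path
    conllu_str = get_depd_tree(
        "A short test document. It has two sentences.",
        lg="en",
        stanza_model_path=Path("stanza_resources"),  # hypothetical model directory
        tokenize=True,
        ssplit=True,
    )
    print(conllu_str)
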
def text2ud(id, nlp, text):
    res = []
    doc = nlp(text)
    sentences = doc.sentences
    for i, ud in enumerate(CoNLL.convert_dict(doc.to_dict())):
        res.append("# sent_id = %s.%d" % (id, i))
        res.append("# text = %s" % sentences[i].text)
        for l in ud:
            l[9] = "_"  # ignore last field (start_char,end_char)
            res.append("\t".join(l))
        res.append("")
    return res

def test_doc_with_comments():
    """
    Test that a doc with comments gets converted back with comments
    """
    lines = RUSSIAN_SAMPLE.split("\n")

    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    assert len(doc.sentences) == 1
    assert len(doc.sentences[0].comments) == 3
    assert lines[0] == doc.sentences[0].comments[0]
    assert lines[1] == doc.sentences[0].comments[1]
    assert lines[2] == doc.sentences[0].comments[2]

    sentences = CoNLL.doc2conll(doc)
    assert len(sentences) == 1

    sentence = sentences[0]
    assert len(sentence) == 14
    assert lines[0] == sentence[0]
    assert lines[1] == sentence[1]
    assert lines[2] == sentence[2]

def predict(self, eval_file_or_string):
    eval_file = _read_conllu_arg(eval_file_or_string, self.feature_config, predict=True)
    doc = Document(CoNLL.conll2dict(input_file=eval_file))
    batch = DataLoader(doc, self.batch_size, self.loaded_args, self.pretrain,
                       vocab=self.vocab, evaluation=True, sort_during_eval=True)
    preds = []
    if len(batch) > 0:
        for i, b in enumerate(batch):
            preds += self.trainer.predict(b)
    preds = utils.unsort(preds, batch.data_orig_idx)
    batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
    doc_conll = CoNLL.convert_dict(batch.doc.to_dict())
    conll_string = CoNLL.conll_as_string(doc_conll)
    return conll_string

def lemmatize(lemmatizer, conllu, morphs):
    def clean_final(text):
        finals = {"פ": "ף", "כ": "ך", "מ": "ם", "נ": "ן", "צ": "ץ"}
        if text[-1] in finals:
            text = text[:-1] + finals[text[-1]]
        return text

    def post_process(word, pos, lemma, morph):
        if word == lemma:
            if word + "\t" + pos in lex:
                if pos == "VERB" and "Fut" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos == "VERB" and "Pres" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos == "VERB" and "Part" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos in ["NOUN", "ADJ"] and "Plur" in morph:
                    lemma = lex[word + "\t" + pos]
        else:
            if "Plur" in morph and pos in ["NOUN", "ADJ"] and (word.endswith("ים") or word.endswith("ות")):
                lemma = lemma[:-2]
                if word.endswith("ות"):
                    lemma += "ה"
                lemma = clean_final(lemma)
        return lemma

    uposed = [[l.split("\t") for l in s.split("\n")] for s in conllu.strip().split("\n\n")]
    dicts = CoNLL.convert_conll(uposed)
    for sent in dicts:
        for tok in sent:
            tok["id"] = int(tok["id"][0])
    doc = Document(dicts)
    lemmatized = lemmatizer(doc)
    output = []
    counter = 0
    for sent in lemmatized.sentences:
        for tok in sent.tokens:
            word = tok.words[0]
            lemma = word.lemma
            if lemmatizer.do_post_process:
                lemma = post_process(word.text, word.upos, word.lemma, morphs[counter])
            row = [str(word.id), word.text, lemma, word.upos, word.xpos, '_', str(word.head), "_", "_", "_"]
            output.append("\t".join(row))
            counter += 1
        output.append("")
    lemmatized = "\n".join(output)
    lemmatized = get_col(lemmatized, 2)
    return lemmatized

def test_depparse_with_pretagged_doc():
    nlp = stanza.Pipeline(**{
        'processors': 'depparse',
        'dir': TEST_MODELS_DIR,
        'lang': 'en',
        'depparse_pretagged': True
    })

    doc = CoNLL.conll2doc(input_str=EN_DOC_CONLLU_PRETAGGED)
    processed_doc = nlp(doc)

    assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join(
        [sent.dependencies_string() for sent in processed_doc.sentences])

def parse_to_conll(self, fin, nlp_str="stanza"):
    with open(fin, 'r', encoding="utf-8") as readinput:
        ri = readinput.read()
    doc = self.nlp(ri)
    if nlp_str == "stanza":
        dicts = doc.to_dict()
        conll = CoNLL.convert_dict(dicts)
    elif nlp_str == "udpipe":
        stc = self._sentences_to_conllu(doc)
        conll = list(stc)
    return conll

def prep_conllu(tb, file_path, overwrite):
    out_file = out_dir.joinpath(file_path.name)
    if out_file.exists() and not overwrite:
        print(f"{out_file.name} exists; skipping")
        return None
    lang, tb, tb_kwargs = determine_treebank(tb)
    if not lang:
        shutil.copy(file_path, out_file)
        return None
    doc = Document(CoNLL.conll2dict(input_file=file_path))
    nlp = stanza.Pipeline(lang=lang, processors='tokenize,mwt,pos', tokenize_pretokenized=True)
    doc = nlp.processors['pos'].process(doc)
    return doc

def extract_stanza_conllu(doc, sentid):
    '''
    doc: a stanza doc object for the entire document
    sentid: an integer denoting sentence id in document (starts with 0)

    Output: conllu format string output of the sentence
    '''
    input_dict = doc.to_dict()
    conll = CoNLL.convert_dict(input_dict)
    string = ""
    for item in conll[sentid]:
        string += "\t".join(item)
        string += "\n"
    return string

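# Illustrative usage sketch (added, not from the original source): assumes English
# stanza models are installed; prints the CoNLL-U block of the second sentence.
def _example_extract_stanza_conllu():
    import stanza
    nlp = stanza.Pipeline("en", processors="tokenize,pos,lemma,depparse")
    doc = nlp("First sentence here. Second sentence here.")
    print(extract_stanza_conllu(doc, sentid=1))
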
def main():
    args = arguments()
    stanza.download(args.language)
    nlp = stanza.Pipeline(args.language, processors="tokenize,mwt,pos,lemma,depparse")
    for fh in args.TEXT:
        filename = os.path.basename(fh.name)
        text = fh.read()
        doc = nlp(text)
        dicts = doc.to_dict()
        conll = CoNLL.convert_dict(dicts)
        with open(os.path.join(args.output_dir, filename + ".conllu"), mode="w", encoding="utf-8") as out:
            for sentence in conll:
                out.write("\n".join(("\t".join(token) for token in sentence)))
                out.write("\n\n")