def test_readonly():
    Document.add_property('some_property', 123)
    nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en')
    doc = nlp(EN_DOC)
    assert doc.some_property == 123
    with pytest.raises(ValueError):
        doc.some_property = 456

def __init__(self, data, word2idx, tolower=True):
    super(GraphData, self).__init__()
    g_p = utils.doc2graph(Document(data[config.pf]))
    g_h = utils.doc2graph(Document(data[config.hf]))
    self.edge_index_p = g_p.edge_index
    self.edge_index_h = g_h.edge_index
    # print(g_p.node_attr)
    # note: [ROOT] and [UNK] tokens must not be lowercased
    if tolower:
        self.x_p = torch.tensor([
            word2idx[w.lower() if w[0] != "[" or w[-1] != "]" else w]
            for w in g_p.node_attr
        ], dtype=torch.long)
        self.x_h = torch.tensor([
            word2idx[w.lower() if w[0] != "[" or w[-1] != "]" else w]
            for w in g_h.node_attr
        ], dtype=torch.long)
    else:
        print("not to lower")
        self.x_p = torch.tensor([word2idx[w] for w in g_p.node_attr], dtype=torch.long)
        self.x_h = torch.tensor([word2idx[w] for w in g_h.node_attr], dtype=torch.long)
    label_onehot = torch.zeros([1, config.NUM_CLASSES])
    label_onehot[0][data[config.lf]] = 1
    # label_onehot = label_onehot.squeeze()
    # print(label_onehot.size())
    self.label = label_onehot.to(dtype=torch.float)
    self.pid = data[config.idf]

def check_mwt(filename):
    """ Checks whether or not there are MWTs in the given conll file """
    doc = Document(CoNLL.conll2dict(filename))
    data = doc.get_mwt_expansions(False)
    return len(data) > 0

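# Hedged usage sketch for check_mwt above; the file path is a placeholder.
# French treebanks contain multi-word tokens such as "du" -> "de le", so a
# French training file is expected to return True.
has_mwt = check_mwt("fr_gsd-ud-train.conllu")
print(has_mwt)
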
def __init__(self, data, word2idx, tolower=True):
    super(GraphData, self).__init__()
    g_p = utils.doc2graph(Document(data[config.pf]))
    g_h = utils.doc2graph(Document(data[config.hf]))
    self.edge_index_p = g_p.edge_index
    self.edge_index_h = g_h.edge_index
    # print(g_p.node_attr)
    # note: [ROOT] and [UNK] tokens must not be lowercased
    if tolower:
        self.x_p = torch.tensor([
            word2idx[w.lower() if w[0] != "[" or w[-1] != "]" else w]
            for w in g_p.node_attr
        ], dtype=torch.long)
        self.x_h = torch.tensor([
            word2idx[w.lower() if w[0] != "[" or w[-1] != "]" else w]
            for w in g_h.node_attr
        ], dtype=torch.long)
    else:
        print("not to lower")
        self.x_p = torch.tensor([word2idx[w] for w in g_p.node_attr], dtype=torch.long)
        self.x_h = torch.tensor([word2idx[w] for w in g_h.node_attr], dtype=torch.long)
    self.label = data[config.lf]
    self.pid = data[config.idf]

def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]
    logger.debug('max_dec_len: %d' % loaded_args['max_dec_len'])

    # load data
    logger.debug("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)

    if len(batch) > 0:
        dict_preds = trainer.predict_dict(batch.doc.get_mwt_expansions(evaluation=True))
        # decide trainer type and run eval
        if loaded_args['dict_only']:
            preds = dict_preds
        else:
            logger.info("Running the seq2seq model...")
            preds = []
            for i, b in enumerate(batch):
                preds += trainer.predict(b)
            if loaded_args.get('ensemble_dict', False):
                preds = trainer.ensemble(batch.doc.get_mwt_expansions(evaluation=True), preds)
    else:
        # skip eval if dev data does not exist
        preds = []

    # write to file and score
    doc = copy.deepcopy(batch.doc)
    doc.set_mwt_expansions(preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)
    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        logger.info("MWT expansion score: {} {:.2f}".format(args['shorthand'], score * 100))

def test_dict_to_doc_and_doc_to_dict():
    doc = Document(DICT)
    dicts = doc.to_dict()
    dicts_tupleid = []
    for sentence in dicts:
        items = []
        for item in sentence:
            item['id'] = item['id'] if isinstance(item['id'], tuple) else (item['id'], )
            items.append(item)
        dicts_tupleid.append(items)
    assert dicts_tupleid == DICT

def _create_stanza_document(sentence_dicts: List[List[Dict[str, str]]],
                            document_text: str) -> Document:
    stanza_document = Document(sentence_dicts, text=document_text)
    contains_entities = False
    for sentence_index, sentence_dict in enumerate(sentence_dicts):
        first_token = sentence_dict[0]
        sentence_sentiment = first_token["sentence_sentiment"]
        if sentence_sentiment is not None:
            stanza_document.sentences[sentence_index].sentiment = sentence_sentiment
        if "ner" in first_token:
            contains_entities = True
    if contains_entities:
        stanza_document.build_ents()
    return stanza_document

def process(self, doc):
    """
    Run language detection on a string, a Document, or a list of either,
    route to language specific pipeline
    """
    # only return a list if given a list
    singleton_input = not isinstance(doc, list)
    if singleton_input:
        docs = [doc]
    else:
        docs = doc

    if docs and isinstance(docs[0], str):
        docs = [Document([], text=text) for text in docs]

    # run language identification
    docs_w_langid = self.lang_id_pipeline.process(docs)

    # create language specific batches, store global idx with each doc
    lang_batches = {}
    for doc in docs_w_langid:
        if doc.lang not in lang_batches:
            lang_batches[doc.lang] = []
        lang_batches[doc.lang].append(doc)

    # run through each language, submit a batch to the language specific pipeline
    for lang in lang_batches.keys():
        self._update_pipeline_cache(lang)
        self.pipeline_cache[lang](lang_batches[lang])

    # only return a list if given a list
    if singleton_input:
        return docs_w_langid[0]
    else:
        return docs_w_langid

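# Hedged usage sketch, assuming the process() above is part of stanza's
# MultilingualPipeline; the default constructor arguments are an assumption and
# the required language models must be available (e.g. via stanza.download).
from stanza.pipeline.multilingual import MultilingualPipeline

nlp = MultilingualPipeline()
docs = nlp.process(["This is an English sentence.", "C'est une phrase en français."])
for doc in docs:
    print(doc.lang, doc.sentences[0].words[0].text)
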
def _process_list(self, docs):
    """ Identify language of list of strings or Documents """
    if len(docs) == 0:
        # TO DO: what standard do we want for bad input, such as empty list?
        # TO DO: more handling of bad input
        return

    if isinstance(docs[0], str):
        docs = [Document([], text) for text in docs]

    docs_by_length = {}
    for doc in docs:
        text = LangIDProcessor.clean_text(doc.text) if self._clean_text else doc.text
        doc_length = len(text)
        if doc_length not in docs_by_length:
            docs_by_length[doc_length] = []
        docs_by_length[doc_length].append((doc, text))

    for doc_length in docs_by_length:
        inputs = [doc[1] for doc in docs_by_length[doc_length]]
        predictions = self._id_langs(self._text_to_tensor(inputs))
        for doc, lang in zip(docs_by_length[doc_length], predictions):
            doc[0].lang = lang

    return docs

def test_xpos_attribute():
    doc = Document(TEST_ONE_SENTENCE)
    response = semgrex.process_doc(doc, "{tag:NNP}=source <=zzz {word:Opal}=target")
    check_response(response, response_len=1, source_index=2, reln='compound')
    response = semgrex.process_doc(doc, "{pos:NNP}=source <=zzz {word:Opal}=target")
    check_response(response, response_len=1, source_index=2, reln='compound')

def doc2graph(doc, word2idx=None):
    """
    2020/8/4 18:30
    input Stanza Document : doc
    output PytorchGeoData : G
    G = {
        x: id tensor
        edge_idx: edges, size = (2, l-1)
        edge_attr: (u, v, edge_type in str)
        node_attr: text
    }
    """
    if isinstance(doc, list):
        # convert to Document first if given the dict form ([[dict]])
        doc = Document(doc)

    # add a root token for each sentence
    e = [[], []]
    edge_info = []
    node_info = []
    prev_token_sum = 0
    prev_root_id = 0
    cur_root_id = 0
    # get original dependency
    for idx, sent in enumerate(doc.sentences):
        # sent.print_dependencies()  # debug: print the dependency parse
        # node info by index (add root at the beginning of every sentence)
        cur_root_id = len(node_info)
        node_info.append("[ROOT]")
        for token in sent.tokens:
            node_info.append(token.to_dict()[0]['text'])
        # edge info by index of u in edge (u, v)
        for dep in sent.dependencies:
            id1 = prev_token_sum + int(dep[0].to_dict()["id"])
            id2 = prev_token_sum + int(dep[2].to_dict()["id"])
            e[0].append(id1)
            e[1].append(id2)
            edge_info.append((id1, id2, dep[1]))
        prev_token_sum += len(sent.tokens) + 1
        # add links between sentence roots
        if cur_root_id != 0:
            id1 = prev_root_id
            id2 = cur_root_id
            e[0].append(id1)
            e[1].append(id2)
            edge_info.append((id1, id2, "bridge"))
        prev_root_id = cur_root_id

    # id to embeddings
    # x = torch.tensor([ for token in node_attr])
    # done building edges and nodes
    if word2idx is None:
        # x is not an id tensor here; the node text is kept in node_attr
        x = torch.tensor(list(range(doc.num_tokens + len(doc.sentences))))
    else:
        x = torch.tensor([word2idx[token] for token in node_info])
    e = torch.tensor(e)
    G = Data(x=x, edge_index=e, edge_attr=edge_info, node_attr=node_info)
    return G

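# Hedged usage sketch for doc2graph above; the pipeline configuration is an
# assumption (it requires depparse so that sent.dependencies is populated, and
# assumes the English models are installed).
import stanza

nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,lemma,depparse')
parsed = nlp("Stanza parses sentences into dependency graphs.")
g = doc2graph(parsed)
print(g.node_attr)          # ['[ROOT]', 'Stanza', 'parses', ...]
print(g.edge_index.shape)   # torch.Size([2, number_of_edges])
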
def test_dict_to_doc_and_doc_to_dict():
    """
    Test the conversion from raw dict to Document and back

    This code path will first turn start_char|end_char into start_char & end_char
    fields in the Document

    That version to a dict will have separate fields for each of those

    Finally, the conversion from that dict to a list of conll entries should
    convert that back to misc
    """
    doc = Document(DICT)
    dicts = doc.to_dict()
    dicts_tupleid = []
    for sentence in dicts:
        items = []
        for item in sentence:
            item['id'] = item['id'] if isinstance(item['id'], tuple) else (item['id'], )
            items.append(item)
        dicts_tupleid.append(items)
    conll = CoNLL.convert_dict(DICT)
    assert conll == CONLL

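# A minimal, hypothetical example of the [[dict]] format that Document accepts
# (the DICT fixture used above is not shown in this listing; these values are
# illustrative only).
EXAMPLE_DICT = [[
    {'id': (1,), 'text': 'Unban', 'lemma': 'unban', 'upos': 'VERB', 'head': 0, 'deprel': 'root'},
    {'id': (2,), 'text': 'Mox', 'lemma': 'Mox', 'upos': 'PROPN', 'head': 1, 'deprel': 'obj'},
]]
example_doc = Document(EXAMPLE_DICT)
assert example_doc.num_words == 2
assert example_doc.to_dict()[0][0]['text'] == 'Unban'
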
def __init__(self, data):
    super(GraphData, self).__init__()
    # graph (edge) info, not affected by batching
    g_p = utils.doc2graph(Document(data[config.pf]))
    g_h = utils.doc2graph(Document(data[config.hf]))
    self.edge_index_p = g_p.edge_index
    self.edge_index_h = g_h.edge_index
    self.edge_attr_p = g_p.edge_attr
    self.edge_attr_h = g_h.edge_attr
    # node info, affected by batching
    self.node_attr_p = g_p.node_attr
    self.node_attr_h = g_h.node_attr
    # one-hot label (1 x num_classes), already in batch-first form
    label_onehot = torch.zeros([1, config.NUM_CLASSES])
    label_onehot[0][data[config.lf]] = 1
    self.label = label_onehot.to(dtype=torch.float)
    # problem id, already in batch-first form
    self.pid = data[config.idf]

def bulk_process(self, docs):
    """
    Most processors operate on the sentence level, where each sentence is
    processed independently and processors can benefit a lot from the ability
    to combine sentences from multiple documents for faster batched processing.
    This is a transparent implementation that allows these processors to batch
    process a list of Documents as if they were from a single Document.
    """
    if hasattr(self, '_variant'):
        return self._variant.bulk_process(docs)

    combined_sents = [sent for doc in docs for sent in doc.sentences]
    combined_doc = Document([])
    combined_doc.sentences = combined_sents
    combined_doc.num_tokens = sum(doc.num_tokens for doc in docs)
    combined_doc.num_words = sum(doc.num_words for doc in docs)
    self.process(combined_doc)  # annotations are attached to sentence objects
    return docs

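# Hedged usage sketch: passing a list of Documents to a stanza Pipeline (as the
# test below also does) lets each processor batch them via bulk_process; the
# language and processor choice here are assumptions and the English models
# must be installed.
import stanza
from stanza.models.common.doc import Document

nlp = stanza.Pipeline(lang='en', processors='tokenize,pos')
in_docs = [Document([], text=t) for t in ["One sentence here.", "Another one there."]]
out_docs = nlp(in_docs)
print(out_docs[0].sentences[0].words[0].upos)
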
def test_bulk(pipeline):
    NUM_DOCS = 10
    raw_text = [FR_MWT_SENTENCE] * NUM_DOCS
    raw_doc = [Document([], text=doccontent) for doccontent in raw_text]

    result = pipeline(raw_doc)

    assert len(result) == NUM_DOCS
    for doc in result:
        compare_ignoring_whitespace(str(doc), EXPECTED_RESULT)
        assert len(doc.sentences) == 1
        assert doc.num_words == 26
        assert doc.num_tokens == 24

def lemmatize(lemmatizer, conllu, morphs):
    def clean_final(text):
        finals = {"פ": "ף", "כ": "ך", "מ": "ם", "נ": "ן", "צ": "ץ"}
        if text[-1] in finals:
            text = text[:-1] + finals[text[-1]]
        return text

    def post_process(word, pos, lemma, morph):
        if word == lemma:
            if word + "\t" + pos in lex:
                if pos == "VERB" and "Fut" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos == "VERB" and "Pres" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos == "VERB" and "Part" in morph:
                    lemma = lex[word + "\t" + pos]
                if pos in ["NOUN", "ADJ"] and "Plur" in morph:
                    lemma = lex[word + "\t" + pos]
            else:
                if "Plur" in morph and pos in ["NOUN", "ADJ"] and (
                        word.endswith("ים") or word.endswith("ות")):
                    lemma = lemma[:-2]
                    if word.endswith("ות"):
                        lemma += "ה"
        lemma = clean_final(lemma)
        return lemma

    uposed = [[l.split("\t") for l in s.split("\n")] for s in conllu.strip().split("\n\n")]
    dicts = CoNLL.convert_conll(uposed)
    for sent in dicts:
        for tok in sent:
            tok["id"] = int(tok["id"][0])
    doc = Document(dicts)
    lemmatized = lemmatizer(doc)

    output = []
    counter = 0
    for sent in lemmatized.sentences:
        for tok in sent.tokens:
            word = tok.words[0]
            lemma = word.lemma
            if lemmatizer.do_post_process:
                lemma = post_process(word.text, word.upos, word.lemma, morphs[counter])
            row = [str(word.id), word.text, lemma, word.upos, word.xpos, '_',
                   str(word.head), "_", "_", "_"]
            output.append("\t".join(row))
            counter += 1
        output.append("")
    lemmatized = "\n".join(output)
    lemmatized = get_col(lemmatized, 2)
    return lemmatized

def StanzaAPI(self, conllu):
    d = []
    e = []
    for s in conllu.split("\n"):
        if s == "" or s.startswith("#"):
            if e != []:
                d.append(list(e))
                e = []
        else:
            t = s.split("\t")
            e.append({"id": int(t[0]), "text": t[1], "lemma": t[2],
                      "upos": t[3], "xpos": t[4], "misc": t[9]})
    from stanza.models.common.doc import Document
    from stanza.utils.conll import CoNLL
    return CoNLL.conll_as_string(CoNLL.convert_dict(self.model(Document(d)).to_dict()))

def prep_conllu(tb, file_path, overwrite):
    out_file = out_dir.joinpath(file_path.name)
    if out_file.exists() and not overwrite:
        print(f"{out_file.name} exists; skipping")
        return None
    lang, tb, tb_kwargs = determine_treebank(tb)
    if not lang:
        shutil.copy(file_path, out_file)
        return None
    doc = Document(CoNLL.conll2dict(input_file=file_path))
    nlp = stanza.Pipeline(lang=lang, processors='tokenize,mwt,pos', tokenize_pretokenized=True)
    doc = nlp.processors['pos'].process(doc)
    return doc

def _encode_parse(sen, field_names) -> Document:
    """
    Converts from xtsv sentence to Stanza Document
    :param sen: An xtsv sentence
    :param field_names: Field names
    :return: Stanza Document containing one sentence
    """
    stanza_sentence = [{'id': i,
                        'text': line[field_names['form']],
                        'lemma': line[field_names['lemma']],
                        'upos': line[field_names['upostag']],
                        'feats': line[field_names['feats']],
                        } for i, line in enumerate(sen, start=1)]
    return Document([stanza_sentence])

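# Hedged usage sketch for _encode_parse with a two-token, xtsv-style sentence;
# the field order and token values are hypothetical.
field_names = {'form': 0, 'lemma': 1, 'upostag': 2, 'feats': 3}
sen = [['A', 'a', 'DET', 'Definite=Ind|PronType=Art'],
       ['house', 'house', 'NOUN', 'Number=Sing']]
doc = _encode_parse(sen, field_names)
print(doc.sentences[0].words[1].upos)   # NOUN
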
def retag_trees(trees, pipeline, xpos=True):
    """
    Retag all of the trees using the given processor

    Returns a list of new trees
    """
    sentences = []
    for tree in trees:
        tokens = [{TEXT: pt.children[0].label} for pt in tree.preterminals()]
        sentences.append(tokens)

    doc = Document(sentences)
    doc = pipeline(doc)
    if xpos:
        tag_lists = [[x.xpos for x in sentence.words] for sentence in doc.sentences]
    else:
        tag_lists = [[x.upos for x in sentence.words] for sentence in doc.sentences]

    new_trees = [replace_tags(tree, tags) for tree, tags in zip(trees, tag_lists)]
    return new_trees

def doc(sentences_dict):
    doc = Document(sentences_dict)
    return doc

def test_two_semgrex():
    doc = Document(TEST_ONE_SENTENCE)
    response = semgrex.process_doc(doc,
                                   "{}=source >obj=zzz {}=target",
                                   "{}=source >obj=zzz {}=target")
    check_response(response, semgrex_len=2)

def test_two_sentences():
    doc = Document(TEST_TWO_SENTENCES)
    response = semgrex.process_doc(doc, "{}=source >obj=zzz {}=target")
    check_response(response, response_len=2)

def test_lemma_attribute():
    doc = Document(TEST_ONE_SENTENCE)
    response = semgrex.process_doc(doc, "{lemma:Mox}=source <=zzz {lemma:Opal}=target")
    check_response(response, response_len=1, source_index=2, reln='compound')

        out_file = out_dir.joinpath(file_path.name)
        write_doc_to_file(doc, out_file)
    else:
        out_train_file = out_dir.joinpath(file_path.name)
        dev_name = file_path.name.split("-")[0] + "-ud-dev.conllu"
        out_dev_file = out_dir.joinpath(dev_name)
        out_dev_file.touch()
        sents = doc.to_dict()
        permutations = np.random.permutation(np.arange(len(sents))).tolist()
        divide = len(sents) * 7 // 8
        train_sents = [sents[idx] for idx in permutations[:divide]]
        dev_sents = [sents[idx] for idx in permutations[divide:]]
        train_doc = Document(train_sents)
        write_doc_to_file(train_doc, out_train_file)
        dev_doc = Document(dev_sents)
        write_doc_to_file(dev_doc, out_dev_file)
elif args.split == "test":
    kwargs = {}
    if args.case == "preprocess":
        kwargs['processors'] = 'tokenize,mwt,pos,lemma'
    if not args.lang_file:
        txt_files = list(ud_root.glob("*/*_*.txt"))
        assert len(txt_files) == 82, f"number of txt_files = {len(txt_files)}; should be 82"
    else:
        with open(args.lang_file) as fp:
            tbs = fp.read().splitlines()

def test_ner_attribute():
    doc = Document(TEST_ONE_SENTENCE)
    response = semgrex.process_doc(doc, "{cpos:PROPN}=source <=zzz {ner:GEM}=target")
    check_response(response, response_len=1, source_index=2, reln='compound')

def train(args):
    # load data
    logger.debug('max_dec_len: %d' % args['max_dec_len'])
    logger.debug("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab.size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)

    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_mwt_expander.pt'.format(args['save_dir'], args['shorthand'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.warning("Skip training because no data available...")
        return

    # train a dictionary-based MWT expander
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    logger.info("Training dictionary-based MWT expander...")
    trainer.train_dict(train_batch.doc.get_mwt_expansions(evaluation=False))
    logger.info("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get_mwt_expansions(evaluation=True))
    doc = copy.deepcopy(dev_batch.doc)
    doc.set_mwt_expansions(dev_preds)
    CoNLL.dict2conll(doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    logger.info("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        logger.info("Training seq2seq-based MWT expander...")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,
                                                  max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            logger.info("Evaluating on dev set...")
            dev_preds = []
            for i, batch in enumerate(dev_batch):
                preds = trainer.predict(batch)
                dev_preds += preds
            if args.get('ensemble_dict', False) and args.get('ensemble_early_stop', False):
                logger.info("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(dev_batch.doc.get_mwt_expansions(evaluation=True), dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.dict2conll(doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args['batch_size']  # avg loss per batch
            logger.info("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                logger.info("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1]:
                current_lr *= args['lr_decay']
                trainer.change_lr(current_lr)

            dev_score_history += [dev_score]

        logger.info("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))

        # try ensembling with dict if necessary
        if args.get('ensemble_dict', False):
            logger.info("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(dev_batch.doc.get_mwt_expansions(evaluation=True), best_dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.dict2conll(doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)
            logger.info("Ensemble dev F1 = {:.2f}".format(dev_score * 100))
            best_f = max(best_f, dev_score)

def process(self, text):
    sentence = [{'id': (i + 1, ), 'text': 'LOL'} for i, tok in enumerate(text.split())]
    return Document([sentence], text)

def __init__(self, gold, pred, verbose=False, group=False):
    """
    Align golden and predicted tokens, and their tags. Create dictionaries of falsely predicted tags
    :param gold: the gold conllu file
    :param pred: the predicted conllu file
    :param verbose: if true print information about token numbers
    :param group: if true, put falsely predicted ufeats labels into a dictionary that contains all the
        labels it was falsely assigned and the number of times each predicted label was found
    """
    gold = C.load_conll(open(gold, 'r', encoding='utf8'))
    gold_dic = C.convert_conll(gold)  # returns a dictionary with all the column names
    gold_doc = Document(gold_dic)

    pred = C.load_conll(open(pred, 'r', encoding='utf8'))
    pred_dic = C.convert_conll(pred)  # returns a dictionary with all the column names
    pred_doc = Document(pred_dic)

    # get the tokens
    self.gold_tokens = [j['text'] for i in gold_dic for j in i]
    self.pred_tokens = [j['text'] for i in pred_dic for j in i]

    # get upos tags
    gold_tags = [j['upos'] for i in gold_dic for j in i]
    pred_tags = [j['upos'] for i in pred_dic for j in i]

    # get xpos tags
    gold_xpos = [j['xpos'] for i in gold_dic for j in i]
    pred_xpos = [j['xpos'] for i in pred_dic for j in i]

    # get ufeats tags
    gold_feats = list()
    pred_feats = list()
    for i in gold_dic:
        for j in i:
            if 'feats' in j:
                gold_feats.append(j['feats'])
            else:
                gold_feats.append('_')
    for i in pred_dic:
        for j in i:
            if 'feats' in j:
                pred_feats.append(j['feats'])
            else:
                pred_feats.append('_')

    if verbose:
        print('Number of gold tokens:', len(self.gold_tokens),
              ', number of predicted tokens:', len(self.pred_tokens))

    # align gold and predicted tokens
    cost, a2b, b2a, a2b_multi, b2a_multi = align(self.gold_tokens, self.pred_tokens)

    # align tokens and their POS tags separately
    self.aligned = list()       # tokens
    self.aligned_pos = list()   # upos
    self.aligned_feats = list()
    self.aligned_xpos = list()
    for i in range(len(b2a)):
        t = (self.gold_tokens[b2a[i]], self.pred_tokens[i])
        self.aligned.append(t)
        p = (gold_tags[b2a[i]], pred_tags[i])
        self.aligned_pos.append(p)
        f = (gold_feats[b2a[i]], pred_feats[i])
        self.aligned_feats.append(f)
        x = (gold_xpos[b2a[i]], pred_xpos[i])
        self.aligned_xpos.append(x)

    # align predicted tags to golden tags, not vice versa as before
    gold_aligned = list()
    for i in range(len(a2b)):
        t = (self.gold_tokens[i], self.pred_tokens[a2b[i]])
        gold_aligned.append(t)

    overall = list()
    for (a, b) in self.aligned:
        if a == b:
            overall.append((a, b))
    if verbose:
        print('Aligned tokens. GOLD:', len(gold_aligned),
              'PREDICTED:', len(self.aligned), 'ALIGNED:', len(overall))

    self.conf_tags = {}      # falsely predicted upos tags
    self.conf_tags_all = {}  # all upos tags
    self.incorrect_upos = 0  # number of incorrectly predicted upos tags
    # how many times different tags cooccurred in gold and pred files
    i = 0
    for (a, b) in self.aligned_pos:
        if a != b:
            self.incorrect_upos += 1
            if (a, b) not in self.conf_tags:
                self.conf_tags[(a, b)] = 1
            else:
                self.conf_tags[(a, b)] += 1
        if (a, b) not in self.conf_tags_all:
            self.conf_tags_all[(a, b)] = 1
        else:
            self.conf_tags_all[(a, b)] += 1
        i += 1

    self.conf_feats = {}
    self.conf_feats_all = {}
    self.incorrect_feats = 0
    i = 0
    for (a, b) in self.aligned_feats:
        a = "|".join(sorted(feat for feat in a.split("|")
                            if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
        b = "|".join(sorted(feat for feat in b.split("|")
                            if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
        if a != b:
            self.incorrect_feats += 1
            # create a dictionary for each falsely predicted ufeats label and group all its false predictions
            if group:
                if a not in self.conf_feats:
                    self.conf_feats[a] = dict()
                    self.conf_feats[a][b] = 1
                else:
                    if b not in self.conf_feats[a]:
                        self.conf_feats[a][b] = 1
                    else:
                        self.conf_feats[a][b] += 1
            else:
                if (a, b) not in self.conf_feats:
                    self.conf_feats[(a, b)] = 1
                else:
                    self.conf_feats[(a, b)] += 1
        if (a, b) not in self.conf_feats_all:
            self.conf_feats_all[(a, b)] = 1
        else:
            self.conf_feats_all[(a, b)] += 1
        i += 1

    self.conf_xpos = {}
    self.incorrect_xpos = 0
    i = 0
    for (a, b) in self.aligned_xpos:
        if a != b:
            self.incorrect_xpos += 1
            if (a, b) not in self.conf_xpos:
                self.conf_xpos[(a, b)] = 1
            else:
                self.conf_xpos[(a, b)] += 1
        i += 1

def predict_with_pos(self, doc_dict):
    # fix potential memory leak
    torch.cuda.empty_cache()

    conllu_data = doc_dict["dep"]
    xml_data = doc_dict["xml"]
    # Remove any super tokens in input, we'll add them at the end
    conllu_data = re.sub(r'\n[0-9]+-[^\n]+\n', '\n', conllu_data)

    # First parse - just get best deprel and heads
    diaparsed = diaparse(self.nlp1, conllu_data)
    doc = CoNLL.load_conll(io.StringIO(diaparsed))

    # overwrite xpos with our ensemble xpos
    doc_with_our_xpos = CoNLL.conll2dict(input_str=conllu_data)
    replace_xpos(doc, doc_with_our_xpos)

    doc = [["\t".join(l) for l in sent] for sent in doc]
    doc = "\n\n".join(["\n".join(sent) for sent in doc])

    # Second parse - postprocess based on:
    # 1. auxiliary parser predictions trained on EWT for PP attachment disambiguation
    ewt_parse = diaparse(self.aux_parser, conllu_data)
    doc = add_second_deps(doc, ewt_parse)
    # 2. sequence tagger deprel predictions using high quality POS tags and embeddings
    doc = add_sequence_tagger_preds(self.sequence_tagger, doc)
    # 3. postprocessing rules to adjudicate these predictions in a harmonized way
    doc = depedit1.run_depedit(doc)

    # Add upos
    uposed = depedit2.run_depedit(doc)
    uposed = [[l.split("\t") for l in s.split("\n")] for s in uposed.strip().split("\n\n")]
    dicts = CoNLL.convert_conll(uposed)

    # Now add lemmas using Stanza based on pretagged predicted upos (converted from our predicted xpos)
    for sent in dicts:
        for tok in sent:
            tok["id"] = int(tok["id"])
    doc = Document(dicts)
    lemmatized = self.nlp2(doc)

    output = []
    for sent in lemmatized.sentences:
        for tok in sent.tokens:
            word = tok.words[0]
            row = [str(word.id), word.text, word.lemma, word.upos, word.xpos, '_',
                   str(word.head), word.deprel, "_", "_"]
            output.append("\t".join(row))
        output.append("")
    lemmatized = "\n".join(output)

    # Postprocess implausible lemmas (VBG ending in -ed, VBN ending in -ing, etc., incorrect -e restoration...)
    lemmatized = postprocess_lemmas(lemmatized)

    # Fix punctuation
    lemmatized = fix_punct(lemmatized)

    if "<text id=" in xml_data:
        docname = re.search(r'<text id="([^"]+)"', xml_data).group(1)
        morphed_and_enhanced = depedit3.run_depedit(lemmatized, sent_id=True, sent_text=True,
                                                    docname=docname, filename=docname)
    else:
        morphed_and_enhanced = depedit3.run_depedit(lemmatized, sent_text=True)

    if xml_data != "":
        xmled = conllu2xml(morphed_and_enhanced, xml_data)
    else:
        xmled = ""

    return {"dep": morphed_and_enhanced, "xml": xmled}
