def main():
    parser = argparse.ArgumentParser(description='GramEval Test')
    parser.add_argument('test_file', type=str, help='Input dir for test file')
    parser.add_argument('gold_file', type=str, help='Input dir for gold file')
    args = parser.parse_args()
    if len(sys.argv) != 3:
        print('evaluate.py <test_file> <gold_file>')
        #sys.exit(-1)

    print('loading files...')
    testfile = args.test_file
    goldfile = args.gold_file
    test_data = pyconll.load_from_file(testfile)
    gold_data = pyconll.load_from_file(goldfile)

    morph_score, lem_score, synt_score, pos_score, errors, alignment_score = compare(test_data, gold_data)
    quality = mean([morph_score, pos_score, lem_score, synt_score])

    print('\nOverall quality:', quality)
    print('\nDetails:\n',
          '\nPOS quality:', pos_score,
          '\nMorphological features:', morph_score,
          '\nLemmatization:', lem_score,
          '\nUAS:', synt_score)
    print('\nAlignment score (should be 1.0, otherwise the tokenization is corrupted)', alignment_score, '\n')
    print(errors)
def evaluate(gold_conllu_filename, parsed_conllu_filename):
    gold_sentences = pyconll.load_from_file(gold_conllu_filename)
    parsed_sentences = pyconll.load_from_file(parsed_conllu_filename)
    assert len(gold_sentences) == len(parsed_sentences)

    overall_parsing_counts = defaultdict(lambda: {
        "predicted": 0,
        "gold": 0,
        "correct": 0
    })

    for gold_sentence, parsed_sentence in zip(gold_sentences, parsed_sentences):
        assert len(gold_sentence) == len(parsed_sentence)

        gold_relations = get_propagated_relations(gold_sentence)
        parsed_relations = get_propagated_relations(parsed_sentence)
        correct_relations = gold_relations & parsed_relations

        for gold_relation in gold_relations:
            label = get_simplified_label(gold_relation)
            overall_parsing_counts[label]["gold"] += 1
            overall_parsing_counts["TOTAL"]["gold"] += 1

        for parsed_relation in parsed_relations:
            label = get_simplified_label(parsed_relation)
            overall_parsing_counts[label]["predicted"] += 1
            overall_parsing_counts["TOTAL"]["predicted"] += 1

        for correct_relation in correct_relations:
            label = get_simplified_label(correct_relation)
            overall_parsing_counts[label]["correct"] += 1
            overall_parsing_counts["TOTAL"]["correct"] += 1

    return overall_parsing_counts, compute_prf_all_labels(overall_parsing_counts)
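# A hedged sketch of what the compute_prf_all_labels helper called above might
# do, assuming the {"predicted", "gold", "correct"} count structure built in
# evaluate(); the name and body below are illustrative, not the original helper.
def _compute_prf_all_labels_sketch(parsing_counts):
    scores = {}
    for label, counts in parsing_counts.items():
        precision = counts["correct"] / counts["predicted"] if counts["predicted"] else 0.0
        recall = counts["correct"] / counts["gold"] if counts["gold"] else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        scores[label] = {"precision": precision, "recall": recall, "f1": f1}
    return scores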
def create_dataset(UD_PATH, LANG_DIR, PREFIX):
    """
    Read all the conll files (test, dev, train) and build the word/tag lists
    plus the train and test splits.

    Args:
        UD_PATH: the universal dependencies main folder
            (viz. C:/Users/DELL/Downloads/ud-treebanks-v2.6/ud-treebanks-v2.6/)
        LANG_DIR: the subfolder in UD_PATH containing all the relevant conll data
            (viz. 'UD_Afrikaans-AfriBooms/', 'UD_Persian-Seraji/', etc.)
        PREFIX: the prefix before each train/test/dev conll file, viz.
            'af_afribooms-ud-', 'fa_seraji-ud-'; the UD files have a standard format
            like af_afribooms-ud-train.conllu, af_afribooms-ud-dev.conllu,
            af_afribooms-ud-test.conllu

    Returns:
        tag_list: list of all tags encountered, concatenated into a single list
        word_list: list of all words/punctuations encountered, concatenated into a single list
        X_train: list of all training+dev sentences, where each sentence is in turn
            a list of the words and punctuations in it.
        Y_train: list of all training+dev tags, corresponding to each sentence in
            X_train, same format as X_train
        X_test: list of all test sentences, where each sentence is in turn a list
            of the words and punctuations in it.
        Y_test: list of all test tags, corresponding to each sentence in X_test,
            same format as X_test
    """
    train_conll = pyconll.load_from_file(UD_PATH + LANG_DIR + PREFIX + 'train.conllu')
    word_list = []
    tag_list = []
    X_train = []
    Y_train = []

    # train data
    for sentence in train_conll:
        for word in sentence:
            word_list.append(word.lemma)
            tag_list.append(word.upos)
        X_train.append([word.lemma for word in sentence])
        Y_train.append([word.upos for word in sentence])

    # dev data
    # If the dev file exists, fuse it with the training dataset. Otherwise no worries.
    # We do this because our version of HMM doesn't involve iterative training.
    try:
        dev_conll = pyconll.load_from_file(UD_PATH + LANG_DIR + PREFIX + 'dev.conllu')
        for sentence in dev_conll:
            for word in sentence:
                word_list.append(word.lemma)
                tag_list.append(word.upos)
            X_train.append([word.lemma for word in sentence])
            Y_train.append([word.upos for word in sentence])
    except FileNotFoundError:
        print("Dev set not found, no worries! Train set must be good enough.")

    # test data
    test_conll = pyconll.load_from_file(UD_PATH + LANG_DIR + PREFIX + 'test.conllu')
    X_test = []
    Y_test = []
    for sentence in test_conll:
        X_test.append([word.lemma for word in sentence])
        Y_test.append([word.upos for word in sentence])

    return tag_list, word_list, X_train, Y_train, X_test, Y_test
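# Hedged usage sketch for create_dataset; the UD paths below are placeholders
# for a local ud-treebanks-v2.6 checkout, not files shipped with this code.
def _example_create_dataset():
    tag_list, word_list, X_train, Y_train, X_test, Y_test = create_dataset(
        UD_PATH="ud-treebanks-v2.6/",
        LANG_DIR="UD_Afrikaans-AfriBooms/",
        PREFIX="af_afribooms-ud-")
    print(len(X_train), "train sentences,", len(X_test), "test sentences")
    print("distinct UPOS tags:", len(set(tag_list)))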
def build_POS_tagging_data(source_treebank_name="UD_Hebrew-HTB",
                           git_hash="82591c955e86222e32531336ff23e36c220b5846"):
    if not os.path.isdir(source_treebank_name):
        print("fetching the data source from github...")
        subprocess.run(["git", "clone", "https://github.com/UniversalDependencies/" + source_treebank_name])
        subprocess.run(["git", "checkout", git_hash], cwd=source_treebank_name)
    print("transforming the data source...")
    conllu1 = pyconll.load_from_file(
        Path.cwd().joinpath(source_treebank_name, get_single_file(source_treebank_name, 'train')))
    conllu2 = pyconll.load_from_file(
        Path.cwd().joinpath(source_treebank_name, get_single_file(source_treebank_name, 'dev')))
    return _conllu_transformer_POS_tagging(conllu1) + _conllu_transformer_POS_tagging(conllu2)
def build_verb_in_context_data(source_treebank_name="UD_Hebrew-HTB",
                               git_hash="82591c955e86222e32531336ff23e36c220b5846"):
    '''
    Transforms the conllu files into lists where each entry is a segment with a
    verb/non-verb indication.
    '''
    if not os.path.isdir(source_treebank_name):
        print("fetching the data source from github...")
        subprocess.run(["git", "clone", "https://github.com/UniversalDependencies/" + source_treebank_name])
        subprocess.run(["git", "checkout", git_hash], cwd=source_treebank_name)
    print("transforming the data source...")
    conllu1 = pyconll.load_from_file(
        Path.cwd().joinpath(source_treebank_name, get_single_file(source_treebank_name, 'train')))
    conllu2 = pyconll.load_from_file(
        Path.cwd().joinpath(source_treebank_name, get_single_file(source_treebank_name, 'dev')))
    return _conllu_transformer(conllu1) + _conllu_transformer(conllu2)
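# Hedged sketch only: _conllu_transformer is defined elsewhere in the original
# project. Assuming it marks verbs via their UPOS tag, it could look roughly like
# this per-sentence (form, is_verb) pairing; the real helper may segment differently.
def _conllu_transformer_sketch(conllu):
    data = []
    for sentence in conllu:
        data.append([(token.form, token.upos == "VERB") for token in sentence])
    return data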
def __init__(self, path, max_seq_len, model_type, max_count=10**6):
    sents, labels = [], []
    count = 0
    # filename should always be of the form <lang>.<split>
    self.lang = path.split("/")[-1].split(".")[0]
    cached_features_file = path + f".{max_count}.th"
    if os.path.exists(cached_features_file):
        logger.info(f"Loading features from cached file {cached_features_file}")
        self.features = torch.load(cached_features_file)
    else:
        logger.info(f"Saving features into cached file {cached_features_file}")
        tagged_sentences = pyconll.load_from_file(path)
        for ts in tagged_sentences:
            t, l = [], []
            for token in ts:
                if token.upos and token.form:
                    t.append(token.form)
                    l.append(token.upos)
            for idx in range(0, len(ts), max_seq_len):
                sents.append(t[idx:idx + max_seq_len])
                labels.append(l[idx:idx + max_seq_len])
            count += 1
            if count > max_count:
                break
        label_map = {l: idx for idx, l in enumerate(get_pos_labels())}
        tokenizer = AutoTokenizer.from_pretrained(model_type)
        self.features = convert_examples_to_features(
            sents, labels, label_map, max_seq_len, tokenizer)
        torch.save(self.features, cached_features_file)
def load_predictions(args):
    # Regular CoNLL-U format
    if args.pred_upos_index == 3 and args.pred_xpos_index == 4 and args.pred_feats_index == 5:
        return pyconll.load_from_file(args.prediction)
    # other format
    else:
        s = ""
        with open(args.prediction, 'r') as pred_file:
            for line in pred_file:
                if line.strip() == "":
                    s += line
                elif line.startswith("#"):
                    s += line
                else:
                    elements = line.split("\t")
                    if args.pred_upos_index >= 0 and args.pred_upos_index < len(elements):
                        upos = elements[args.pred_upos_index].strip()
                    else:
                        upos = "_"
                    if args.pred_xpos_index >= 0 and args.pred_xpos_index < len(elements):
                        xpos = elements[args.pred_xpos_index].strip()
                    else:
                        xpos = "_"
                    if args.pred_feats_index >= 0 and args.pred_feats_index < len(elements):
                        feats = elements[args.pred_feats_index].strip()
                    else:
                        feats = "_"
                    s += "0\t_\t_\t{}\t{}\t{}\t0\t_\t_\t_\n".format(upos, xpos, feats)
        return pyconll.load_from_string(s)
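# Hedged usage sketch: a hypothetical argparse setup whose attribute names line
# up with what load_predictions reads (args.prediction, args.pred_upos_index,
# args.pred_xpos_index, args.pred_feats_index). Not part of the original code.
import argparse

def _example_prediction_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--prediction", required=True)
    parser.add_argument("--pred-upos-index", type=int, default=3)
    parser.add_argument("--pred-xpos-index", type=int, default=4)
    parser.add_argument("--pred-feats-index", type=int, default=5)
    return parser.parse_args()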
def main():
    """Main method.

    Delete all previously changed treebanks to avoid mixing, then change the
    treebanks as specified. If more than one relation should be changed,
    treebanks for all combinations of changed relations are created. The input
    treebanks are searched in "ud/"; the output treebanks are saved in
    "ud-harmony/".
    """
    shutil.rmtree("ud-harmony/" + treebank_path)
    os.mkdir("ud-harmony/" + treebank_path)
    for language in list(changes.keys()):
        change = changes[language]
        for i in range(1, 1 << len(change[2])):
            relations = [
                change[2][j] for j in range(len(change[2])) if (i & (1 << j))
            ]
            for t in change[0]:
                indir = "ud/" + treebank_path + "/UD_" + language + "-" + t
                outdir = "ud-harmony/" + treebank_path + "/UD_" + language + "-" + t
                if not os.path.exists(outdir):
                    os.mkdir(outdir)
                outdir += "/" + "+".join(relations)
                if not os.path.exists(outdir):
                    os.mkdir(outdir)
                for file in os.scandir(indir):
                    if file.name.endswith(".conllu"):
                        corpus = pyconll.load_from_file(file)
                        corpus = change_corpus(language, corpus, relations)
                        with open(outdir + "/" + file.name, "w") as f:
                            f.write(corpus.conll())
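# Illustration of the bitmask loop above: for each language it enumerates every
# non-empty subset of the relations listed in change[2]. Standalone sketch:
def _enumerate_relation_subsets(relations):
    subsets = []
    for i in range(1, 1 << len(relations)):
        subsets.append([relations[j] for j in range(len(relations)) if i & (1 << j)])
    return subsets

# _enumerate_relation_subsets(["aux", "cop"]) == [["aux"], ["cop"], ["aux", "cop"]]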
def read_sequences(file_name):
    data = pyconll.load_from_file(file_name)
    tokens = [[token.form for token in sent] for sent in data]
    tags = [[token.upos for token in sent] for sent in data]
    return [tokens, tags]
def load_as_conllu(self, predefined_splits: bool = False):
    """
    Load the DDT in CoNLL-U format.

    :param bool predefined_splits: if True, return the train/dev/test splits separately
    :return: a single pyconll.Conll, or a tuple of (train, dev, test)
        pyconll.Conll objects, depending on predefined_splits
    """
    # Placeholder list for the predefined parts of the dataset [train, dev, test]
    parts = [None, None, None]

    for i, part in enumerate(['train', 'dev', 'test']):
        file_name = "{}.{}{}".format(self.dataset_name, part, self.file_extension)
        file_path = os.path.join(self.dataset_dir, file_name)
        parts[i] = pyconll.load_from_file(file_path)

    # if predefined_splits: then we should return three files
    if predefined_splits:
        return parts

    # Merge the splits into one single dataset
    parts[0].extend(parts[1])
    parts[0].extend(parts[2])
    return parts[0]
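# Hedged usage sketch; "loader" stands for an instance of whatever class defines
# load_as_conllu (its dataset_dir, dataset_name and file_extension attributes are
# assumed to be configured elsewhere).
def _print_split_sizes(loader):
    train, dev, test = loader.load_as_conllu(predefined_splits=True)
    print("train/dev/test sentences:", len(train), len(dev), len(test))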
def test_load_from_file_and_url_equivalence():
    """
    Test that the Conll object created from a URL and a file is the same if the
    underlying source is the same.
    """
    TEST_CONLL_URL = 'https://myconllrepo.com/english/train'
    with open(fixture_location('long.conll')) as f:
        contents = f.read()
    responses.add(responses.GET, TEST_CONLL_URL, body=contents)

    url_c = load_from_url(TEST_CONLL_URL)
    file_c = load_from_file(fixture_location('long.conll'))

    assert len(url_c) == len(file_c)
    for i in range(len(url_c)):
        assert url_c[i].id == file_c[i].id
        assert url_c[i].text == file_c[i].text
        print(url_c[i].conll())
        print(file_c[i].conll())
        for url_token in url_c[i]:
            file_token = file_c[i][url_token.id]
            assert_token_members(url_token, file_token.id, file_token.form,
                                 file_token.lemma, file_token.upos,
                                 file_token.xpos, file_token.feats,
                                 file_token.head, file_token.deprel,
                                 file_token.deps, file_token.misc)
def ud_sentence_to_dict(udfile):
    # just put a space between each word token of the tree, without any other processing
    data = pyconll.load_from_file(udfile)
    context_dict = {}
    for sentence in data:
        s_id = sentence.source.splitlines()[0]  # sentence id
        _sentence = ""
        # print(_id)
        for token in sentence:
            wlist = token.conll().split()
            w = wlist[1]  # word
            # --------------- Replace
            if w == '-LRB-':
                w = '('
            if w == '-RRB-':
                w = ')'
            # ----------------
            if not _sentence:
                _sentence = w
                continue
            else:
                _sentence = _sentence + ' ' + w
        # print(_sentence)
        context_dict[s_id] = _sentence
    return context_dict
def load_standardsplit(self):
    print("Loading Universal Dependencies " + self.config['data_path'])
    self.X_raw = defaultdict(list)

    # get list of files
    filenames = {}
    self.part_mapping = {"trn": "train", "dev": "dev", "tst": "test"}
    for file in os.listdir(self.config['data_path']):
        for part, part_conll in self.part_mapping.items():
            mymatch = re.match(r".+?" + part_conll + r"\.conllu", file)
            if mymatch is not None:
                filenames[part] = file
    assert len(filenames) == 3, "Some file is missing..."
    print(filenames)

    for part in self.part_mapping:
        print(part)
        # for part in ["dev"]:
        corpus = pyconll.load_from_file(
            os.path.join(self.config['data_path'], filenames[part]))
        for sentence in corpus:
            tmp_sentence = []
            for token in sentence:
                if "-" in token.id:
                    continue
                if self.config['upos']:
                    tmp_sentence.append([token.form, token.upos])
                else:
                    tmp_sentence.append([token.form, token.xpos])
            if self.config['shuffle_seqs']:
                random.shuffle(tmp_sentence)
            self.X_raw[part].append(np.array(tmp_sentence))

    for part in ["trn", "dev", "tst"]:
        # for part in ["dev"]:
        self.X_raw[part] = np.array(self.X_raw[part])
        print(part, self.X_raw[part].shape)
def __init__(self, data_dir, language):
    """Initializes the dataset from the provided input data url"""
    assert self.dataset_fn is not None, "You need to use the subclasses of PerseusDataset"

    # First we try to load the vocab pickle; if it doesn't exist yet we must create it
    vocab_path = path.join(data_dir, "vocab-%s.p" % language)
    init_vocab = not path.isfile(vocab_path)
    if not init_vocab:
        print("Loading vocabulary from cache: %s" % vocab_path)
        with open(vocab_path, "rb") as f:
            self.vocab = pickle.load(f)
    else:
        print("Creating vocabulary for %s" % language)
        self.vocab = Vocab(
            words={"<UNK>": 0},
            chars={"<UNK>": 0},
            tags=[{"<UNK>": 0} for _ in range(self.NUM_TAGS)]
        )

    # Now that we've decided whether we have a premade vocab, parse the dataset
    # into tokenized data that the model can work with
    self.sentences = []
    for sentence in pyconll.load_from_file(path.join(data_dir, self.dataset_fn)):
        tokenized_sentence = []
        for token in sentence:
            word, characters, tags = token.form, list(token.form), token.xpos
            assert len(tags) == 9, "Tags should always have a length of 9"
            tokenized_sentence.append(
                PerseusDataset.get_ids(self.vocab, word, characters, tags, expand_vocab=init_vocab))
        self.sentences.append(tokenized_sentence)

    # If we have just created the vocabulary, let's save it just in case
    if init_vocab:
        self.save_vocab(vocab_path)
def evaluate_file(args, inconsistencies):
    print("Prediction file: ", args.prediction)
    print("Gold file: ", args.gold)

    pred_file = load_predictions(args)
    gold_file = pyconll.load_from_file(args.gold)

    if len(pred_file) != len(gold_file):
        print("Number of sentences does not match!")
        print("Prediction: {} Gold: {}".format(len(pred_file), len(gold_file)))
        return

    upos_evaluator = evaluator.Evaluator(mode="exact")
    xpos_evaluator = evaluator.Evaluator(mode="exact")
    feats_evaluator = evaluator.Evaluator(mode="by_feats")
    ufeats_evaluator = evaluator.Evaluator(mode="exact", only_univ=True)
    upos_feats_evaluator = evaluator.Evaluator(mode="by_feats")

    incons_count = 0
    token_count = 0

    for pred_sent, gold_sent in zip(pred_file, gold_file):
        if len(pred_sent) != len(gold_sent):
            print("Number of words in sentence does not match!")
            print("Prediction: {} Gold: {}".format(len(pred_sent), len(gold_sent)))
            print("Prediction:", pred_sent._meta)
            print("Gold:", gold_sent._meta)
            continue

        for pred_token, gold_token in zip(pred_sent, gold_sent):
            if args.upos:
                upos_evaluator.add_instance({POS: gold_token.upos}, {POS: pred_token.upos})
            if args.xpos:
                xpos_evaluator.add_instance({POS: gold_token.xpos}, {POS: pred_token.xpos})
            if args.feats:
                gold_feats = {x: ",".join(gold_token.feats[x]) for x in gold_token.feats}
                pred_feats = {x: ",".join(pred_token.feats[x]) for x in pred_token.feats}
                feats_evaluator.add_instance(gold_feats, pred_feats)
                ufeats_evaluator.add_instance(gold_feats, pred_feats)
                if args.upos:
                    if args.incons:
                        token_count += 1
                        if len(set(pred_feats.keys()) & inconsistencies[pred_token.upos]) > 0:
                            incons_count += 1
                    gold_feats.update({POS: gold_token.upos})
                    pred_feats.update({POS: pred_token.upos})
                    upos_feats_evaluator.add_instance(gold_feats, pred_feats)

    if upos_evaluator.instance_count > 0:
        print("UPOS accuracy {:.2f}%".format(100 * upos_evaluator.acc()))
    if xpos_evaluator.instance_count > 0:
        print("XPOS accuracy {:.2f}%".format(100 * xpos_evaluator.acc()))
    if feats_evaluator.instance_count > 0:
        print("FEATS micro-F1 {:.2f}%".format(100 * feats_evaluator.micro_f1()))
    if upos_feats_evaluator.instance_count > 0:
        print("UPOS+FEATS micro-F1 {:.2f}%".format(100 * upos_feats_evaluator.micro_f1()))
    if ufeats_evaluator.instance_count > 0:
        print("UFEATS accuracy {:.2f}%".format(100 * ufeats_evaluator.acc()))
    if token_count > 0:
        print("UFEATS inconsistencies {:.2f}%".format(100 * incons_count / token_count))
    print()
def get_dane_data(split: str = 'train', limit: int = None, dir: str = None) -> dict:
    """Load DaNE data split.

    Loads a single data split from the DaNE data set kindly hosted by
    [Alexandra Institute](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Args:
        split (str, optional): Choose which split to load. Choose from 'train',
            'dev' and 'test'. Defaults to 'train'.
        limit (int, optional): Limit the number of observations to be returned
            from a given split. Defaults to None, which implies that the entire
            data split is returned.
        dir (str, optional): Directory where data is cached. If set to None,
            the function will try to look for files in '.dane' folder in home
            directory.

    Returns:
        dict: Dictionary with word-tokenized 'sentences' and named entity
        'tags' in IOB format.

    Examples:
        Get test split
        >>> get_dane_data('test')

        Get first 5 observations from training split
        >>> get_dane_data('train', limit = 5)
    """
    assert isinstance(split, str)
    splits = ['train', 'dev', 'test']
    assert split in splits, f'Choose between the following splits: {splits}'

    # set to default directory if nothing else has been provided by user.
    if dir is None:
        dir = os.path.join(str(Path.home()), '.dane')
    assert os.path.isdir(dir), f'Directory {dir} does not exist. Try downloading DaNE data with download_dane_data()'

    file_path = os.path.join(dir, f'ddt.{split}.conllu')
    assert os.path.isfile(file_path), f'File {file_path} does not exist. Try downloading DaNE data with download_dane_data()'

    split = pyconll.load_from_file(file_path)

    sentences = []
    entities = []
    for sent in split:
        sentences.append([token.form for token in sent._tokens])
        entities.append([token.misc['name'].pop() for token in sent._tokens])

    if limit is not None:
        sentences = sentences[:limit]
        entities = entities[:limit]

    return {'sentences': sentences, 'tags': entities}
def main(args: Namespace):
    print("Configurations: " + str(args))
    training_set = pyconll.load_from_file(args.training_set_path)
    if args.validation_set_path is not None:
        training_set += pyconll.load_from_file(args.validation_set_path)
    data = Do.DataOrganizer(training_set=training_set,
                            storing_method=args.storing_method,
                            storing_prob_method=args.storing_prob_method)
    va = ViterbiAlgorithm(data=data, smoothing_type=args.smoothing_method)
    accuracy, baseline, unknown_word = test_accuracy(
        file_path=args.test_set_path,
        viterbi_algorithm=va,
        storing_method=args.storing_method)
    print("baseline: " + str(baseline) + "%")
    print("Pos tagger accuracy: " + str(accuracy) + "%")
    print("unknown words: " + str(unknown_word))
def read_conll(file_name):
    data = pyconll.load_from_file(file_name)
    tags = [[
        token.upos if token.upos in ["C", "E"] else "_" for token in sent
    ] for sent in data]
    return tags
def _conllu_to_tokens(path: str) -> Set[Dict[str, str]]:
    """Return the annotated tokens from a CoNLL-U file."""
    tokens = set()
    for sentence in tqdm.tqdm(pyconll.load_from_file(path)):
        for token in sentence:
            tokens.add(_HashableDict(_flatten_token(token)))
    return tokens
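# Hedged usage sketch: the set return type suggests comparing annotations
# between two CoNLL-U files; the argument names below are placeholders.
def _diff_annotations(gold_path: str, system_path: str) -> None:
    gold_tokens = _conllu_to_tokens(gold_path)
    system_tokens = _conllu_to_tokens(system_path)
    print(len(gold_tokens - system_tokens), "gold tokens not matched in the system file")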
def create_reinflection_file(in_file, out_file):
    """
    :param in_file: UD annotated input file name
    :param out_file: output file name
    """
    conll = load_from_file(in_file)
    lines = get_lines(conll)
    with open(out_file, "w") as f:
        f.writelines(lines)
def test_no_nonprojectivities():
    """
    Test with a sentence with no non-projective dependencies.
    """
    c = load_from_file(fixture_location('projectivities.conll'))
    sent = c[0]
    deps = find_nonprojective_deps(sent)

    assert not deps
def read_conll(file_name):
    data = pyconll.load_from_file(file_name)
    tags = [[
        "<span class='" + (token.upos if token.upos in ["C", "E"] else "O") + "'>" + token.form + "</span>"
        for token in sent
    ] for sent in data]
    return tags
def test_ngram_standard():
    """
    Test if the find_ngram method works for standard situations.
    """
    c = load_from_file(fixture_location('basic.conll'))
    s, i = next(find_ngrams(c, 'un film sur la'.split()))

    assert s.id == 'fr-ud-dev_00001'
    assert i == 2
def test_ngram_first_word_match():
    """
    Test that a first word match is not enough to match.
    """
    c = load_from_file(fixture_location('long.conll'))
    it = find_ngrams(c, 'un cabinet'.split())

    with pytest.raises(StopIteration):
        next(it)
def test_ngram_none():
    """
    Test that no ngram is identified when none exist.
    """
    c = load_from_file(fixture_location('long.conll'))
    it = find_ngrams(c, 'cabinet'.split())

    with pytest.raises(StopIteration):
        next(it)
def _change_galician_cc_extract_feature_vectors(file):
    """Extract feature vectors for the non-"cc" dependents of the first
    conjunct in a conjunction.

    (The dependents can be conjuncts of the conjunction or other dependents of
    the first conjunct.)

    Args:
        file (str): "train" or "test". Load the corresponding TreeGal treebank.

    Returns:
        list of (list of int): The feature vectors.
        list of int: The classes. 1 for conjuncts, 0 for other dependents.
    """
    true_conjuncts = []
    false_conjuncts = []
    train_corpus = pyconll.load_from_file(
        "ud/" + treebank_path + "/UD_Galician-TreeGal/gl_treegal-ud-" + file + ".conllu")
    for sentence in train_corpus:
        for token in sentence:
            if token.deprel is None:
                continue
            dep_label = token.deprel
            if dep_label == "cc":
                conjunction = token.id
                conjunct_ = token.head
                conjunct1 = sentence[conjunct_].head
                if conjunct1 != "0":
                    for token2 in sentence:
                        if token2.head == conjunct1:
                            tokens = [token2.id]
                            ccs = []
                            a = True
                            while a:
                                a = False
                                for t in sentence:
                                    if t.head in tokens and t.id not in tokens + ccs:
                                        if t.deprel == "cc":
                                            ccs.append(t.id)
                                        else:
                                            tokens.append(t.id)
                                        a = True
                            tokens = sorted(tokens)
                            d = _change_galician_cc_mask(
                                sentence[conjunct1].upos, token2.upos,
                                int(token2.id) - int(conjunct1),
                                str(int(tokens[0]) - 1) in ccs)
                            if token2.deprel == "conj":
                                true_conjuncts.append(d)
                            else:
                                false_conjuncts.append(d)
    X = true_conjuncts + false_conjuncts
    y = [1 for x in true_conjuncts] + [0 for x in false_conjuncts]
    return X, y
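# Illustration of the fixed-point while-loop above: it gathers the transitive
# dependents of a head, routing "cc" dependents into a separate list. A
# standalone sketch over (id, head, deprel) triples with string ids, mirroring
# the original's lexicographic sort:
def _collect_subtree(rows, root_id):
    tokens, ccs = [root_id], []
    changed = True
    while changed:
        changed = False
        for tok_id, head, deprel in rows:
            if head in tokens and tok_id not in tokens + ccs:
                (ccs if deprel == "cc" else tokens).append(tok_id)
                changed = True
    return sorted(tokens), ccs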
def ud_to_dict(udfile):
    data = pyconll.load_from_file(udfile)
    tree_dict = {}
    for sentence in data:
        sent_lines = sentence.source.splitlines()
        s_id = sent_lines[0]
        tree_dict[s_id] = sent_lines[1:]
    return tree_dict
def __init__(self, train, dev, test, use_v1, hack_v2):
    """
    Initializer

    :param train: file name of training set
    :param dev: file name of development set
    :param test: file name of test set
    :param use_v1: True if the sentences are annotated using UD v1.2
    """
    self.samples = []
    train_conll = load_from_file(train)
    dev_conll = load_from_file(dev)
    test_conll = load_from_file(test)
    self.train = samples_from_conll(train_conll, use_v1, hack_v2)
    self.dev = samples_from_conll(dev_conll, use_v1, hack_v2)
    self.test = samples_from_conll(test_conll, use_v1, hack_v2)
    self.tags = get_tags(self.train + self.dev + self.test)
    self._num_pos = get_num_upos(use_v1)
    self._num_labels = get_num_rel(use_v1)
def test_load_from_file():
    """
    Test that a CoNLL file can properly be loaded from a filename.
    """
    c = load_from_file(fixture_location('basic.conll'))
    sent = c[1]

    assert len(c) == 4
    assert len(sent) == 14
    assert sent['10'].form == 'donc'
def test_multiword_ignore():
    """
    Test that multiword tokens are ignored and do not cause errors.
    """
    c = load_from_file(fixture_location('projectivities.conll'))
    sent = c[3]
    deps = find_nonprojective_deps(sent)

    assert deps == [(sent['16'], sent['4'])]