def main():
    parser = argparse.ArgumentParser(description='GramEval Test')
    parser.add_argument('test_file', type=str, help='Path to the test file')
    parser.add_argument('gold_file', type=str, help='Path to the gold file')
    args = parser.parse_args()
    if len(sys.argv) != 3:
        print('Usage: evaluate.py <test_file> <gold_file>')
        sys.exit(-1)

    print('loading files...')

    testfile = args.test_file
    goldfile = args.gold_file

    test_data = pyconll.load_from_file(testfile)
    gold_data = pyconll.load_from_file(goldfile)

    morph_score, lem_score, synt_score, pos_score, errors, alignment_score = compare(test_data, gold_data)

    quality = mean([morph_score, pos_score, lem_score, synt_score])

    print('\nOverall quality:', quality)

    print('\nDetails:')
    print('POS quality:', pos_score)
    print('Morphological features:', morph_score)
    print('Lemmatization:', lem_score)
    print('UAS:', synt_score)

    print('\nAlignment score (should be 1.0, otherwise the tokenization is corrupted):', alignment_score, '\n')
    print(errors)
def evaluate(gold_conllu_filename, parsed_conllu_filename):
    gold_sentences = pyconll.load_from_file(gold_conllu_filename)
    parsed_sentences = pyconll.load_from_file(parsed_conllu_filename)

    assert len(gold_sentences) == len(parsed_sentences)

    overall_parsing_counts = defaultdict(lambda: {
        "predicted": 0,
        "gold": 0,
        "correct": 0
    })
    for gold_sentence, parsed_sentence in zip(gold_sentences,
                                              parsed_sentences):
        assert len(gold_sentence) == len(parsed_sentence)

        gold_relations = get_propagated_relations(gold_sentence)
        parsed_relations = get_propagated_relations(parsed_sentence)
        correct_relations = gold_relations & parsed_relations

        for gold_relation in gold_relations:
            label = get_simplified_label(gold_relation)
            overall_parsing_counts[label]["gold"] += 1
            overall_parsing_counts["TOTAL"]["gold"] += 1
        for parsed_relation in parsed_relations:
            label = get_simplified_label(parsed_relation)
            overall_parsing_counts[label]["predicted"] += 1
            overall_parsing_counts["TOTAL"]["predicted"] += 1
        for correct_relation in correct_relations:
            label = get_simplified_label(correct_relation)
            overall_parsing_counts[label]["correct"] += 1
            overall_parsing_counts["TOTAL"]["correct"] += 1

    return overall_parsing_counts, compute_prf_all_labels(
        overall_parsing_counts)
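# compute_prf_all_labels is referenced above but not included in this snippet.
# A minimal sketch of how per-label precision/recall/F1 could be derived from
# the predicted/gold/correct counts (an assumption, not the original helper):
def compute_prf_all_labels(counts):
    scores = {}
    for label, c in counts.items():
        precision = c["correct"] / c["predicted"] if c["predicted"] else 0.0
        recall = c["correct"] / c["gold"] if c["gold"] else 0.0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
        scores[label] = {"precision": precision, "recall": recall, "f1": f1}
    return scores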
def create_dataset(UD_PATH, LANG_DIR, PREFIX):
    """
    Read all the conll files (train, dev, test) and build the tag/word lists and the train/test splits.
    
    Args:
    UD_PATH:  the universal dependencies main folder (e.g. C:/Users/DELL/Downloads/ud-treebanks-v2.6/ud-treebanks-v2.6/)
    LANG_DIR: the subfolder in UD_PATH containing all the relevant conll data (e.g. 'UD_Afrikaans-AfriBooms/',
              'UD_Persian-Seraji/', etc.)
    PREFIX:   the prefix before each train/test/dev conll file, e.g. 'af_afribooms-ud-', 'fa_seraji-ud-';
              the UD files have a standard format like af_afribooms-ud-train.conllu, af_afribooms-ud-dev.conllu,
              af_afribooms-ud-test.conllu
    
    Returns:
    tag_list:     list of all UPOS tags encountered, concatenated into a single list
    word_list:    list of all word/punctuation lemmas encountered, concatenated into a single list
    X_train:      list of all training+dev sentences, where each sentence is in turn a list of its word/punctuation lemmas
    Y_train:      list of all training+dev tag sequences, corresponding to each sentence in X_train, same format as X_train
    X_test:       list of all test sentences, where each sentence is in turn a list of its word/punctuation lemmas
    Y_test:       list of all test tag sequences, corresponding to each sentence in X_test, same format as X_test
    """
    train_conll = pyconll.load_from_file(UD_PATH+LANG_DIR+PREFIX+'train.conllu')
    word_list = []
    tag_list = []
    X_train = []
    Y_train = []
    #train data
    for sentence in train_conll:
        for word in sentence:
            word_list.append(word.lemma)
            tag_list.append(word.upos)
        X_train.append([word.lemma for word in sentence])
        Y_train.append([word.upos for word in sentence])
    #dev data
    #if dev file exists, then fuse it with the training dataset. Otherwise no worries.
    #We do this because our version of HMM doesn't involve iterative training.
    try:
        dev_conll = pyconll.load_from_file(UD_PATH+LANG_DIR+PREFIX+'dev.conllu')
        for sentence in dev_conll:
            for word in sentence:
                word_list.append(word.lemma)
                tag_list.append(word.upos)
            X_train.append([word.lemma for word in sentence])
            Y_train.append([word.upos for word in sentence])
    except FileNotFoundError:
        print("Dev set not found, no worries! Train set must be good enough.")
    #test data
    test_conll = pyconll.load_from_file(UD_PATH+LANG_DIR+PREFIX+'test.conllu')
    X_test = []
    Y_test = []
    for sentence in test_conll:
        X_test.append([word.lemma for word in sentence])
        Y_test.append([word.upos for word in sentence])
    #return 
    return tag_list, word_list, X_train, Y_train, X_test, Y_test
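# Usage sketch for create_dataset, reusing the example paths given in its
# docstring (the local ud-treebanks-v2.6 location is an assumption):
tags, words, X_train, Y_train, X_test, Y_test = create_dataset(
    'C:/Users/DELL/Downloads/ud-treebanks-v2.6/ud-treebanks-v2.6/',
    'UD_Afrikaans-AfriBooms/',
    'af_afribooms-ud-')
print(len(X_train), 'training+dev sentences,', len(X_test), 'test sentences')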
Example #4
def build_POS_tagging_data(source_treebank_name = "UD_Hebrew-HTB", git_hash = "82591c955e86222e32531336ff23e36c220b5846"):
    if not os.path.isdir(source_treebank_name) :
        print("fetching the data source from github...")
        subprocess.run(["git", "clone", "https://github.com/UniversalDependencies/" + source_treebank_name])
        subprocess.run(["git", "checkout", git_hash], cwd=source_treebank_name)    

    print("transforming the data source...")
    
    conllu1 = pyconll.load_from_file(Path.cwd().joinpath(source_treebank_name, get_single_file(source_treebank_name, 'train')))
    conllu2 = pyconll.load_from_file(Path.cwd().joinpath(source_treebank_name, get_single_file(source_treebank_name, 'dev')))

    return _conllu_transformer_POS_tagging(conllu1) + _conllu_transformer_POS_tagging(conllu2)
Example #5
def build_verb_in_context_data(source_treebank_name = "UD_Hebrew-HTB", git_hash = "82591c955e86222e32531336ff23e36c220b5846"):
    ''' Transforms conllu files into lists where each entry is a segment with a verb/non-verb indication '''
        
    if not os.path.isdir(source_treebank_name) :
        print("fetching the data source from github...")
        subprocess.run(["git", "clone", "https://github.com/UniversalDependencies/" + source_treebank_name])
        subprocess.run(["git", "checkout", git_hash], cwd=source_treebank_name)    

    print("transforming the data source...")
    
    conllu1 = pyconll.load_from_file(Path.cwd().joinpath(source_treebank_name, get_single_file(source_treebank_name, 'train')))
    conllu2 = pyconll.load_from_file(Path.cwd().joinpath(source_treebank_name, get_single_file(source_treebank_name, 'dev')))

    return _conllu_transformer(conllu1) + _conllu_transformer(conllu2)
            
Example #6
    def __init__(self, path, max_seq_len, model_type, max_count=10**6):
        sents, labels = [], []
        count = 0
        # filename should always be of the form <lang>.<split>
        self.lang = path.split("/")[-1].split(".")[0]
        cached_features_file = path + f".{max_count}.th"
        if os.path.exists(cached_features_file):
            logger.info(
                f"Loading features from cached file {cached_features_file}")
            self.features = torch.load(cached_features_file)
        else:
            logger.info(
                f"Saving features into cached file {cached_features_file}")
            tagged_sentences = pyconll.load_from_file(path)
            for ts in tagged_sentences:
                t, l = [], []
                for token in ts:
                    if token.upos and token.form:
                        t.append(token.form)
                        l.append(token.upos)
                # chunk over the collected tokens (t), not the raw sentence,
                # so that skipped tokens cannot produce empty chunks
                for idx in range(0, len(t), max_seq_len):
                    sents.append(t[idx:idx + max_seq_len])
                    labels.append(l[idx:idx + max_seq_len])
                count += 1
                if count > max_count:
                    break
            label_map = {l: idx for idx, l in enumerate(get_pos_labels())}
            tokenizer = AutoTokenizer.from_pretrained(model_type)
            self.features = convert_examples_to_features(
                sents, labels, label_map, max_seq_len, tokenizer)
            torch.save(self.features, cached_features_file)
def load_predictions(args):
	# Regular CoNLL-U format
	if args.pred_upos_index == 3 and args.pred_xpos_index == 4 and args.pred_feats_index == 5:
		return pyconll.load_from_file(args.prediction)
	
	# other format
	else:
		s = ""
		with open(args.prediction, 'r') as pred_file:
			for line in pred_file:
				if line.strip() == "":
					s += line
				elif line.startswith("#"):
					s += line
				else:
					elements = line.split("\t")
					if args.pred_upos_index >= 0 and args.pred_upos_index < len(elements):
						upos = elements[args.pred_upos_index].strip()
					else:
						upos = "_"
					if args.pred_xpos_index >= 0 and args.pred_xpos_index < len(elements):
						xpos = elements[args.pred_xpos_index].strip()
					else:
						xpos = "_"
					if args.pred_feats_index >= 0 and args.pred_feats_index < len(elements):
						feats = elements[args.pred_feats_index].strip()
					else:
						feats = "_"
					s += "0\t_\t_\t{}\t{}\t{}\t0\t_\t_\t_\n".format(upos, xpos, feats)
		return pyconll.load_from_string(s)
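# Usage sketch for load_predictions: only the prediction path and the three
# column indices are read from args, so a hand-built Namespace works as well
# as a full parser (file name and column indices below are hypothetical):
from argparse import Namespace

preds = load_predictions(Namespace(prediction='predictions.tsv',
                                   pred_upos_index=1,
                                   pred_xpos_index=-1,
                                   pred_feats_index=2))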
Example #8
def main():
    """Main method.
		Delete all previously changed treebanks to avoid mixing. Then changes the treebanks as specified.
		If more than one relations should be changed, treebanks for all combinations of changed relations are created.
		The input treebanks are searched in "ud/"; the output treebanks are saved in "ud-harmony/".

	"""
    shutil.rmtree("ud-harmony/" + treebank_path)
    os.mkdir("ud-harmony/" + treebank_path)
    for language in list(changes.keys()):
        change = changes[language]
        for i in range(1, 1 << len(change[2])):
            relations = [
                change[2][j] for j in range(len(change[2])) if (i & (1 << j))
            ]
            for t in change[0]:
                indir = "ud/" + treebank_path + "/UD_" + language + "-" + t
                outdir = "ud-harmony/" + treebank_path + "/UD_" + language + "-" + t
                if not os.path.exists(outdir):
                    os.mkdir(outdir)
                outdir += "/" + "+".join(relations)
                if not os.path.exists(outdir):
                    os.mkdir(outdir)
                for file in os.scandir(indir):
                    if file.name.endswith(".conllu"):
                        corpus = pyconll.load_from_file(file)
                        corpus = change_corpus(language, corpus, relations)
                        with open(outdir + "/" + file.name, "w") as f:
                            f.write(corpus.conll())
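# The inner loop over i in range(1, 1 << len(change[2])) enumerates every
# non-empty subset of the relation list via a bitmask. The same idea as a
# standalone sketch (the relation names below are made up):
relations_all = ['obj', 'nsubj', 'amod']
for i in range(1, 1 << len(relations_all)):
    subset = [relations_all[j] for j in range(len(relations_all)) if i & (1 << j)]
    print(subset)  # prints all 7 non-empty subsets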
Example #9
    def read_sequences(file_name):
        data = pyconll.load_from_file(file_name)

        tokens = [[token.form for token in sent] for sent in data]
        tags = [[token.upos for token in sent] for sent in data]

        return [tokens, tags]
Example #10
    def load_as_conllu(self, predefined_splits: bool = False):
        """
        Load the DDT in CoNLL-U format.

        :param bool predefined_splits: If True, return the train, dev and test
                splits separately instead of one merged corpus.
        :return: A single pyconll.Conll
                or a list of [train, dev, test] pyconll.Conll objects
                depending on predefined_splits
        """

        parts = [
            None, None, None
        ]  # Placeholder list to put predefined parts of dataset [train, dev, test]
        for i, part in enumerate(['train', 'dev', 'test']):
            file_name = "{}.{}{}".format(self.dataset_name, part,
                                         self.file_extension)
            file_path = os.path.join(self.dataset_dir, file_name)

            parts[i] = pyconll.load_from_file(file_path)

        # if predefined_splits: then we should return three files
        if predefined_splits:
            return parts

        # Merge the splits to one single dataset
        parts[0].extend(parts[1])
        parts[0].extend(parts[2])

        return parts[0]
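# The same merge-the-splits pattern outside the class: load_as_conllu relies on
# pyconll Conll objects supporting extend(), so the splits can be concatenated
# directly (the file names below are assumptions):
import pyconll

train = pyconll.load_from_file('ddt.train.conllu')
train.extend(pyconll.load_from_file('ddt.dev.conllu'))
train.extend(pyconll.load_from_file('ddt.test.conllu'))
print(len(train), 'sentences in the merged corpus')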
Example #11
def test_load_from_file_and_url_equivalence():
    """
    Test that the Conll object created from a string and file is the same if
    the underlying source is the same.
    """
    TEST_CONLL_URL = 'https://myconllrepo.com/english/train'
    with open(fixture_location('long.conll')) as f:
        contents = f.read()
        responses.add(responses.GET, TEST_CONLL_URL, body=contents)

    url_c = load_from_url(TEST_CONLL_URL)
    file_c = load_from_file(fixture_location('long.conll'))

    assert len(url_c) == len(file_c)
    for i in range(len(url_c)):
        assert url_c[i].id == file_c[i].id
        assert url_c[i].text == file_c[i].text
        print(url_c[i].conll())
        print(file_c[i].conll())

        for url_token in url_c[i]:
            file_token = file_c[i][url_token.id]
            assert_token_members(url_token, file_token.id, file_token.form,
                                 file_token.lemma, file_token.upos,
                                 file_token.xpos, file_token.feats,
                                 file_token.head, file_token.deprel,
                                 file_token.deps, file_token.misc)
Example #12
def ud_sentence_to_dict(
    udfile
):  # just put a space between each word token of the tree, without any other processing

    data = pyconll.load_from_file(udfile)
    context_dict = {}

    for sentence in data:
        s_id = sentence.source.splitlines()[0]  # sentence id
        _sentence = ""
        #       print(_id)
        for token in sentence:

            wlist = token.conll().split()
            w = wlist[1]  # word
            # --------------- Replace
            if w == '-LRB-':
                w = '('
            if w == '-RRB-':
                w = ')'
            # ----------------

            if not _sentence:
                _sentence = w
                continue

            else:
                _sentence = _sentence + ' ' + w

        # print(_sentence)
        context_dict[s_id] = _sentence
    return context_dict
Example #13
    def load_standardsplit(self):
        print("Loading Universal Dependencies " + self.config['data_path'])
        self.X_raw = defaultdict(list)
        # get list of files
        filenames = {}
        self.part_mapping = {"trn": "train", "dev": "dev", "tst": "test"}
        for file in os.listdir(self.config['data_path']):
            for part, part_conll in self.part_mapping.items():
                mymatch = re.match(r".+?" + part_conll + r"\.conllu", file)
                if mymatch is not None:
                    filenames[part] = file

        assert len(filenames) == 3, "Some file is missing..."
        print(filenames)
        for part in self.part_mapping:
            print(part)
            # for part in ["dev"]:
            corpus = pyconll.load_from_file(
                os.path.join(self.config['data_path'], filenames[part]))
            for sentence in corpus:
                tmp_sentence = []
                for token in sentence:
                    if "-" in token.id:
                        continue
                    if self.config['upos']:
                        tmp_sentence.append([token.form, token.upos])
                    else:
                        tmp_sentence.append([token.form, token.xpos])
                if self.config['shuffle_seqs']:
                    random.shuffle(tmp_sentence)
                self.X_raw[part].append(np.array(tmp_sentence))
        for part in ["trn", "dev", "tst"]:
            # for part in ["dev"]:
            self.X_raw[part] = np.array(self.X_raw[part], dtype=object)
            print(part, self.X_raw[part].shape)
Example #14
    def __init__(self, data_dir, language):
        """"Initializes the dataset from the provided input data url"""
        assert self.dataset_fn is not None, "You need to use the subclasses of PerseusDataset"

        # First we try to load the vocab pickle, if it doesn't exist yet we must create it
        vocab_path = path.join(data_dir, "vocab-%s.p" % language)
        init_vocab = not path.isfile(vocab_path)
        if not init_vocab:
            print("Loading vocabulary from cache: %s" % vocab_path)
            with open(vocab_path, "rb") as f:
                self.vocab = pickle.load(f)
        else:
            print("Creating vocabulary for %s" % language)
            self.vocab = Vocab(
                words={"<UNK>": 0},
                chars={"<UNK>": 0},
                tags=[{"<UNK>": 0} for _ in range(self.NUM_TAGS)]
            )

        # Now that we've decided if we have a premade vocab we're parsing the dataset into tokenized data that the model can work with
        self.sentences = []
        for sentence in pyconll.load_from_file(path.join(data_dir, self.dataset_fn)):
            tokenized_sentence = []
            for token in sentence:
                word, characters, tags = token.form, list(token.form), token.xpos
                assert len(tags) == 9, "Tags should always have a length of 9"

                tokenized_sentence.append(PerseusDataset.get_ids(self.vocab, word, characters, tags, expand_vocab=init_vocab))

            self.sentences.append(tokenized_sentence)

        # If we have just created the vocabulary, let's save it just in case
        if init_vocab: self.save_vocab(vocab_path)
Example #15
def evaluate_file(args, inconsistencies):
	print("Prediction file:  ", args.prediction)
	print("Gold file:        ", args.gold)
	pred_file = load_predictions(args)
	gold_file = pyconll.load_from_file(args.gold)

	if len(pred_file) != len(gold_file):
		print("Number of sentences does not match!")
		print("Prediction: {}    Gold: {}".format(len(pred_file), len(gold_file)))
		return
	
	upos_evaluator = evaluator.Evaluator(mode="exact")
	xpos_evaluator = evaluator.Evaluator(mode="exact")
	feats_evaluator = evaluator.Evaluator(mode="by_feats")
	ufeats_evaluator = evaluator.Evaluator(mode="exact", only_univ=True)
	upos_feats_evaluator = evaluator.Evaluator(mode="by_feats")
	incons_count = 0
	token_count = 0
	
	for pred_sent, gold_sent in zip(pred_file, gold_file):
		if len(pred_sent) != len(gold_sent):
			print("Number of words in sentence does not match!")
			print("Prediction: {}    Gold: {}".format(len(pred_sent), len(gold_sent)))
			print("Prediction:", pred_sent._meta)
			print("Gold:", gold_sent._meta)
			continue
		
		for pred_token, gold_token in zip(pred_sent, gold_sent):
			if args.upos:
				upos_evaluator.add_instance({POS: gold_token.upos}, {POS: pred_token.upos})
			if args.xpos:
				xpos_evaluator.add_instance({POS: gold_token.xpos}, {POS: pred_token.xpos})
			if args.feats:
				gold_feats = {x: ",".join(gold_token.feats[x]) for x in gold_token.feats}
				pred_feats = {x: ",".join(pred_token.feats[x]) for x in pred_token.feats}
				feats_evaluator.add_instance(gold_feats, pred_feats)
				ufeats_evaluator.add_instance(gold_feats, pred_feats)
				if args.upos:
					if args.incons:
						token_count += 1
						if len(set(pred_feats.keys()) & inconsistencies[pred_token.upos]) > 0:
							incons_count += 1
					gold_feats.update({POS: gold_token.upos})
					pred_feats.update({POS: pred_token.upos})
					upos_feats_evaluator.add_instance(gold_feats, pred_feats)
	
	if upos_evaluator.instance_count > 0:
		print("UPOS accuracy          {:.2f}%".format(100*upos_evaluator.acc()))
	if xpos_evaluator.instance_count > 0:
		print("XPOS accuracy          {:.2f}%".format(100*xpos_evaluator.acc()))
	if feats_evaluator.instance_count > 0:
		print("FEATS micro-F1         {:.2f}%".format(100*feats_evaluator.micro_f1()))
	if upos_feats_evaluator.instance_count > 0:
		print("UPOS+FEATS micro-F1    {:.2f}%".format(100*upos_feats_evaluator.micro_f1()))
	if ufeats_evaluator.instance_count > 0:
		print("UFEATS accuracy        {:.2f}%".format(100*ufeats_evaluator.acc()))
	if token_count > 0:
		print("UFEATS inconsistencies {:.2f}%".format(100*incons_count / token_count))
	print()
Example #16
def get_dane_data(split: str = 'train',
                  limit: int = None,
                  dir: str = None) -> dict:
    """Load DaNE data split.

    Loads a single data split from the DaNE data set kindly hosted
    by [Alexandra Institute](https://github.com/alexandrainst/danlp/blob/master/docs/docs/datasets.md#dane).

    Args:
        split (str, optional): Choose which split to load. Choose 
            from 'train', 'dev' and 'test'. Defaults to 'train'.
        limit (int, optional): Limit the number of observations to be 
            returned from a given split. Defaults to None, which implies 
            that the entire data split is returned.
        dir (str, optional): Directory where data is cached. If set to 
            None, the function will try to look for files in '.dane' folder in home directory.

    Returns:
        dict: Dictionary with word-tokenized 'sentences' and named 
        entity 'tags' in IOB format.

    Examples:
        Get test split
        >>> get_dane_data('test')

        Get first 5 observations from training split
        >>> get_dane_data('train', limit = 5)

    """
    assert isinstance(split, str)
    splits = ['train', 'dev', 'test']
    assert split in splits, f'Choose between the following splits: {splits}'

    # set to default directory if nothing else has been provided by user.
    if dir is None:
        dir = os.path.join(str(Path.home()), '.dane')
    assert os.path.isdir(
        dir
    ), f'Directory {dir} does not exist. Try downloading DaNE data with download_dane_data()'

    file_path = os.path.join(dir, f'ddt.{split}.conllu')
    assert os.path.isfile(
        file_path
    ), f'File {file_path} does not exist. Try downloading DaNE data with download_dane_data()'

    split = pyconll.load_from_file(file_path)

    sentences = []
    entities = []

    for sent in split:
        sentences.append([token.form for token in sent._tokens])
        entities.append([token.misc['name'].pop() for token in sent._tokens])

    if limit is not None:
        sentences = sentences[:limit]
        entities = entities[:limit]

    return {'sentences': sentences, 'tags': entities}
def main(args: Namespace):
    print("Configurations: " + str(args))

    training_set = pyconll.load_from_file(args.training_set_path)
    if args.validation_set_path is not None:
        training_set += pyconll.load_from_file(args.validation_set_path)
    data = Do.DataOrganizer(training_set=training_set,
                            storing_method=args.storing_method,
                            storing_prob_method=args.storing_prob_method)
    va = ViterbiAlgorithm(data=data, smoothing_type=args.smoothing_method)
    accuracy, baseline, unknown_word = test_accuracy(
        file_path=args.test_set_path,
        viterbi_algorithm=va,
        storing_method=args.storing_method)
    print("baseline: " + str(baseline) + "%")
    print("Pos tagger accuracy: " + str(accuracy) + "%")
    print("unknown words: " + str(unknown_word))
Example #18
def read_conll(file_name):
    data = pyconll.load_from_file(file_name)

    tags = [[
        token.upos if token.upos in ["C", "E"] else "_" for token in sent
    ] for sent in data]

    return tags
Example #19
def _conllu_to_tokens(path: str) -> Set[Dict[str, str]]:
    """Return the annotated tokens from a CoNLL-U file."""

    tokens = set()
    for sentence in tqdm.tqdm(pyconll.load_from_file(path)):
        for token in sentence:
            tokens.add(_HashableDict(_flatten_token(token)))
    return tokens
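# _HashableDict and _flatten_token are referenced above but not shown. A
# minimal sketch of their apparent intent (an assumption, not the original
# helpers): flatten a token's annotation into plain strings and make the
# resulting dict usable as a set member.
class _HashableDict(dict):
    def __hash__(self):
        return hash(frozenset(self.items()))


def _flatten_token(token):
    return {
        'form': token.form or '',
        'lemma': token.lemma or '',
        'upos': token.upos or '',
    }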
Example #20
def create_reinflection_file(in_file, out_file):
    """
    :param in_file: UD annotated input file name
    :param out_file: output file name
    """
    conll = load_from_file(in_file)
    lines = get_lines(conll)
    with open(out_file, "w") as f:
        f.writelines(lines)
Example #21
def test_no_nonprojectivities():
    """
    Test with a sentence with no non-projective dependencies.
    """
    c = load_from_file(fixture_location('projectivities.conll'))
    sent = c[0]
    deps = find_nonprojective_deps(sent)

    assert not deps
Example #22
def read_conll(file_name):
    data = pyconll.load_from_file(file_name)

    tags = [[
        "<span class='" + (token.upos if token.upos in ["C", "E"] else "O") +
        "'>" + token.form + "</span>" for token in sent
    ] for sent in data]

    return tags
Example #23
def test_ngram_standard():
    """
    Test if the find_ngram method works for standard situations.
    """
    c = load_from_file(fixture_location('basic.conll'))

    s, i = next(find_ngrams(c, 'un film sur la'.split()))
    assert s.id == 'fr-ud-dev_00001'
    assert i == 2
Example #24
def test_ngram_first_word_match():
    """
    Test that a first word match is not enough to match.
    """
    c = load_from_file(fixture_location('long.conll'))
    it = find_ngrams(c, 'un cabinet'.split())

    with pytest.raises(StopIteration):
        next(it)
Example #25
def test_ngram_none():
    """
    Test that no ngram is identified when none exists.
    """
    c = load_from_file(fixture_location('long.conll'))
    it = find_ngrams(c, 'cabinet'.split())

    with pytest.raises(StopIteration):
        next(it)
Example #26
def _change_galician_cc_extract_feature_vectors(file):
    """Extract feature vectors for the non-"cc" dependents of the first conjunct in a conjunction.
		(The dependents can be conjuncts of the conjunction or other dependents of the first conjunct.)
		
	Args:
		file (str): "train" or "test"
			Load the according TreeGal treebank.
	
	Returns:
		list of (list of int): The feature vectors.
		list of int: The classes.
			1 for conjuncts.
			0 for other dependents.

	"""
    true_conjuncts = []
    false_conjuncts = []
    train_corpus = pyconll.load_from_file(
        "ud/" + treebank_path + "/UD_Galician-TreeGal/gl_treegal-ud-" + file +
        ".conllu")
    for sentence in train_corpus:
        for token in sentence:
            if token.deprel is None:
                continue
            dep_label = token.deprel
            if dep_label == "cc":
                conjunction = token.id
                conjunct_ = token.head
                conjunct1 = sentence[conjunct_].head
                if conjunct1 != "0":
                    for token2 in sentence:
                        if token2.head == conjunct1:
                            tokens = [token2.id]
                            ccs = []
                            a = True
                            while a:
                                a = False
                                for t in sentence:
                                    if t.head in tokens and t.id not in tokens + ccs:
                                        if t.deprel == "cc":
                                            ccs.append(t.id)
                                        else:
                                            tokens.append(t.id)
                                        a = True
                            tokens = sorted(tokens)
                            d = _change_galician_cc_mask(
                                sentence[conjunct1].upos, token2.upos,
                                int(token2.id) - int(conjunct1),
                                str(int(tokens[0]) - 1) in ccs)
                            if token2.deprel == "conj":
                                true_conjuncts.append(d)
                            else:
                                false_conjuncts.append(d)
    X = true_conjuncts + false_conjuncts
    y = [1 for x in true_conjuncts] + [0 for x in false_conjuncts]
    return X, y
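# Hypothetical downstream use of the extracted vectors; scikit-learn is an
# assumption here, the original training code is not part of this snippet:
from sklearn.linear_model import LogisticRegression

X_train, y_train = _change_galician_cc_extract_feature_vectors("train")
X_test, y_test = _change_galician_cc_extract_feature_vectors("test")
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print("held-out accuracy:", clf.score(X_test, y_test))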
Example #27
def ud_to_dict(udfile):
    data = pyconll.load_from_file(udfile)

    tree_dict = {}
    for sentence in data:
        sent_lines = sentence.source.splitlines()
        s_id = sent_lines[0]
        tree_dict[s_id] = sent_lines[1:]

    return tree_dict
Example #28
    def __init__(self, train, dev, test, use_v1, hack_v2):
        """
        Initializer
        :param train: file name of training set
        :param dev: file name of development set
        :param test: file name of test set
        :param use_v1: True if the treebank is annotated using UD v1.2
        """
        self.samples = []
        train_conll = load_from_file(train)
        dev_conll = load_from_file(dev)
        test_conll = load_from_file(test)
        self.train = samples_from_conll(train_conll, use_v1, hack_v2)
        self.dev = samples_from_conll(dev_conll, use_v1, hack_v2)
        self.test = samples_from_conll(test_conll, use_v1, hack_v2)
        self.tags = get_tags(self.train + self.dev + self.test)

        self._num_pos = get_num_upos(use_v1)
        self._num_labels = get_num_rel(use_v1)
Example #29
def test_load_from_file():
    """
    Test that a CoNLL file can properly be loaded from a filename.
    """
    c = load_from_file(fixture_location('basic.conll'))
    sent = c[1]

    assert len(c) == 4
    assert len(sent) == 14
    assert sent['10'].form == 'donc'
Example #30
def test_multiword_ignore():
    """
    Test that multiword tokens are ignored and do not cause errors.
    """
    c = load_from_file(fixture_location('projectivities.conll'))

    sent = c[3]
    deps = find_nonprojective_deps(sent)

    assert deps == [(sent['16'], sent['4'])]