コード例 #1
0
def _test_file(this_file):
    """Run ``run_parser.sh`` on *this_file* and check the parsed output.

    The file is fed to the script on stdin; the script's stdout is loaded
    into a ``KafNafParser`` and the term, constituency and dependency layers
    are compared against fixed expected values.

    :param this_file: path of the input file piped to the parser script
    :raises subprocess.CalledProcessError: if the script exits non-zero
    """
    # The context manager guarantees the input handle is closed, even when
    # the subprocess fails (the original left the file open).
    with open(this_file) as input_fd:
        result = subprocess.check_output(
            os.path.join(__here__, 'run_parser.sh'), stdin=input_fd)
    my_obj = KafNafParser(BytesIO(result))

    # Check the terms
    terms = list(my_obj.get_terms())
    assert_equal(len(terms), 12)
    assert_equal(my_obj.get_term('t_4').get_lemma(), 'mooi')
    assert_equal(my_obj.get_term('t_4').get_pos(), 'adj')

    # Check constituents
    trees = list(my_obj.get_trees())
    assert_equal(len(trees), 2)
    assert_equal(trees[0].get_terminals_as_list()[1].get_span().get_span_ids(), ['t_1'])

    # Check dependencies
    dependencies = list(my_obj.get_dependencies())
    assert_equal(len(dependencies), 10)
    assert_equal(dependencies[5].get_function(), 'hd/su')
コード例 #2
0
def get_terms_in_sentence(naf: KafNafParser, sent: int) -> Iterable[Cterm]:
    """Return the terms belonging to one sentence of *naf*.

    Tokens whose ``get_sent()`` equals *sent* are selected and ordered with
    ``sort_tokens``; their ids are mapped to term ids through
    ``naf.map_tokens_to_terms`` and the resulting terms are returned in the
    order produced by ``sort_terms``.
    """
    sentence_tokens = sort_tokens(
        token for token in naf.get_tokens() if token.get_sent() == sent)
    token_ids = [token.get_id() for token in sentence_tokens]
    term_ids = naf.map_tokens_to_terms(token_ids)
    sentence_terms = [naf.get_term(term_id) for term_id in term_ids]
    return sort_terms(naf, sentence_terms)
コード例 #3
0
				dep_patterns[dep_parts[0]] = []
				dep_patterns[dep_parts[0]].append(dep_parts[1].strip('t_'))
			else:
				dep_patterns[dep_parts[0]].append(dep_parts[1].strip('t_'))

# Then we find the dependencies between two predicates.
# We store the main verb as the key and the auxiliary verb as the value.
# dep_patterns maps a head term id ('t_N') to the numeric parts of the term
# ids it is linked to; predicates is looked up by full term id.
auxiliary_verbs = {}
for pattern in dep_patterns:
	# De-duplicate the linked term numbers and visit them in numeric order
	sorted_dep_patterns = list(set(dep_patterns[pattern]))
	sorted_dep_patterns.sort(key=int)
	# NOTE: the loop variable used to be called `sorted`, which rebound the
	# builtin at module level and broke any later call to sorted().
	for dep_number in sorted_dep_patterns:
		if predicates.get(('t_' + str(dep_number))) is not None:
			pattern_term_number = int(pattern.strip('t_'))
			# Only pair a predicate with a *different* term
			if int(dep_number) != pattern_term_number:
				if my_parser.get_term(pattern).get_lemma() == 'zijn':
					# Head lemma is 'zijn': the head term id becomes the key
					auxiliary_verbs['t_' + str(pattern_term_number)] = 't_'+ str(dep_number)
				else:
					# Otherwise the linked predicate becomes the key
					auxiliary_verbs['t_'+ str(dep_number)] = 't_' + str(pattern_term_number)

# The pos tags are actually the first part of the morphofeat attributes 
# Here they are rewritten to that format 
def rewrite_postag(term):
	"""Return the shortened POS tag for the term id *term*.

	Takes the term's morphofeat string (looked up through the module-level
	``my_parser``), keeps only the text before the first comma, appends a
	closing parenthesis, and then replaces every '))' with ')'.
	"""
	full_tag = my_parser.get_term(term).get_morphofeat()
	head, _, _ = full_tag.partition(',')
	closed = head + ')'
	return closed.replace('))', ')')
	
# The argument categories are pulled out of the constituency trees
non_terminals = {}
コード例 #4
0
            my_parser = KafNafParser(f)  #create the KafNafParser
            for entity in my_parser.get_entities(
            ):  #get the entities identified through NER
                if entity.get_type(
                ) == "LOCATION":  #if it's a location, then let's get its term references
                    entity_type = entity.get_type()  #get the entity type
                    entity_id = entity.get_id()  #get the entity id
                    ref = entity.get_node().find(
                        "references"
                    )  #find the references node in the KAF file
                    targets = ref.find("span").findall(
                        "target"
                    )  #get a list of all the targets in the references node
                    term_id = targets[0].attrib[
                        "id"]  #get the first target term id, so we can generate the term text in the next two lines
                    word_id = my_parser.get_term(term_id).get_node().find(
                        "span").find("target").attrib["id"]  #get the word id
                    word = my_parser.get_token(
                        word_id).get_text()  #get the word text

                    #simpledict[word]=freqdict[word] #generate a simple tally that ignores whether each LOCATION is valid;
                    #commented out since OpenRefine will be used to validate

                    sentence = ""  #start a blank KWIC sentence variable
                    for target in targets:  #iterate through all targets in the references element for the entity node
                        tid = int(
                            my_parser.get_term(
                                target.attrib["id"]).get_node().find(
                                    "span").find("target").attrib["id"]
                            [1:])  #get the token id for the target
                        for x in range(
                                tid - 15, tid + 15
コード例 #5
0
import csv
import sys
from KafNafParserPy import KafNafParser

from naflib import *
from naflib import sort_terms

# Load the 'original' column of the word list. The context manager closes
# the file once it has been read (the original left the handle open), and
# newline="" is what the csv module expects for files handed to its readers.
with open("klimaatwoorden.csv", newline="") as woorden_file:
    woorden = [r['original'] for r in csv.DictReader(woorden_file)]

# Write one CSV row to stdout per entity reference in every NAF file given
# on the command line.
o = csv.writer(sys.stdout)
# NOTE(review): the header names six columns but each row below carries five
# values (nothing is written for "dbpedia"), so the lemma text ends up under
# the "dbpedia" header. Left unchanged because the intended dbpedia value is
# not visible here. TODO: confirm and align header and rows.
o.writerow(["file", "sentence", "entity", "type", "dbpedia", "text"])
for fn in sys.argv[1:]:
    naf = KafNafParser(fn)
    for e in naf.get_entities():
        for ref in e.get_references():
            # Resolve the span targets of this reference to term objects,
            # ordered by sort_terms
            terms = sort_terms(
                naf, [naf.get_term(t.get_id()) for t in ref.get_span()])
            o.writerow([
                fn,
                get_sentence(naf, terms[0]),
                e.get_id(),
                e.get_type(),
                " ".join(t.get_lemma() for t in terms),
            ])
コード例 #6
0
        role_number = role.get_id()[1:]
        if int(role_number) > role_index:
            role_index = int(role_number)

# Fetch the parser's dependency extractor and keep a handle on its
# relations_for_term lookup. The code below queries it by term id; each entry
# it yields is indexed as dep[0] (checked for the 'hd/mod' label) and dep[1]
# (a term id that is passed to my_parser.get_term).
depextractor = my_parser.get_dependency_extractor()
deps = depextractor.relations_for_term

# Extract nominal predicates from SRL layer
for pred in my_parser.get_predicates():
    #revision version 2: check if the predicate has any roles
    if len(pred.node.findall('role')) == 0:
        #retrieve the predicate's span

        for span_obj in pred.get_span():
            term_id = span_obj.get_id()
            term = my_parser.get_term(term_id)

            #Version 2 changes:
            #1. double check if we're dealing with a nominal predicate
            #2. go through dependencies and create roles for all that are pp dependencies
            #3. if head is 'van' dependency is arg1, else it is argM
            if term.get_pos() == 'noun':
                #check modifiers of the noun
                if term_id in deps:
                    for dep in deps.get(term_id):
                        if 'hd/mod' in dep[0]:
                            modterm = my_parser.get_term(dep[1])
                            if modterm.get_pos() == 'prep':
                                if modterm.get_lemma() == 'van':
                                    my_role = 'Arg1'
                                else: