def _test_file(this_file):
    """Parse *this_file* with run_parser.sh and sanity-check the NAF output.

    Feeds the file to the run_parser.sh script on stdin, parses the
    resulting NAF bytes, and asserts expected counts/values for the
    terms, constituency trees, and dependencies layers.

    :param this_file: path to the input text file to parse
    :raises AssertionError: if any layer check fails
    """
    # Context manager ensures the input descriptor is closed even if the
    # subprocess fails (the original leaked the open file handle).
    with open(this_file) as input_fd:
        result = subprocess.check_output(
            os.path.join(__here__, 'run_parser.sh'), stdin=input_fd)
    my_obj = KafNafParser(BytesIO(result))

    # Check the terms
    terms = list(my_obj.get_terms())
    assert_equal(len(terms), 12)
    assert_equal(my_obj.get_term('t_4').get_lemma(), 'mooi')
    assert_equal(my_obj.get_term('t_4').get_pos(), 'adj')

    # Check constituents
    trees = list(my_obj.get_trees())
    assert_equal(len(trees), 2)
    assert_equal(
        trees[0].get_terminals_as_list()[1].get_span().get_span_ids(),
        ['t_1'])

    # Check dependencies
    dependencies = list(my_obj.get_dependencies())
    assert_equal(len(dependencies), 10)
    assert_equal(dependencies[5].get_function(), 'hd/su')
def get_terms_in_sentence(naf: KafNafParser, sent: int) -> Iterable[Cterm]:
    """Return the terms of sentence *sent* in *naf*, sorted in order.

    Collects the tokens belonging to the sentence, maps them to term
    ids, and returns the corresponding term objects via sort_terms.
    """
    sentence_tokens = sort_tokens(
        tok for tok in naf.get_tokens() if tok.get_sent() == sent)
    token_ids = [tok.get_id() for tok in sentence_tokens]
    term_ids = naf.map_tokens_to_terms(token_ids)
    return sort_terms(naf, [naf.get_term(tid) for tid in term_ids])
# NOTE(review): fragment — begins inside an if/else whose header is not
# visible, and its original indentation was destroyed in extraction, so the
# code line below is kept byte-identical rather than reflowed.
# What the visible code shows: dep_patterns maps a term id (dep_parts[0]) to
# a list of dependent term numbers (dep_parts[1] with the 't_' prefix
# stripped).  The loop then pairs predicates: when the source term's lemma
# is 'zijn' it is stored as the key-side ("main") entry in auxiliary_verbs,
# otherwise the dependent becomes the key — presumably main-verb ->
# auxiliary-verb pairing; confirm against the full file.  rewrite_postag()
# takes the first comma-separated field of a term's morphofeat, appends ')'
# and collapses '))' to ')', yielding a POS-tag-like string.
# HACK in original: the inner loop variable shadows the builtin `sorted`.
dep_patterns[dep_parts[0]] = [] dep_patterns[dep_parts[0]].append(dep_parts[1].strip('t_')) else: dep_patterns[dep_parts[0]].append(dep_parts[1].strip('t_')) # Then we find the dependencies between two predicates # we store the main verb as the key and the auxiliary verb as the value auxiliary_verbs = {} for pattern in dep_patterns: sorted_dep_patterns = list(set(dep_patterns[pattern])) sorted_dep_patterns.sort(key=int) for sorted in sorted_dep_patterns: if predicates.get(('t_' + str(sorted))) is not None: pattern_term_number = int(pattern.strip('t_')) if int(sorted) != int(pattern_term_number): if my_parser.get_term(pattern).get_lemma() == 'zijn': auxiliary_verbs['t_' + str(pattern_term_number)] = 't_'+ str(sorted) else: auxiliary_verbs['t_'+ str(sorted)] = 't_' + str(pattern_term_number) # The pos tags are actually the first part of the morphofeat attributes # Here they are rewritten to that format def rewrite_postag(term): morphofeat = my_parser.get_term(term).get_morphofeat() parts = morphofeat.split(',') postag = (parts[0] + ')') postag = postag.replace('))', ')') return(postag) # The argument categories are pulled out of the constituency trees non_terminals = {}
# NOTE(review): fragment — truncated mid-expression at
# `for x in range( tid - 15, tid + 15`; the continuation is outside this
# view, so the code line below is kept byte-identical rather than reflowed.
# What the visible code shows: parses a KAF/NAF file, iterates NER entities,
# and for each LOCATION entity resolves its <references>/<span>/<target>
# nodes to a term id, then to a word id and word text.  `tid` is derived by
# stripping the first character of a word id and casting to int — assumes
# word ids look like 'w123'; TODO confirm.  The truncated loop apparently
# builds a +/-15-token context (KWIC) window around each target token —
# confirm against the full file.  The commented-out simpledict line is a
# deliberately disabled tally (validation deferred to OpenRefine).
my_parser = KafNafParser(f) #create the KafNafParser for entity in my_parser.get_entities( ): #get the entities identified through NER if entity.get_type( ) == "LOCATION": #if it's a location, then let's get its term references entity_type = entity.get_type() #get the entity type entity_id = entity.get_id() #get the entity id ref = entity.get_node().find( "references" ) #find the references node in the KAF file targets = ref.find("span").findall( "target" ) #get a list of all the targets in the references node term_id = targets[0].attrib[ "id"] #get the first target term id, so we can generate the term text in the next two lines word_id = my_parser.get_term(term_id).get_node().find( "span").find("target").attrib["id"] #get the word id word = my_parser.get_token( word_id).get_text() #get the word text #simpledict[word]=freqdict[word] #generate a simple tally that ignores whether each LOCATION is valid; #commented out since OpenRefine will be used to validate sentence = "" #start a blank KWIC sentence variable for target in targets: #iterate through all targets in the references element for the entity node tid = int( my_parser.get_term( target.attrib["id"]).get_node().find( "span").find("target").attrib["id"] [1:]) #get the token id for the target for x in range( tid - 15, tid + 15
import csv
import sys

from KafNafParserPy import KafNafParser
from naflib import *
from naflib import sort_terms

# Load the climate-word list up front.  A `with` block guarantees the CSV
# handle is closed (the original left it open until interpreter exit).
# NOTE(review): `woorden` is not used anywhere in this visible script —
# confirm whether it is needed or is a leftover.
with open("klimaatwoorden.csv") as klimaat_fh:
    woorden = [r['original'] for r in csv.DictReader(klimaat_fh)]

# Emit one CSV row per entity reference found in each NAF file passed on
# the command line.
o = csv.writer(sys.stdout)
# NOTE(review): the header declares 6 columns but each data row below
# writes only 5 values ("dbpedia" has no corresponding field) — confirm
# intended output format before relying on column alignment.
o.writerow(["file", "sentence", "entity", "type", "dbpedia", "text"])
for fn in sys.argv[1:]:
    naf = KafNafParser(fn)
    for e in naf.get_entities():
        for ref in e.get_references():
            # Terms covered by this reference span, in sentence order.
            terms = sort_terms(
                naf, [naf.get_term(t.get_id()) for t in ref.get_span()])
            o.writerow([
                fn,
                get_sentence(naf, terms[0]),
                e.get_id(),
                e.get_type(),
                " ".join(t.get_lemma() for t in terms),
            ])
# NOTE(review): fragment — begins mid-loop (the `for role ...` header is
# outside this view) and ends at a dangling `else:`, so the code line below
# is kept byte-identical rather than reflowed.
# What the visible code shows: first a scan tracking the highest numeric
# role id seen (role ids look like 'rN'; the leading character is stripped
# before int conversion — TODO confirm id format).  Then, for SRL predicates
# that have no <role> children, each span term with POS 'noun' is checked:
# its 'hd/mod' dependents with POS 'prep' are labelled 'Arg1' when the
# preposition's lemma is 'van', otherwise presumably 'ArgM' — the else
# branch is truncated; confirm against the full file.
role_number = role.get_id()[1:] if int(role_number) > role_index: role_index = int(role_number) depextractor = my_parser.get_dependency_extractor() deps = depextractor.relations_for_term # Extract nominal predicates from SRL layer for pred in my_parser.get_predicates(): #revision version 2: check if the predicate has any roles if len(pred.node.findall('role')) == 0: #retrieve the predicate's span for span_obj in pred.get_span(): term_id = span_obj.get_id() term = my_parser.get_term(term_id) #Version 2 changes: #1. double check if we're dealing with a nominal predicate #2. go through dependencies and create roles for all that are pp dependencies #3. if head is 'van' dependency is arg1, else it is argM if term.get_pos() == 'noun': #check modifiers of the noun if term_id in deps: for dep in deps.get(term_id): if 'hd/mod' in dep[0]: modterm = my_parser.get_term(dep[1]) if modterm.get_pos() == 'prep': if modterm.get_lemma() == 'van': my_role = 'Arg1' else: