def create_naf(text, language="nl"):
    """Create a new NAF (version 3.0) document whose raw layer is ``text``.

    Args:
        text: Raw text to store in the document.
        language: Language code recorded on the document. Defaults to
            ``"nl"``, the value that used to be hard-coded here.

    Returns:
        The newly created ``KafNafParser`` object.
    """
    naf = KafNafParser(type="NAF")
    naf.set_version("3.0")
    naf.set_language(language)
    # The plain attributes are assigned in addition to calling the setters,
    # exactly as before. Presumably the parser's getters read these cached
    # attributes rather than the XML tree -- confirm against KafNafParserPy.
    naf.lang = language
    naf.raw = text
    naf.set_raw(naf.raw)
    return naf
def get_naf_from_sentences(sentences):
    """Build a NAF document with a text and a term layer from token lists.

    The raw text consists of the tokens of each sentence joined by a single
    space, with the sentences joined by a newline. One ``wf`` element is added
    per token (ids ``w1``, ``w2``, ...; paragraph ``1``; 1-based sentence
    number) and one term per token (ids ``t_0``, ``t_1``, ...) whose span is
    that single token.

    Args:
        sentences: Iterable of sentences, each an iterable of token strings.

    Returns:
        A ``(naf_obj, term_ids)`` tuple. ``term_ids`` contains one list per
        sentence holding the term ids of that sentence in token order.
    """
    # Materialise the input: it is walked twice (raw text, then text layer).
    sentences = [list(sentence) for sentence in sentences]
    raw_text = '\n'.join(' '.join(sentence) for sentence in sentences)
    naf_obj = create_naf(raw_text)
    naf_type = naf_obj.get_type()

    # Create text layer.
    # Offsets are tracked with a running cursor instead of searching the raw
    # text: a search that starts at the previous token's offset returns the
    # wrong position for repeated tokens and for tokens that occur inside the
    # preceding token.
    token_ids = []
    wcount = 1
    cursor = 0
    for sent_num, sentence in enumerate(sentences, start=1):
        token_ids_sub = []
        for position, token in enumerate(sentence):
            if position:
                cursor += 1  # the single space between two tokens
            token_id = 'w{}'.format(wcount)
            token_obj = KafNafParserPy.Cwf(type=naf_type)
            token_obj.set_id(token_id)
            token_obj.set_length(str(len(token)))
            token_obj.set_para('1')
            token_obj.set_sent(str(sent_num))
            token_obj.set_text(token)
            token_obj.set_offset(str(cursor))
            naf_obj.add_wf(token_obj)
            token_ids_sub.append(token_id)
            cursor += len(token)
            wcount += 1
        cursor += 1  # the newline between two sentences
        token_ids.append(token_ids_sub)

    # Create term layer: one term per token, spanning exactly that token.
    logger.info('Creating the term layer...')
    term_ids = []
    count_terms = 0
    for token_ids_sub in token_ids:
        term_ids_sub = []
        for token_id in token_ids_sub:
            new_term_id = 't_' + str(count_terms)
            count_terms += 1
            term_obj = KafNafParserPy.Cterm(type=naf_type)
            term_obj.set_id(new_term_id)
            new_span = KafNafParserPy.Cspan()
            new_span.create_from_ids([token_id])
            term_obj.set_span(new_span)
            naf_obj.add_term(term_obj)
            term_ids_sub.append(new_term_id)
        term_ids.append(term_ids_sub)

    return naf_obj, term_ids
# NOTE(review): a second `get_naf`, taking an open file object instead of a
# file name, is defined later in this file. If both live at module level the
# later definition replaces this one at import time -- confirm which one is
# intended and rename or remove the other.
def get_naf(input_filename):
    """Load ``input_filename`` as NAF, or wrap its contents as raw text.

    The file is first parsed as a NAF document. If that fails with an XML
    syntax error and the content does not look like NAF, the file is read as
    UTF-8 text and a new NAF document is created with that text as its raw
    layer.

    Args:
        input_filename: Path of the file to load.

    Returns:
        A ``KafNafParser`` object.

    Raises:
        XMLSyntaxError: If parsing fails although the file contains both
            ``<NAF`` and ``</NAF>``; the error is logged and re-raised.
    """
    try:
        return KafNafParser(input_filename)
    except XMLSyntaxError:
        # Read explicitly as UTF-8 rather than with the platform default, in
        # line with the file-object variant of this function.
        with open(input_filename, encoding="utf-8") as input_file:
            text = input_file.read()
        if "<NAF" in text and "</NAF>" in text:
            # Presumably meant to be a NAF file that is simply malformed, so
            # do not silently fall back to treating it as plain text.
            logger.exception("Error parsing NAF file")
            raise
        return create_naf(text)
# NOTE(review): an earlier `get_naf` in this file takes a file name instead of
# a file object. If both live at module level this definition replaces the
# earlier one at import time -- confirm which one is intended and rename or
# remove the other.
def get_naf(input_file):
    """Load NAF from a binary file object, or wrap its contents as raw text.

    The complete content is read and parsed as a NAF document. If that fails
    with an XML syntax error and the content does not look like NAF, the bytes
    are decoded as UTF-8 and a new NAF document is created with that text as
    its raw layer.

    Args:
        input_file: File-like object whose ``read()`` returns ``bytes``.

    Returns:
        A ``KafNafParser`` object.

    Raises:
        XMLSyntaxError: If parsing fails although the content contains both
            ``<NAF`` and ``</NAF>``; the error is logged and re-raised.
    """
    data = input_file.read()
    try:
        return KafNafParser(BytesIO(data))
    except XMLSyntaxError:
        text = data.decode("utf-8")
        if "<NAF" in text and "</NAF>" in text:
            # Presumably meant to be a NAF file that is simply malformed, so
            # do not silently fall back to treating it as plain text.
            logger.exception("Error parsing NAF file")
            raise
        return create_naf(text)