def _get_text(a, to_naf=False, lang='nl'):
    """Return the normalized headline and text of article *a*.

    The headline and text are each passed through ``_normalize`` and joined
    with a blank line. When *to_naf* is true, that text becomes the raw layer
    of a new NAF 3.0 document and the serialized XML is returned instead.

    Args:
        a: Mapping that must contain 'headline' and 'text' (and 'uuid' when
            *to_naf* is true). The keys 'author', 'date', 'medium', 'page'
            and 'section' are copied into the NAF file descriptor when present.
        to_naf: If true, return NAF XML rather than the plain text.
        lang: Language code set on the NAF document.

    Returns:
        The plain text, or the NAF document dumped and decoded as UTF-8.
    """
    plain_text = "\n\n".join(_normalize(a[key]) for key in ('headline', 'text'))
    if not to_naf:
        return plain_text

    naf = KafNafParser(type="NAF")
    # Create the header explicitly and attach its node as the first child
    # of the document root.
    naf.header = CHeader(type=naf.type)
    naf.root.insert(0, naf.header.get_node())
    naf.set_language(lang)
    naf.set_raw(plain_text)
    naf.set_version("3.0")

    # Optional article fields and the file-descriptor setter each one feeds.
    # Setters are looked up by name only when the field is present.
    descriptor_fields = (
        ('author', 'set_author'),
        ('headline', 'set_title'),
        ('date', 'set_creationtime'),
        ('medium', 'set_magazine'),
        ('page', 'set_pages'),
        ('section', 'set_section'),
    )
    file_desc = CfileDesc()
    for key, setter_name in descriptor_fields:
        if key not in a:
            continue
        value = a[key]
        if key == 'page':
            # The page value is stored as a string.
            value = str(value)
        getattr(file_desc, setter_name)(value)

    naf.header.set_fileDesc(file_desc)
    naf.header.set_publicId(a['uuid'])
    #if 'url' in a:
    #    naf.header.set_uri(a['url'])

    buffer = BytesIO()
    naf.dump(buffer)
    return buffer.getvalue().decode("utf-8")
def create_naf(text, lang="nl"):
    """Create a new NAF 3.0 document whose raw layer holds *text*.

    Args:
        text: The raw text to store in the document.
        lang: Language code for the document. Defaults to "nl", the value
            that was previously hard-coded, so existing callers are unaffected.

    Returns:
        The newly created ``KafNafParser`` object.
    """
    naf = KafNafParser(type="NAF")
    naf.set_version("3.0")
    naf.set_language(lang)
    # The language and raw text are also mirrored on the parser object's own
    # attributes, in addition to being written through the setters.
    naf.lang = lang
    naf.raw = text
    naf.set_raw(naf.raw)
    return naf
def get_naf_from_sentences(sentences):
    """Build a NAF document with a text layer and a term layer from tokens.

    The raw text is made by joining the tokens of each sentence with a single
    space and the sentences with a newline. Every token becomes one ``wf``
    element (ids ``w1``, ``w2``, ...) in paragraph '1', with a 1-based
    sentence number, and one term (ids ``t_0``, ``t_1``, ...) whose span is
    exactly that token.

    Token offsets are computed from the known layout of the raw text rather
    than by searching for the token string. Searching from the previous
    token's start gave wrong offsets for repeated tokens (``["a", "a"]``) or
    tokens contained in their predecessor (``["cat", "a"]``).

    Args:
        sentences: Sequence of sentences, each a sequence of token strings.
            It is iterated more than once, so it must not be a one-shot
            iterator.

    Returns:
        A tuple ``(naf_obj, term_ids)`` where ``term_ids`` holds, per
        sentence, the list of term ids created for its tokens.
    """
    naf_obj = KafNafParser(type="NAF")
    naf_obj.set_version("3.0")
    naf_obj.set_language("nl")
    naf_obj.lang = "nl"
    lines = [' '.join(s) for s in sentences]
    naf_obj.raw = '\n'.join(lines)
    naf_obj.set_raw(naf_obj.raw)

    # Create text layer
    wcount = 1
    token_ids = []
    line_start = 0  # offset in the raw text where the current sentence begins
    for sid, (sentence, line) in enumerate(zip(sentences, lines)):
        token_ids_sub = []
        cursor = line_start
        for token in sentence:
            token_obj = KafNafParserPy.Cwf(type=naf_obj.get_type())
            token_id = 'w{}'.format(wcount)
            token_obj.set_id(token_id)
            token_obj.set_length(str(len(token)))
            token_obj.set_para('1')
            token_obj.set_sent(str(sid + 1))
            token_obj.set_text(token)
            token_obj.set_offset(str(cursor))
            # Skip this token plus the single separator that follows it.
            cursor += len(token) + 1
            token_ids_sub.append(token_id)
            wcount += 1
            naf_obj.add_wf(token_obj)
        token_ids.append(token_ids_sub)
        # Advance past this sentence's text and the newline joining it to the
        # next one; this also stays correct for empty sentences.
        line_start += len(line) + 1

    # Create term layer: one term per token, spanning just that token.
    logger.info('Creating the term layer...')
    term_ids = []
    count_terms = 0
    for token_ids_sub in token_ids:
        term_ids_sub = []
        for token_id in token_ids_sub:
            new_term_id = 't_' + str(count_terms)
            count_terms += 1
            term_ids_sub.append(new_term_id)
            term_obj = KafNafParserPy.Cterm(type=naf_obj.get_type())
            term_obj.set_id(new_term_id)
            new_span = KafNafParserPy.Cspan()
            new_span.create_from_ids([token_id])
            term_obj.set_span(new_span)
            naf_obj.add_term(term_obj)
        term_ids.append(term_ids_sub)

    return naf_obj, term_ids
def get_naf(input_filename):
    """Load *input_filename* as NAF, or wrap its contents as raw text.

    The file is first parsed as a NAF document. If that fails with an XML
    syntax error, the file is read as UTF-8 text. Text that contains both
    "<NAF" and "</NAF>" is treated as a broken NAF file: the error is logged
    and re-raised. Any other text is placed in the raw layer of a new NAF 3.0
    document with language "nl".

    Args:
        input_filename: Path of the file to load.

    Returns:
        A ``KafNafParser`` object, either parsed from the file or newly
        created around the file's text.

    Raises:
        XMLSyntaxError: If the file fails to parse but looks like NAF.
    """
    try:
        naf = KafNafParser(input_filename)
    except XMLSyntaxError:
        # Read with an explicit encoding instead of the platform default so
        # the fallback behaves the same on every machine.
        with open(input_filename, encoding="utf-8") as input_file:
            contents = input_file.read()
        if "<NAF" in contents and "</NAF>" in contents:
            # I'm guessing this should be a NAF file but something is wrong
            logger.exception("Error parsing NAF file")
            raise
        naf = KafNafParser(type="NAF")
        naf.set_version("3.0")
        naf.set_language("nl")
        naf.lang = "nl"
        naf.raw = contents
        naf.set_raw(naf.raw)
    return naf
def get_naf(input_file):
    """Read *input_file* and return its contents as a NAF document.

    The bytes read from *input_file* are parsed as NAF. If parsing fails with
    an XML syntax error, the bytes are decoded as UTF-8. Text containing both
    "<NAF" and "</NAF>" is assumed to be a malformed NAF file, so the error
    is logged and re-raised. Otherwise the text is stored as the raw layer of
    a new NAF 3.0 document with language "nl".

    Args:
        input_file: Object whose ``read()`` returns the file content as bytes.

    Returns:
        A ``KafNafParser`` object.

    Raises:
        XMLSyntaxError: If the content fails to parse but looks like NAF.
    """
    payload = input_file.read()
    try:
        return KafNafParser(BytesIO(payload))
    except XMLSyntaxError:
        text = payload.decode("utf-8")
        looks_like_naf = "<NAF" in text and "</NAF>" in text
        if looks_like_naf:
            # I'm guessing this should be a NAF file but something is wrong
            logging.exception("Error parsing NAF file")
            raise
        wrapper = KafNafParser(type="NAF")
        wrapper.set_version("3.0")
        wrapper.set_language("nl")
        wrapper.lang = "nl"
        wrapper.raw = text
        wrapper.set_raw(wrapper.raw)
        return wrapper