def _get_text(a, to_naf=False, lang='nl'):
    """Return the article's headline and text joined by a blank line.

    When *to_naf* is true, wrap the text in a NAF document instead (raw
    layer plus a fileDesc header built from the article metadata) and
    return the serialized XML as a unicode string.

    :param a: article dict; must contain 'headline', 'text' and (for NAF
        output) 'uuid'; other metadata keys are used when present
    :param to_naf: serialize as NAF XML instead of returning plain text
    :param lang: language code recorded in the NAF header
    """
    result = "\n\n".join([_normalize(a[x]) for x in ('headline', 'text')])
    if not to_naf:
        return result

    naf = KafNafParser(type="NAF")
    naf.header = CHeader(type=naf.type)
    naf.root.insert(0, naf.header.get_node())  # header must be the first child
    naf.set_language(lang)
    naf.set_raw(result)
    naf.set_version("3.0")

    # Copy optional article metadata into the fileDesc element, in the
    # same order as before so the serialized attribute order is stable.
    fd = CfileDesc()
    for key, setter, convert in (('author', fd.set_author, None),
                                 ('headline', fd.set_title, None),
                                 ('date', fd.set_creationtime, None),
                                 ('medium', fd.set_magazine, None),
                                 ('page', fd.set_pages, str),
                                 ('section', fd.set_section, None)):
        if key in a:
            value = a[key]
            setter(convert(value) if convert else value)
    naf.header.set_fileDesc(fd)
    naf.header.set_publicId(a['uuid'])
    # NOTE(review): setting the uri from a['url'] was deliberately disabled:
    # naf.header.set_uri(a['url'])

    buf = BytesIO()
    naf.dump(buf)
    return buf.getvalue().decode("utf-8")
def map_opinion_labels(input_file, output_file, config_file):
    """Rewrite opinion polarity labels of a KAF/NAF file using a config mapping.

    The config file's ``valid_opinions`` section maps each target label
    (option name) to a ';'-separated list of corpus labels (option value).
    Opinions whose polarity has no mapping are removed from the output.

    :param input_file: path to the input KAF/NAF file
    :param output_file: path the relabelled document is written to
    :param config_file: INI file with the ``valid_opinions`` section
    """
    # Build corpus-label -> target-label lookup from the config file.
    parser = ConfigParser.ConfigParser()
    parser.read(config_file)
    mapping = {}
    for mapped_opinion, values_in_corpus in parser.items('valid_opinions'):
        for corpus_value in values_in_corpus.split(';'):
            if corpus_value != '':
                mapping[corpus_value] = mapped_opinion
    del parser
    ##################
    input_kaf = KafNafParser(input_file)
    remove_these = []
    for opinion in input_kaf.get_opinions():
        expression = opinion.get_expression()
        polarity = expression.get_polarity()
        if polarity in mapping:
            expression.set_polarity(mapping[polarity])
        else:
            # Unmapped polarity: keep the label as-is but schedule the
            # whole opinion for removal.
            remove_these.append(opinion.get_id())
            expression.set_polarity(polarity)
    for opinion_id in remove_these:
        input_kaf.remove_this_opinion(opinion_id)
    input_kaf.dump(output_file)
def main(argv): conversion = "" try: opts, args = getopt.getopt(argv,"hkn",["tokaf","tonaf"]) except getopt.GetoptError: print 'could not parse options. Correct usage: \n\n kaf-naf-parser.py --tokaf --tonaf' sys.exit(2) for opt, arg in opts: if opt == '-h': print 'test.py --tokaf --tonaf' sys.exit() elif opt in ("-k", "--tokaf"): conversion = "to-kaf" elif opt in ("-n", "--tonaf"): conversion = "to-naf" if conversion == "": conversion = "kaf-naf" obj = KafNafParser(sys.stdin) if conversion == "to-kaf": obj.to_kaf() if conversion == "to-naf": obj.to_naf() obj.dump()
def map_opinion_labels(input_file, output_file, config_file):
    """Rewrite opinion polarity labels of a KAF/NAF file using a config mapping.

    The config's ``valid_opinions`` section maps each target label (option
    name) to a ';'-separated list of corpus labels (option value). Opinions
    whose polarity is not covered by the mapping are removed.

    :param input_file: path to the input KAF/NAF file
    :param output_file: path the relabelled document is written to
    :param config_file: INI file with the ``valid_opinions`` section
    """
    # Load the mapping from the config_file
    mapping = {}
    parser = ConfigParser.ConfigParser()
    parser.read(config_file)
    for mapped_opinion, values_in_corpus in parser.items('valid_opinions'):
        # Empty fragments from the ';'-split are skipped.
        values = [v for v in values_in_corpus.split(';') if v != '']
        for v in values:
            mapping[v] = mapped_opinion
    del parser
    ##################
    input_kaf = KafNafParser(input_file)
    remove_these = []
    for opinion in input_kaf.get_opinions():
        exp = opinion.get_expression()
        polarity = exp.get_polarity()
        if polarity in mapping:
            mapped_polarity = mapping[polarity]
        else:
            # Unknown polarity: keep the label but remove the opinion below.
            opi_id = opinion.get_id()
            remove_these.append(opi_id)
            mapped_polarity = polarity
        exp.set_polarity(mapped_polarity)
    for opi_id in remove_these:
        input_kaf.remove_this_opinion(opi_id)
    input_kaf.dump(output_file)
def corenlp2naf(xml_bytes, annotators):
    """Convert CoreNLP XML output into a serialized NAF document.

    Builds the token/term layer, named entities, dependencies and
    coreference chains from the parsed CoreNLP document, records one
    linguistic processor per known annotator, and returns the dumped NAF.

    :param xml_bytes: raw CoreNLP XML output
    :param annotators: annotator names that were run; only those present
        in LAYERMAP are recorded as linguistic processors
    :return: the NAF document serialized as bytes
    :raises Exception: re-raises whatever Document() raises on bad XML
    """
    naf = KafNafParser(type="NAF")
    try:
        doc = Document(xml_bytes)
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not intercepted; the parse error is logged and re-raised.
        log.exception("Error on parsing xml")
        raise

    terms = {}  # (xml_sentid, xml_tokenid) : term
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            # (removed an unused `dependencies = True` flag here)
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                if dep.type != 'root':
                    child = terms[sent.id, dep.dependent.idx]
                    parent = terms[sent.id, dep.governor.idx]
                    comment = "{t}({o}, {s})".format(s=child.get_lemma(), t=dep.type,
                                                     o=parent.get_lemma())
                    naf.create_dependency(child.get_id(), parent.get_id(),
                                          dep.type, comment=comment)

    if doc.coreferences:
        for coref in doc.coreferences:
            # Collect the term ids of every mention in the chain.
            cterms = set()
            for m in coref.mentions:
                cterms |= {terms[m.sentence.id, t.id].get_id() for t in m.tokens}
            naf.create_coreference("term", cterms)

    for annotator in annotators:
        if annotator in LAYERMAP:
            # Explicit kwarg instead of the fragile format(**locals()).
            naf.create_linguistic_processor(
                LAYERMAP[annotator],
                "CoreNLP {annotator}".format(annotator=annotator),
                get_corenlp_version())

    s = BytesIO()
    naf.dump(s)
    return s.getvalue()
def corenlp2naf(xml_bytes, annotators):
    """Build a NAF document from CoreNLP XML output and return it serialized.

    Creates tokens/terms, entities, dependencies and coreference chains,
    then records a linguistic processor for each annotator known to
    LAYERMAP.
    """
    naf = KafNafParser(type="NAF")
    try:
        doc = Document(xml_bytes)
    except:
        log.exception("Error on parsing xml")
        raise

    terms = {}  # (xml_sentid, xml_tokenid) : term
    for sentence in doc.sentences:
        for token in sentence.tokens:
            wf = naf.create_wf(token.word, sentence.id, token.character_offset_begin)
            new_term = naf.create_term(token.lemma, POSMAP[token.pos], token.pos, [wf])
            terms[sentence.id, token.id] = new_term
            if token.ner not in (None, 'O'):
                naf.create_entity(token.ner, [new_term.get_id()])
        if sentence.collapsed_ccprocessed_dependencies:
            dependencies = True
            for link in sentence.collapsed_ccprocessed_dependencies.links:
                if link.type == 'root':
                    continue
                dependent = terms[sentence.id, link.dependent.idx]
                governor = terms[sentence.id, link.governor.idx]
                comment = "{t}({o}, {s})".format(t=link.type,
                                                 o=governor.get_lemma(),
                                                 s=dependent.get_lemma())
                naf.create_dependency(dependent.get_id(), governor.get_id(),
                                      link.type, comment=comment)

    if doc.coreferences:
        for chain in doc.coreferences:
            chain_terms = set()
            for mention in chain.mentions:
                for token in mention.tokens:
                    chain_terms.add(terms[mention.sentence.id, token.id].get_id())
            naf.create_coreference("term", chain_terms)

    for annotator in annotators:
        if annotator in LAYERMAP:
            naf.create_linguistic_processor(
                LAYERMAP[annotator],
                "CoreNLP {annotator}".format(annotator=annotator),
                get_corenlp_version())

    out = BytesIO()
    naf.dump(out)
    return out.getvalue()
def test_dump():
    """ Can we use naf.dump() to stdout and file?
    Make sure the run with nosetests -s, otherwise python3 will err
    """
    naf = KafNafParser(type="NAF")
    naf.create_wf("\xd8lleg\xe5rd", 1, 1)
    expected = '<![CDATA[\xd8lleg\xe5rd]]></wf>'

    # do we get an error on dumping to stdout without redirect?
    naf.dump()

    # Can we dump to stdout?
    with capture_stdout() as s:
        naf.dump()
    output = s.getvalue().decode("utf-8")
    assert_in(expected, output)

    # Can we dump to a named file?
    f = tempfile.NamedTemporaryFile(suffix=".xml", delete=False)
    try:
        naf.dump(f.name)
        f.close()
        # Context manager closes the read handle deterministically
        # (the original left it open until garbage collection).
        with open(f.name, mode='rb') as dumped:
            output = dumped.read().decode('utf-8')
    finally:
        os.remove(f.name)
    assert_in(expected, output)
def test_header():
    """Check header attribute setters/getters and their serialization.

    Run with ``nosetests -s``, otherwise python3 will err.
    """
    naf = KafNafParser(type="NAF")
    naf.header = CHeader(type=naf.type)
    naf.root.insert(0, naf.header.get_node())

    # Setters should be visible through the getters right away.
    for setter, getter, value in (
            (naf.header.set_uri, naf.header.get_uri, "http://example.com"),
            (naf.header.set_publicId, naf.header.get_publicId, "123")):
        setter(value)
        assert_equal(value, getter())

    # Round-trip through an in-memory buffer: values must survive
    # serialization and reparsing.
    buf = BytesIO()
    naf.dump(buf)
    buf.seek(0)
    reparsed = KafNafParser(buf, type="NAF")
    assert_equal("http://example.com", reparsed.header.get_uri())
    assert_equal("123", reparsed.header.get_publicId())
# NOTE(review): fragment of a larger SRL-annotation script — `role`,
# `role_span`, `target_term_id`, `arg_head`, `arg_token`, `pred_id`,
# `predicate_switch`, `pred_counter` and `my_parser` are defined outside
# this excerpt.
if target_term_id == arg_head:
    # This target is the argument's head token: mark it explicitly.
    head_target = Ctarget()
    head_target.set_id(target_term_id)
    head_target.set_head('yes')
    role_span.add_target(head_target)
else:
    role_span.add_target_id('t_' + str(arg_token))
role.set_span(role_span)
# Here you add a role to an existing predicate
for predicate in my_parser.get_predicates():
    for span in predicate.get_span():
        #print items[4], 'test',span.get_id(), items[0]
        if span.get_id() == pred_id:
            predicate.add_role(role)
            predicate_switch = 1  # remember the role found an existing home
            # NOTE(review): break exits only the inner span loop; if several
            # predicates share this id the role is added to each — confirm.
            break
# or you create a new predicate
if predicate_switch == 0:
    new_predicate = Cpredicate()
    pred_counter = pred_counter + 1
    new_predicate.set_id('pr' + str(pred_counter))
    predicate_span = Cspan()
    predicate_span.add_target_id(pred_id)
    new_predicate.set_span(predicate_span)
    new_predicate.add_role(role)
    my_parser.add_predicate(new_predicate)
# and you print the whole thing to a file
my_parser.dump()
# NOTE(review): fragment of a TreeTagger-to-KAF/NAF term-layer script —
# `id_new`, `not_use`, `span`, `terms_for_token`, `data`, `lemma`,
# `type_term`, `pos_kaf`, `pos`, `model` and `input_obj` come from code
# outside this excerpt; the block structure was reconstructed from a
# collapsed line and should be checked against the original file.
if id_new not in not_use:
    new_lemma = ''
    for tokenid in span:
        # A token mapped to several terms: merge their lemmas into one
        # lowercased multiword lemma and mark those term ids as consumed.
        if len(terms_for_token[tokenid]) > 1:
            new_lemma += (''.join(data[t][2] for t in terms_for_token[tokenid])).lower()
            not_use |= set(terms_for_token[tokenid])
    if new_lemma != '':
        lemma = new_lemma
    ###############
    # Build the term element and attach it to the document.
    new_term = Cterm(type=input_obj.get_type())
    new_term.set_id(id_new)
    new_term.set_type(type_term)
    new_term.set_pos(pos_kaf)
    new_term.set_morphofeat(pos)
    new_term.set_lemma(lemma)
    term_span = Cspan()
    term_span.create_from_ids(span)
    new_term.set_span(term_span)
    input_obj.add_term(new_term)
##End for each sentence
# Record this processor in the header and emit the document on stdout.
my_lp = Clp()
# NOTE(review): 'Treetagger model'+model lacks a separating space — confirm intended.
my_lp.set_name('Treetagger model'+model)
my_lp.set_version(__version__)
my_lp.set_timestamp()
input_obj.add_linguistic_processor('term', my_lp)
input_obj.dump(sys.stdout)
# NOTE(review): ad-hoc polarity-classifier evaluation script fragment;
# `PolarityClassifier` and `KafNafParser` are imported outside this excerpt.
files = []
fd = open('nl.list.test')
for line in fd:
    files.append(line.strip())
fd.close()
my_polarity_classifier = PolarityClassifier('nl')
# Model directory is taken from the first CLI argument.
my_polarity_classifier.load_models(sys.argv[1])
# NOTE(review): counters initialized to 1, not 0 — confirm intended.
OK = WR = 1
for example_file in files:
    this_obj = KafNafParser(example_file)
    my_polarity_classifier.classify_kaf_naf_object(this_obj)
    this_obj.dump()
    break  # only the first file is processed
# Collect gold polarities and the (opinion id, term ids) pairs to classify.
GOLD = {}
list_ids_term_ids = []
for opinion in this_obj.get_opinions():
    op_exp = opinion.get_expression()
    polarity = op_exp.get_polarity()
    term_ids = op_exp.get_span().get_span_ids()
    list_ids_term_ids.append((opinion.get_id(), term_ids))
    GOLD[opinion.get_id()] = polarity
class_for_opinion_id, features_for_opinion_id = my_polarity_classifier.classify_list_opinions(
    this_obj, list_ids_term_ids)
# NOTE(review): the body of this loop is truncated in this excerpt.
for oid, c in list(class_for_opinion_id.items()):
# NOTE(review): excerpt — the loop that reads `fields` from `fd` and fills
# `synset_for_skey` (sense key -> synset offset) starts before this excerpt.
synset_for_skey[fields[0]] = fields[1]
fd.close()
naf_obj = KafNafParser(sys.stdin)
for term in naf_obj.get_terms():
    this_skey = None
    this_synset = None
    ref_skey = ref_synset = None
    # Locate the sense-key and ili-definition external references of the term.
    for ext_ref in term.get_external_references():
        if ext_ref.get_reftype() == 'sense':
            this_skey = ext_ref.get_reference()
            ref_skey = ext_ref
        if ext_ref.get_reftype() == 'ilidef':
            this_synset = ext_ref.get_reference()
            ref_synset = ext_ref
    # Empty ilidef: try repairing the sense key by swapping the satellite
    # ('%5:') and plain-adjective ('%3:') lexical-sense markers.
    if this_synset == '':
        print >> sys.stderr, term.get_id()
        # NOTE(review): if no 'sense' reference was found this_skey is None
        # and the `in` tests below would raise TypeError — confirm inputs.
        if '%3:' in this_skey:
            this_skey = this_skey.replace('%3:', '%5:')
        elif '%5:' in this_skey:
            this_skey = this_skey.replace('%5:', '%3:')
        this_synset = synset_for_skey.get(this_skey)
        if this_synset is not None:
            ref_skey.set_reference(this_skey)
            ref_synset.set_reference('ili-30-%s-a' % this_synset)
naf_obj.dump()
# NOTE(review): excerpt of a WordNet synset-mapping script — `obj`, `args`,
# `mapping` and the POS constants ADJ/NOUN/ADV/VERB are defined outside
# this excerpt.
for term in obj.get_terms():
    # Find the source synset reference produced by the input resource.
    source_synset = None
    for ext_ref in term.get_external_references():
        if ext_ref.get_resource() == args.input_res_label and ext_ref.get_reftype() == 'synset':
            source_synset = ext_ref.get_reference()
            break
    if source_synset is not None:
        # References look like '<lang>-<offset>-<pos>', e.g. 'eng30-12345678-n'.
        fields = source_synset.split('-')
        this_synset = fields[1]
        short_pos = fields[2]
        if short_pos == 'a':
            this_pos = ADJ
        elif short_pos == 'n':
            this_pos = NOUN
        elif short_pos == 'r':
            this_pos = ADV
        elif short_pos == 'v':
            this_pos = VERB
        else:
            this_pos = None  # unknown POS letter: skip the term
        if this_pos is not None:
            target_synset = mapping[this_pos].get(this_synset)
            if target_synset is not None:
                # Attach the mapped synset as a new external reference.
                full_reference = 'eng%s-%s-%s' % (args.output_version, target_synset, short_pos)
                new_ref = CexternalReference()
                new_ref.set_reference(full_reference)
                new_ref.set_confidence('1.0')
                new_ref.set_reftype('synset')
                new_ref.set_resource(args.output_res_label)
                term.add_external_reference(new_ref)
obj.dump(args.output_file)
# NOTE(review): duplicate prefix of the polarity-classifier evaluation
# script; `PolarityClassifier` and `KafNafParser` are imported outside
# this excerpt, which ends mid-script.
files = []
fd = open('nl.list.test')
for line in fd:
    files.append(line.strip())
fd.close()
my_polarity_classifier = PolarityClassifier('nl')
# Model directory comes from the first CLI argument.
my_polarity_classifier.load_models(sys.argv[1])
# NOTE(review): counters initialized to 1, not 0 — confirm intended.
OK = WR = 1
for example_file in files:
    this_obj = KafNafParser(example_file)
    my_polarity_classifier.classify_kaf_naf_object(this_obj)
    this_obj.dump()
    break  # only the first file is processed
# Collect gold polarities and the (opinion id, term ids) pairs per opinion.
GOLD = {}
list_ids_term_ids = []
for opinion in this_obj.get_opinions():
    op_exp = opinion.get_expression()
    polarity = op_exp.get_polarity()
    term_ids = op_exp.get_span().get_span_ids()
    list_ids_term_ids.append((opinion.get_id(),term_ids))
    GOLD[opinion.get_id()] = polarity