Example #1
from io import BytesIO

from KafNafParserPy import KafNafParser, CHeader, CfileDesc


def _get_text(a, to_naf=False, lang='nl'):
    # _normalize is defined elsewhere in the source module
    result = "\n\n".join([_normalize(a[x]) for x in ('headline', 'text')])
    if to_naf:
        naf = KafNafParser(type="NAF")
        naf.header = CHeader(type=naf.type)
        naf.root.insert(0, naf.header.get_node())

        naf.set_language(lang)
        naf.set_raw(result)
        naf.set_version("3.0")

        fd = CfileDesc()
        if 'author' in a:
            fd.set_author(a['author'])
        if 'headline' in a:
            fd.set_title(a['headline'])
        if 'date' in a:
            fd.set_creationtime(a['date'])
        if 'medium' in a:
            fd.set_magazine(a['medium'])
        if 'page' in a:
            fd.set_pages(str(a['page']))
        if 'section' in a:
            fd.set_section(a['section'])
        naf.header.set_fileDesc(fd)

        naf.header.set_publicId(a['uuid'])
        #if 'url' in a:
        #    naf.header.set_uri(a['url'])
        b = BytesIO()
        naf.dump(b)
        result = b.getvalue().decode("utf-8")
    return result
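A minimal usage sketch for _get_text (the article dict and its values are illustrative; the keys mirror those the function reads):

article = {
    'headline': 'Example headline',
    'text': 'Body text of the article.',
    'author': 'A. Writer',
    'date': '2016-01-01T00:00:00',
    'medium': 'Example Magazine',
    'page': 1,
    'section': 'news',
    'uuid': 'hypothetical-uuid-1234',
}
naf_xml = _get_text(article, to_naf=True, lang='nl')
print(naf_xml[:200])
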
def map_opinion_labels(input_file, output_file, config_file):
    # Load the label mapping from the config file
    # (requires: import configparser; the module is ConfigParser on Python 2)
    mapping = {}
    parser = configparser.ConfigParser()
    parser.read(config_file)
    for mapped_opinion, values_in_corpus in parser.items('valid_opinions'):
        values = [v for v in values_in_corpus.split(';') if v != '']
        for v in values:
            mapping[v] = mapped_opinion
    del parser
    ##################        
    
    input_kaf = KafNafParser(input_file)
    remove_these = []
    for opinion in input_kaf.get_opinions():
        exp = opinion.get_expression()
        polarity = exp.get_polarity()
        if polarity in mapping:
            mapped_polarity = mapping[polarity]
        else:
            # Polarity not in the mapping: mark the opinion for removal
            # and leave its polarity unchanged.
            opi_id = opinion.get_id()
            remove_these.append(opi_id)
            mapped_polarity = polarity
            
        exp.set_polarity(mapped_polarity)
        
    for opi_id in remove_these:
        input_kaf.remove_this_opinion(opi_id)
    input_kaf.dump(output_file)
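For reference, the [valid_opinions] section that the function reads could look like this (a sketch; the section name comes from the code, the label names are illustrative):

[valid_opinions]
positive = pos;strong_pos
negative = neg;strong_neg

Each option name becomes the mapped label and each semicolon-separated value is a polarity as found in the corpus; opinions whose polarity appears in no list are removed from the output.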
def main(argv):
    # Requires: import sys, getopt; from KafNafParserPy import KafNafParser
    conversion = ""
    try:
        opts, args = getopt.getopt(argv, "hkn", ["tokaf", "tonaf"])
    except getopt.GetoptError:
        print('could not parse options. Correct usage: \n\n kaf-naf-parser.py --tokaf --tonaf')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('kaf-naf-parser.py --tokaf --tonaf')
            sys.exit()
        elif opt in ("-k", "--tokaf"):
            conversion = "to-kaf"
        elif opt in ("-n", "--tonaf"):
            conversion = "to-naf"

    if conversion == "":
        conversion = "kaf-naf"

    obj = KafNafParser(sys.stdin)

    if conversion == "to-kaf":
        obj.to_kaf()
    if conversion == "to-naf":
        obj.to_naf()

    obj.dump()
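The script above reads a KAF or NAF document from stdin and writes the converted result to stdout, so a typical invocation is:

python kaf-naf-parser.py --tonaf < input.kaf > output.naf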
Example #5
def corenlp2naf(xml_bytes, annotators):
    """
    Call from on the text and return a Naf object
    """
    naf = KafNafParser(type="NAF")

    try:
        doc = Document(xml_bytes)
    except:
        log.exception("Error on parsing xml")
        raise

    terms = {}  # (xml_sentid, xml_tokenid) : term
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            dependencies = True  # set but never read; kept from the original code
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                if dep.type != 'root':
                    child = terms[sent.id, dep.dependent.idx]
                    parent = terms[sent.id, dep.governor.idx]
                    comment = "{t}({o}, {s})".format(s=child.get_lemma(),
                                                     t=dep.type,
                                                     o=parent.get_lemma())
                    naf.create_dependency(child.get_id(),
                                          parent.get_id(),
                                          dep.type,
                                          comment=comment)

    if doc.coreferences:
        for coref in doc.coreferences:
            cterms = set()
            for m in coref.mentions:
                cterms |= {
                    terms[m.sentence.id, t.id].get_id()
                    for t in m.tokens
                }
            naf.create_coreference("term", cterms)

    for annotator in annotators:
        if annotator in LAYERMAP:
            naf.create_linguistic_processor(
                LAYERMAP[annotator], "CoreNLP {annotator}".format(**locals()),
                get_corenlp_version())
    s = BytesIO()
    naf.dump(s)
    return s.getvalue()
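POSMAP is not shown in this example; it maps CoreNLP's Penn Treebank part-of-speech tags to the single-letter KAF/NAF pos labels (N common noun, R proper noun, V verb, G adjective, A adverb, D determiner, P preposition, C conjunction). A minimal illustrative sketch, not the module's actual table:

POSMAP = {
    'NN': 'N', 'NNS': 'N',      # common nouns
    'NNP': 'R', 'NNPS': 'R',    # proper nouns
    'VB': 'V', 'VBD': 'V', 'VBG': 'V', 'VBN': 'V', 'VBP': 'V', 'VBZ': 'V',
    'JJ': 'G', 'JJR': 'G', 'JJS': 'G',   # adjectives
    'RB': 'A', 'RBR': 'A', 'RBS': 'A',   # adverbs
    'DT': 'D', 'IN': 'P', 'CC': 'C',
}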
Example #8
def test_dump():
    """
    Can we use naf.dump() to stdout and file?

    Make sure the run with nosetests -s, otherwise python3 will err
    """

    naf = KafNafParser(type="NAF")
    token = naf.create_wf("\xd8lleg\xe5rd", 1, 1)
    expected = '<![CDATA[\xd8lleg\xe5rd]]></wf>'

    # do we get an error on dumping to stdout without redirect?
    naf.dump()
    
    # Can we dump to stdout?
    with capture_stdout() as s:
        naf.dump()
    output = s.getvalue().decode("utf-8")
    assert_in(expected, output)
    
    # Can we dump to a named file?
    f = tempfile.NamedTemporaryFile(suffix=".xml", delete=False)
    try:
        naf.dump(f.name)
        f.close()
        with open(f.name, mode='rb') as fh:
            output = fh.read().decode('utf-8')
    finally:
        os.remove(f.name)
    assert_in(expected, output)
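capture_stdout is a test helper that is not shown here (assert_in comes from nose.tools; tempfile and os need the usual imports). A minimal sketch of what such a helper could look like, as an assumption rather than the test suite's actual implementation:

import sys
from contextlib import contextmanager
from io import BytesIO

@contextmanager
def capture_stdout():
    # Temporarily swap sys.stdout for an in-memory bytes buffer so the
    # output of naf.dump() can be inspected after the with-block.
    orig, buf = sys.stdout, BytesIO()
    sys.stdout = buf
    try:
        yield buf
    finally:
        sys.stdout = orig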
Example #9
def test_header():
    """
    Do the functions to set header attributes work correctly?

    Make sure the run with nosetests -s, otherwise python3 will err
    """

    naf = KafNafParser(type="NAF")
    naf.header = CHeader(type=naf.type)
    naf.root.insert(0, naf.header.get_node())

    naf.header.set_uri("http://example.com")
    assert_equal("http://example.com", naf.header.get_uri())
    naf.header.set_publicId("123")
    assert_equal("123", naf.header.get_publicId())

    # test if properties are serialized/deserialized correctly
    b = BytesIO()
    naf.dump(b)
    b.seek(0)
    naf2 = KafNafParser(b, type="NAF")
    assert_equal("http://example.com", naf2.header.get_uri())
    assert_equal("123", naf2.header.get_publicId())
Example #11
                # Excerpt from a larger loop: role, role_span, pred_id and
                # predicate_switch are defined earlier in the source file.
                if target_term_id == arg_head:
                    head_target = Ctarget()
                    head_target.set_id(target_term_id)
                    head_target.set_head('yes')
                    role_span.add_target(head_target)
                else:
                    role_span.add_target_id('t_' + str(arg_token))
        role.set_span(role_span)

        # Here you add a role to an existing predicate
        for predicate in my_parser.get_predicates():
            for span in predicate.get_span():
                #print items[4], 'test',span.get_id(), items[0]
                if span.get_id() == pred_id:
                    predicate.add_role(role)
                    predicate_switch = 1
                    break
        # or you create a new predicate
        if predicate_switch == 0:
            new_predicate = Cpredicate()
            pred_counter = pred_counter + 1
            new_predicate.set_id('pr' + str(pred_counter))
            predicate_span = Cspan()
            predicate_span.add_target_id(pred_id)
            new_predicate.set_span(predicate_span)
            new_predicate.add_role(role)
            my_parser.add_predicate(new_predicate)

# and you print the whole thing to a file
my_parser.dump()
			if id_new not in not_use:
				new_lemma = ''
				for tokenid in span:
					if len(terms_for_token[tokenid]) > 1:
						new_lemma += (''.join(data[t][2] for t in terms_for_token[tokenid])).lower()
						not_use |= set(terms_for_token[tokenid])
				if new_lemma != '':
					lemma = new_lemma

				###############
				new_term = Cterm(type=input_obj.get_type())
				new_term.set_id(id_new)
				new_term.set_type(type_term)
				new_term.set_pos(pos_kaf)
				new_term.set_morphofeat(pos)
				new_term.set_lemma(lemma)
				term_span = Cspan()
				term_span.create_from_ids(span)
				new_term.set_span(term_span)
				input_obj.add_term(new_term)
	##End for each sentence

	my_lp = Clp()
	my_lp.set_name('Treetagger model ' + model)
	my_lp.set_version(__version__)
	my_lp.set_timestamp()
	input_obj.add_linguistic_processor('term', my_lp)
	input_obj.dump(sys.stdout)


Example #13
    files = []
    fd = open('nl.list.test')
    for line in fd:
        files.append(line.strip())
    fd.close()

    my_polarity_classifier = PolarityClassifier('nl')
    my_polarity_classifier.load_models(sys.argv[1])

    OK = WR = 1
    for example_file in files:
        this_obj = KafNafParser(example_file)

        my_polarity_classifier.classify_kaf_naf_object(this_obj)
        this_obj.dump()

        break  # early exit in the original code; the evaluation below is unreachable

        GOLD = {}
        list_ids_term_ids = []
        for opinion in this_obj.get_opinions():
            op_exp = opinion.get_expression()
            polarity = op_exp.get_polarity()
            term_ids = op_exp.get_span().get_span_ids()
            list_ids_term_ids.append((opinion.get_id(), term_ids))
            GOLD[opinion.get_id()] = polarity

        class_for_opinion_id, features_for_opinion_id = my_polarity_classifier.classify_list_opinions(
            this_obj, list_ids_term_ids)
        for oid, c in list(class_for_opinion_id.items()):
Example #14
        synset_for_skey[fields[0]] = fields[1]
    fd.close()

    naf_obj = KafNafParser(sys.stdin)

    for term in naf_obj.get_terms():
        this_skey = None
        this_synset = None
        ref_skey = ref_synset = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'sense':
                this_skey = ext_ref.get_reference()
                ref_skey = ext_ref
            if ext_ref.get_reftype() == 'ilidef':
                this_synset = ext_ref.get_reference()
                ref_synset = ext_ref

        if this_synset == '':
            print(term.get_id(), file=sys.stderr)
            # WordNet sense keys use ss_type 3 for adjectives and 5 for
            # satellite adjectives; retry the lookup with the other type.
            if '%3:' in this_skey:
                this_skey = this_skey.replace('%3:', '%5:')
            elif '%5:' in this_skey:
                this_skey = this_skey.replace('%5:', '%3:')

            this_synset = synset_for_skey.get(this_skey)
            if this_synset is not None:
                ref_skey.set_reference(this_skey)
                ref_synset.set_reference('ili-30-%s-a' % this_synset)

    naf_obj.dump()
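The top of this example (cut off in the excerpt) fills synset_for_skey from a file whose lines begin with a sense key followed by a synset offset; WordNet's index.sense has exactly this shape, though the actual input file is an assumption. An illustrative line:

dog%1:05:00:: 02084071 1 42

The %3:/%5: swap then retries the lookup with the alternate adjective sense-key type whenever the term's existing ilidef reference is empty.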
Example #15
    for term in obj.get_terms():
        source_synset = None
        for ext_ref in term.get_external_references():
            if (ext_ref.get_resource() == args.input_res_label
                    and ext_ref.get_reftype() == 'synset'):
                source_synset = ext_ref.get_reference()
                break
        if source_synset is not None:
            fields = source_synset.split('-')
            this_synset = fields[1]
            short_pos = fields[2]
            if short_pos == 'a': this_pos = ADJ
            elif short_pos == 'n': this_pos = NOUN
            elif short_pos == 'r': this_pos = ADV
            elif short_pos == 'v': this_pos = VERB
            else: this_pos = None

            if this_pos is not None:
                target_synset = mapping[this_pos].get(this_synset)
                if target_synset is not None:
                    full_reference = 'eng%s-%s-%s' % (args.output_version,
                                                      target_synset, short_pos)
                    new_ref = CexternalReference()
                    new_ref.set_reference(full_reference)
                    new_ref.set_confidence('1.0')
                    new_ref.set_reftype('synset')
                    new_ref.set_resource(args.output_res_label)
                    term.add_external_reference(new_ref)

    obj.dump(args.output_file)
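Neither mapping nor the ADJ/NOUN/ADV/VERB constants are defined in this excerpt; mapping is presumably a dict keyed by POS, each value mapping synset offsets of the source WordNet version to those of the target version. A minimal sketch (the nltk import is one possible source of the constants; the offsets are illustrative only):

from nltk.corpus.reader.wordnet import ADJ, ADV, NOUN, VERB

mapping = {
    NOUN: {'02084071': '02086723'},  # hypothetical source -> target offset
    VERB: {}, ADJ: {}, ADV: {},
}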