Example #1
0
 def test_string_tool(self):
     """Exercise StringTool.strip / to_str / detokenize on edge cases."""
     # strip() and to_str() normalise None (and all-whitespace) to ''
     self.assertEqual(StringTool.strip(None), '')
     self.assertEqual(StringTool.strip(' '), '')
     self.assertEqual(StringTool.to_str(None), '')
     # detokenize(): glue a token list back into a natural sentence,
     # handling contractions, brackets, quotes and punctuation spacing
     cases = [
         (["I", "'ll", "go", "home", "."], "I'll go home."),
         (["This", "(", "thing", ")", "is", "a", "comment", "!"], "This (thing) is a comment!"),
         ("He said `` why ? '' .".split(), "He said “why?”."),
         ("Where are you ?".split(), "Where are you?"),
         ("Note : It works .".split(), "Note: It works."),
         ("( A ) ; ".split(), "(A);"),
         ("( A ) ; B ".split(), "(A); B"),
     ]
     for tokens, expected in cases:
         self.assertEqual(StringTool.detokenize(tokens), expected)
Example #2
0
def semeval_to_ttl(cli, args):
    """Convert a Semeval XML file (plus an optional sense-key TSV) into a TTL document.

    Reads sentences/tokens from ``args.input`` (Semeval XML), optionally attaches
    WordNet concepts from ``args.keys``, and writes the result to ``args.output``
    in ``args.ttl_format``.

    :param cli: CLI application object (standard handler signature; not used directly here)
    :param args: parsed CLI arguments with .input, .keys, .output, .ttl_format
    """
    print("Semeval file: {}".format(args.input))
    print("Semeval key file: {}".format(args.keys))
    print("TTL file: {}".format(args.output))
    print("TTL format: {}".format(args.ttl_format))
    # Read document data incrementally (iterparse avoids loading the whole XML tree)
    tree = etree.iterparse(args.input)
    doc = ttl.Document()
    # maps the original sentence ID string -> the ttl sentence object, for key lookup below
    sent_id_map = {}
    for event, element in tree:
        if event == 'end' and element.tag == 'sentence':
            # a full <sentence> element is available once its end tag is seen
            sent_ident = element.get('id')
            tokens = []
            tids = []
            # docID & sentID: fixed-width slices of the sentence ID
            # (assumes IDs like "dXXX.sYYY" with 3-digit doc/sent numbers — TODO confirm
            # against the Semeval file format)
            docID = sent_ident[1:4]
            sent_id = sent_ident[6:9]
            wfs = []
            for wf in element:
                # each child <wf> carries id/lemma/pos attributes and the surface text
                wident, lemma, pos, text = wf.get('id'), wf.get('lemma'), wf.get('pos'), wf.text
                wfs.append((wident, lemma, pos, text))
                # drop the "dXXX.sYYY.t" prefix — assumes an 11-char fixed prefix; TODO confirm
                wid = wident[11:]
                tokens.append(text)
                tids.append('{}/{}'.format(wid, lemma))
            # rebuild a readable sentence string from the raw tokens
            sent_text = StringTool.detokenize(tokens)
            print("Doc: {} - Sent: {} - {}".format(docID, sent_id, sent_text))
            sent_obj = doc.new_sent(text=sent_text)
            # keep the original Semeval ID on the sentence as an 'origid' tag
            sent_obj.new_tag(label=sent_ident, tagtype='origid')
            sent_id_map[sent_ident] = sent_obj
            sent_obj.tokens = tokens  # add original token in
            # copy per-token metadata (original id, POS, lemma) onto the ttl tokens
            for (sent_token, (wident, lemma, pos, text)) in zip(sent_obj, wfs):
                sent_token.new_tag(label=wident, tagtype='origid')
                if pos:
                    sent_token.pos = pos
                if lemma:
                    sent_token.lemma = lemma
            # free the processed element to keep iterparse memory bounded
            element.clear()
    # Read tag data (gold sense keys), if a key file was supplied
    if args.keys:
        keys = chio.read_tsv(args.keys)
        wn = get_wn()
        not_found = 0  # keys whose sense could not be resolved in WordNet
        mwe_count = 0  # multi-word expressions (key spanning more than one token)
        # TODO Add option to split a semeval file into several documents
        for line in keys:
            # line layout: [from_token_id, to_token_id, bbss, sense_key...]
            from_token = line[0]
            # token ids end in a 1-based 3-digit token number — convert to 0-based index
            # (assumes fixed-width ids; TODO confirm)
            from_token_idx = int(from_token[-3:]) - 1
            # first 9 chars of the token id are the sentence id ("dXXX.sYYY" — presumably)
            sent_id = from_token[:9]
            to_token = line[1]
            to_token_idx = int(to_token[-3:]) - 1
            if from_token != to_token:
                # the annotation spans multiple tokens -> multi-word expression
                mwe_count += 1
                print("MWE: {}".format(line))
            bbss = line[2]
            # keep only WordNet sense keys (entries prefixed "wn:"), with the prefix stripped
            wn_keys = [x[3:] for x in line[3:] if x.startswith('wn:')]
            found_ss = None
            for wn_key in wn_keys:
                ss = wn.get_by_key(wn_key)
                if ss is not None:
                    # first resolvable key wins; attach the synset as a concept over the token span
                    # print("{} => {}".format(" ".join(wn_keys), ss))
                    sent_id_map[sent_id].new_concept(tag=str(ss.ID), tokens=range(from_token_idx, to_token_idx + 1))
                    found_ss = ss
                    break
            if found_ss is None:
                getLogger().warning("Not found: {}".format(line))
                not_found += 1
        print("Total: {} - Not found: {} - MWE: {}".format(len(keys), not_found, mwe_count))
    ttl.write(args.output, doc, mode=args.ttl_format)
    print("Output file: {}".format(args.output))