def test_string_tool(self):
    """Exercise StringTool edge cases: strip/to_str on None, and detokenization."""
    # None and whitespace-only inputs normalize to the empty string.
    self.assertEqual(StringTool.strip(None), '')
    self.assertEqual(StringTool.strip(' '), '')
    self.assertEqual(StringTool.to_str(None), '')
    # detokenize: each entry is (token list, expected surface string).
    detok_cases = [
        (["I", "'ll", "go", "home", "."], "I'll go home."),
        (["This", "(", "thing", ")", "is", "a", "comment", "!"],
         "This (thing) is a comment!"),
        ("He said `` why ? '' .".split(), "He said “why?”."),
        ("Where are you ?".split(), "Where are you?"),
        ("Note : It works .".split(), "Note: It works."),
        ("( A ) ; ".split(), "(A);"),
        ("( A ) ; B ".split(), "(A); B"),
    ]
    for token_list, expected in detok_cases:
        self.assertEqual(StringTool.detokenize(token_list), expected)
def semeval_to_ttl(cli, args):
    """Convert a Semeval XML corpus file (plus optional sense-key file) into a TTL document.

    Reads sentences from ``args.input`` with ``etree.iterparse``, rebuilds each
    sentence's surface text via ``StringTool.detokenize``, and writes the result
    to ``args.output`` in ``args.ttl_format``.  When ``args.keys`` is given, each
    key line is resolved against WordNet and attached as a concept tag.

    :param cli: CLI application object (unused here; part of the command signature)
    :param args: parsed arguments with ``input``, ``keys``, ``output``, ``ttl_format``
    """
    print("Semeval file: {}".format(args.input))
    print("Semeval key file: {}".format(args.keys))
    print("TTL file: {}".format(args.output))
    print("TTL format: {}".format(args.ttl_format))
    # Read document data incrementally; we only react to closed <sentence> elements.
    tree = etree.iterparse(args.input)
    doc = ttl.Document()
    sent_id_map = {}  # original sentence ident -> ttl sentence object
    for event, element in tree:
        if event == 'end' and element.tag == 'sentence':
            sent_ident = element.get('id')
            tokens = []
            # Semeval idents look like dNNN.sNNN: chars 1-3 are the doc number,
            # chars 6-8 the sentence number — TODO confirm against the corpus format.
            docID = sent_ident[1:4]
            sent_id = sent_ident[6:9]
            wfs = []
            for wf in element:
                wident, lemma, pos, text = wf.get('id'), wf.get('lemma'), wf.get('pos'), wf.text
                wfs.append((wident, lemma, pos, text))
                tokens.append(text)
            sent_text = StringTool.detokenize(tokens)
            print("Doc: {} - Sent: {} - {}".format(docID, sent_id, sent_text))
            sent_obj = doc.new_sent(text=sent_text)
            sent_obj.new_tag(label=sent_ident, tagtype='origid')
            sent_id_map[sent_ident] = sent_obj
            sent_obj.tokens = tokens
            # Copy original token metadata (ident, POS, lemma) onto the TTL tokens.
            # NOTE(review): assumes iterating sent_obj yields tokens in input order.
            for (sent_token, (wident, lemma, pos, text)) in zip(sent_obj, wfs):
                sent_token.new_tag(label=wident, tagtype='origid')
                if pos:
                    sent_token.pos = pos
                if lemma:
                    sent_token.lemma = lemma
            # Free the parsed element to keep iterparse memory bounded.
            element.clear()
    # Read tag (sense key) data, if a key file was provided.
    if args.keys:
        keys = chio.read_tsv(args.keys)
        wn = get_wn()
        not_found = 0
        mwe_count = 0
        # TODO Add option to split a semeval file into several documents
        for line in keys:
            # Token idents end in a 3-digit, 1-based token index.
            from_token = line[0]
            from_token_idx = int(from_token[-3:]) - 1
            sent_id = from_token[:9]
            to_token = line[1]
            to_token_idx = int(to_token[-3:]) - 1
            if from_token != to_token:
                # Span covers more than one token: a multi-word expression.
                mwe_count += 1
                print("MWE: {}".format(line))
            # Columns 3+ may carry wn:<sense-key> entries; strip the prefix.
            wn_keys = [x[3:] for x in line[3:] if x.startswith('wn:')]
            found_ss = None
            for wn_key in wn_keys:
                ss = wn.get_by_key(wn_key)
                if ss is not None:
                    # First resolvable key wins; tag the token span with the synset ID.
                    # NOTE(review): raises KeyError if sent_id never appeared in the
                    # XML input — verify key file always matches the corpus.
                    sent_id_map[sent_id].new_concept(tag=str(ss.ID), tokens=range(from_token_idx, to_token_idx + 1))
                    found_ss = ss
                    break
            if found_ss is None:
                getLogger().warning("Not found: {}".format(line))
                not_found += 1
        print("Total: {} - Not found: {} - MWE: {}".format(len(keys), not_found, mwe_count))
    ttl.write(args.output, doc, mode=args.ttl_format)
    print("Output file: {}".format(args.output))