def test_string_tool(self):
    """Exercise StringTool edge cases: strip/to_str on None, and detokenization."""
    # None and whitespace-only inputs normalize to the empty string.
    self.assertEqual(StringTool.strip(None), '')
    self.assertEqual(StringTool.strip(' '), '')
    self.assertEqual(StringTool.to_str(None), '')
    # detokenize: each entry is (token list, expected surface string).
    detok_cases = [
        (["I", "'ll", "go", "home", "."], "I'll go home."),
        (["This", "(", "thing", ")", "is", "a", "comment", "!"],
         "This (thing) is a comment!"),
        ("He said `` why ? '' .".split(), "He said “why?”."),
        ("Where are you ?".split(), "Where are you?"),
        ("Note : It works .".split(), "Note: It works."),
        ("( A ) ; ".split(), "(A);"),
        ("( A ) ; B ".split(), "(A); B"),
    ]
    for token_list, expected in detok_cases:
        self.assertEqual(StringTool.detokenize(token_list), expected)
def semeval_to_ttl(cli, args):
    """Convert a Semeval XML corpus file (plus optional sense-key file) into a TTL document.

    Reads sentences from ``args.input`` with ``etree.iterparse``, rebuilds each
    sentence's surface text via ``StringTool.detokenize``, and writes the result
    to ``args.output`` in ``args.ttl_format``.  When ``args.keys`` is given, each
    key line is resolved against WordNet and attached as a concept tag.

    :param cli: CLI application object (unused here; part of the command signature)
    :param args: parsed arguments with ``input``, ``keys``, ``output``, ``ttl_format``
    """
    print("Semeval file: {}".format(args.input))
    print("Semeval key file: {}".format(args.keys))
    print("TTL file: {}".format(args.output))
    print("TTL format: {}".format(args.ttl_format))
    # Read document data incrementally; we only react to closed <sentence> elements.
    tree = etree.iterparse(args.input)
    doc = ttl.Document()
    sent_id_map = {}  # original sentence ident -> ttl sentence object
    for event, element in tree:
        if event == 'end' and element.tag == 'sentence':
            sent_ident = element.get('id')
            tokens = []
            # Semeval idents look like dNNN.sNNN: chars 1-3 are the doc number,
            # chars 6-8 the sentence number — TODO confirm against the corpus format.
            docID = sent_ident[1:4]
            sent_id = sent_ident[6:9]
            wfs = []
            for wf in element:
                wident, lemma, pos, text = wf.get('id'), wf.get('lemma'), wf.get('pos'), wf.text
                wfs.append((wident, lemma, pos, text))
                tokens.append(text)
            sent_text = StringTool.detokenize(tokens)
            print("Doc: {} - Sent: {} - {}".format(docID, sent_id, sent_text))
            sent_obj = doc.new_sent(text=sent_text)
            sent_obj.new_tag(label=sent_ident, tagtype='origid')
            sent_id_map[sent_ident] = sent_obj
            sent_obj.tokens = tokens
            # Copy original token metadata (ident, POS, lemma) onto the TTL tokens.
            # NOTE(review): assumes iterating sent_obj yields tokens in input order.
            for (sent_token, (wident, lemma, pos, text)) in zip(sent_obj, wfs):
                sent_token.new_tag(label=wident, tagtype='origid')
                if pos:
                    sent_token.pos = pos
                if lemma:
                    sent_token.lemma = lemma
            # Free the parsed element to keep iterparse memory bounded.
            element.clear()
    # Read tag (sense key) data, if a key file was provided.
    if args.keys:
        keys = chio.read_tsv(args.keys)
        wn = get_wn()
        not_found = 0
        mwe_count = 0
        # TODO Add option to split a semeval file into several documents
        for line in keys:
            # Token idents end in a 3-digit, 1-based token index.
            from_token = line[0]
            from_token_idx = int(from_token[-3:]) - 1
            sent_id = from_token[:9]
            to_token = line[1]
            to_token_idx = int(to_token[-3:]) - 1
            if from_token != to_token:
                # Span covers more than one token: a multi-word expression.
                mwe_count += 1
                print("MWE: {}".format(line))
            # Columns 3+ may carry wn:<sense-key> entries; strip the prefix.
            wn_keys = [x[3:] for x in line[3:] if x.startswith('wn:')]
            found_ss = None
            for wn_key in wn_keys:
                ss = wn.get_by_key(wn_key)
                if ss is not None:
                    # First resolvable key wins; tag the token span with the synset ID.
                    # NOTE(review): raises KeyError if sent_id never appeared in the
                    # XML input — verify key file always matches the corpus.
                    sent_id_map[sent_id].new_concept(tag=str(ss.ID), tokens=range(from_token_idx, to_token_idx + 1))
                    found_ss = ss
                    break
            if found_ss is None:
                getLogger().warning("Not found: {}".format(line))
                not_found += 1
        print("Total: {} - Not found: {} - MWE: {}".format(len(keys), not_found, mwe_count))
    ttl.write(args.output, doc, mode=args.ttl_format)
    print("Output file: {}".format(args.output))