コード例 #1
0
ファイル: xmldao.py プロジェクト: hoangducchinh/yawlib
 def parse_cf(self, cf_node, gloss):
     ''' Parse a <cf> (collocation word feature) node and add it to a gloss.

     :param cf_node: lxml element for the <cf> tag
     :param gloss: gloss object that receives the new item
     :return: the created gloss item object
     '''
     tag = cf_node.get('tag') if not self.memory_save else ''
     lemma = StringTool.strip(
         cf_node.get('lemma')) if not self.memory_save else ''
     pos = cf_node.get('pos')
     cat = cf_node.get('type')  # if cf_node.get('type') else 'cf'
     coll = cf_node.get('coll')
     rdf = cf_node.get('rdf')
     origid = cf_node.get('id')
     sep = cf_node.get('sep')
     # string() flattens XML mixed content into plain text
     text = StringTool.strip(cf_node.xpath("string()"))
     # BUGFIX: 'coll:' + coll raised TypeError when the coll attribute was
     # missing (None); build the coll-based item id only when coll exists.
     itemid = 'coll:' + coll if coll else ''
     cf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf, origid,
                                   sep, text, itemid)
     # Parse glob info if it's available
     for child_node in cf_node:
         if child_node.tag == 'glob':
             glob_tag = child_node.get('tag')
             glob_glob = child_node.get('glob')
             glob_lemma = child_node.get('lemma')
             glob_coll = child_node.get('coll')
             glob_id = child_node.get('id')
             #            def tag_item(self, item,   cat,  tag,      glob,      glemma,     gid,     coll,      origid, sid, sk, lemma):
             tag_obj = cf_obj.gloss.tag_item(cf_obj, 'cf', glob_tag,
                                             glob_glob, glob_lemma, glob_id,
                                             glob_coll, '', '', '', '')
             for grandchild in child_node:
                 if grandchild.tag == 'id':
                     self.tag_glossitem(grandchild, cf_obj, tag_obj)
     return cf_obj
コード例 #2
0
ファイル: xmldao.py プロジェクト: letuananh/yawlib
 def parse_cf(self, cf_node, gloss):
     ''' Read a <cf> word-feature node and register it on the gloss object.
     '''
     # In memory-save mode, tag and lemma are dropped to save space
     if self.memory_save:
         tag = ''
         lemma = ''
     else:
         tag = cf_node.get('tag')
         lemma = StringTool.strip(cf_node.get('lemma'))
     # string() flattens XML mixed content into the surface text
     text = StringTool.strip(cf_node.xpath("string()"))
     cf_obj = gloss.add_gloss_item(tag, lemma,
                                   cf_node.get('pos'),
                                   cf_node.get('type'),
                                   cf_node.get('coll'),
                                   cf_node.get('rdf'),
                                   cf_node.get('id'),
                                   cf_node.get('sep'),
                                   text)
     # Attach glob (group) information when present
     for glob_node in cf_node:
         if glob_node.tag != 'glob':
             continue
         # tag_item(item, cat, tag, glob, glemma, gid, coll, origid, sid, sk, lemma)
         tag_obj = cf_obj.gloss.tag_item(cf_obj, 'cf',
                                         glob_node.get('tag'),
                                         glob_node.get('glob'),
                                         glob_node.get('lemma'),
                                         glob_node.get('id'),
                                         glob_node.get('coll'),
                                         '', '', '', '')
         for id_node in glob_node:
             if id_node.tag == 'id':
                 self.tag_glossitem(id_node, cf_obj, tag_obj)
     return cf_obj
コード例 #3
0
ファイル: xmldao.py プロジェクト: letuananh/yawlib
 def parse_synset(self, element):
     ''' Build a Synset object from a <synset> XML element. '''
     if self.memory_save:
         synset = Synset(element.get('id'), '', '')
     else:
         synset = Synset(element.get('id'), element.get('ofs'),
                         element.get('pos'))
     for child in element:
         if child.tag == 'terms':
             # <term> children hold the lemmas
             for term_node in child:
                 if term_node.tag == 'term':
                     synset.add_term(StringTool.strip(term_node.text))
         elif child.tag == 'keys':
             # <sk> children hold the sense keys
             for sk_node in child:
                 if sk_node.tag == 'sk':
                     synset.add_sensekey(StringTool.strip(sk_node.text))
         elif child.tag == 'gloss' and child.get('desc') == 'orig' and not self.memory_save:
             if child[0].tag == 'orig':
                 synset.add_raw_gloss(GlossRaw.ORIG,
                                      StringTool.strip(child[0].text))
         elif child.tag == 'gloss' and child.get('desc') == 'text' and not self.memory_save:
             if child[0].tag == 'text':
                 synset.add_raw_gloss(GlossRaw.TEXT,
                                      StringTool.strip(child[0].text))
         elif child.tag == 'gloss' and child.get('desc') == 'wsd':
             # each <def>/<ex> becomes a gloss; its children are parsed too
             for sense_node in child:
                 if sense_node.tag in ('def', 'ex'):
                     gloss = synset.add_gloss(sense_node.get('id'),
                                              StringTool.strip(sense_node.tag))
                     self.parse_gloss(sense_node, gloss)
     return synset
コード例 #4
0
ファイル: models.py プロジェクト: hoangducchinh/yawlib
 def __init__(self,
              gloss,
              tag,
              lemma,
              pos,
              cat,
              coll,
              rdf,
              origid,
              sep=None,
              text=None,
              itemid=-1):
     ''' One gloss item (token) belonging to a gloss. '''
     self.itemid = itemid
     self.gloss = gloss
     self.order = -1  # position within the gloss; set later by the owner
     # Every textual field is normalised through StringTool.strip
     # (which maps None to '' -- see test_string_tool)
     for attr, raw in (('tag', tag), ('lemma', lemma), ('pos', pos),
                       ('cat', cat), ('coll', coll), ('rdf', rdf),
                       ('sep', sep), ('text', text), ('origid', origid)):
         setattr(self, attr, StringTool.strip(raw))
コード例 #5
0
ファイル: test_leutile.py プロジェクト: letuananh/chirptext
 def test_string_tool(self):
     ''' StringTool.strip / to_str / detokenize behaviour. '''
     # strip() and to_str() normalise None and whitespace to ''
     self.assertEqual(StringTool.strip(None), '')
     self.assertEqual(StringTool.strip(' '), '')
     self.assertEqual(StringTool.to_str(None), '')
     # detokenize() joins tokens and fixes spacing around punctuation
     cases = [
         (["I", "'ll", "go", "home", "."], "I'll go home."),
         (["This", "(", "thing", ")", "is", "a", "comment", "!"],
          "This (thing) is a comment!"),
         ("He said `` why ? '' .".split(), "He said “why?”."),
         ("Where are you ?".split(), "Where are you?"),
         ("Note : It works .".split(), "Note: It works."),
         ("( A ) ; ".split(), "(A);"),
         ("( A ) ; B ".split(), "(A); B"),
     ]
     for words, expected in cases:
         self.assertEqual(StringTool.detokenize(words), expected)
コード例 #6
0
def fix_token_text(tk):
    ''' Clean up a raw token string: normalise separator characters,
    re-attach contractions that were split around apostrophes, and
    convert double-backtick/quote pairs to curly quotes. '''
    tk = StringTool.strip(tk).replace('\t', ' ').replace('|', ' ').replace('_', ' ')
    # Known contraction / apostrophe fixes, applied in order
    replacements = (
        (" ' nuff", " 'nuff"),
        ("Ol ' ", "Ol' "),
        ("O ' ", "O' "),
        ("ma ' am", "ma'am"),
        ("Ma ' am", "Ma'am"),
        ("probl ' y", "probl'y"),
        ("ai n't", "ain't"),
        ("holdin '", "holdin'"),
        ("hangin '", "hangin'"),
        ("dryin ' ", "dryin' "),
        ("Y ' all", "Y'all"),
        ("y ' know", "y'know"),
        ("c ' n", "c'n"),
        ("l ' identite", "l'identite"),
        ("Rue de L ' Arcade", "Rue de l'Arcade"),
        ("p ' lite", "p'lite"),
        ("rev ' rend", "rev'rend"),
        ("coup d ' etat", "coup d'etat"),
        ("t ' gethuh", "t'gethuh"),
        # quote and punctuation spacing fixes
        ('``', "“"),
        ("''", "”"),
        (" ,", ","),
        ("( ", "("),
        (" )", ")"),
        (" ”", "”"),
        (" 's", "'s"),
        ("o '", "o'"),
        ("s ' ", "s' "),
        (" , ", ", "),
        # (" ' ", "' "),
    )
    for old, new in replacements:
        tk = tk.replace(old, new)
    return tk
コード例 #7
0
ファイル: semcorxml.py プロジェクト: letuananh/pysemcor
def fix_token_text(tk):
    ''' Normalise a semcor token: strip it, replace separator characters
    with spaces, repair split contractions, and turn TeX-style quote
    pairs into curly quotes. '''
    tk = StringTool.strip(tk)
    # tabs, pipes and underscores all act as separators in the raw data
    for sep in ('\t', '|', '_'):
        tk = tk.replace(sep, ' ')
    pairs = [
        # contraction / apostrophe repairs (order matters)
        (" ' nuff", " 'nuff"), ("Ol ' ", "Ol' "), ("O ' ", "O' "),
        ("ma ' am", "ma'am"), ("Ma ' am", "Ma'am"),
        ("probl ' y", "probl'y"), ("ai n't", "ain't"),
        ("holdin '", "holdin'"), ("hangin '", "hangin'"),
        ("dryin ' ", "dryin' "), ("Y ' all", "Y'all"),
        ("y ' know", "y'know"), ("c ' n", "c'n"),
        ("l ' identite", "l'identite"),
        ("Rue de L ' Arcade", "Rue de l'Arcade"),
        ("p ' lite", "p'lite"), ("rev ' rend", "rev'rend"),
        ("coup d ' etat", "coup d'etat"), ("t ' gethuh", "t'gethuh"),
        # quotes and punctuation spacing
        ('``', "“"), ("''", "”"),
        (" ,", ","), ("( ", "("), (" )", ")"), (" ”", "”"),
        (" 's", "'s"), ("o '", "o'"), ("s ' ", "s' "), (" , ", ", "),
        # (" ' ", "' "),
    ]
    for src, dst in pairs:
        tk = tk.replace(src, dst)
    return tk
コード例 #8
0
 def iterparse(self, path):
     ''' Stream-parse a semcor XML file, yielding one dict per sentence.

     Each yielded dict has the keys: para, filename, snum, sid, tokens.
     '''
     def _collect_tokens(sent_elem):
         # Build TokenInfo objects for the <wf> and <punc> children
         collected = []
         for token in sent_elem:
             data = dict(token.attrib)
             data['tag'] = token.tag
             text = fix_token_text(token.text)
             if token.tag == 'wf':
                 # derive a sensekey (lemma%lexsn) when both parts exist
                 lemma = StringTool.strip(token.get('lemma'))
                 lexsn = StringTool.strip(token.get('lexsn'))
                 sk = lemma + '%' + lexsn if lemma and lexsn else ''
                 sk = StringTool.strip(
                     sk.replace('\t', ' ').replace('|', ' '))
                 if sk:
                     data['sk'] = sk
                 collected.append(TokenInfo(text, **data))
             elif token.tag == 'punc':
                 collected.append(TokenInfo(text, **data))
         return collected

     filename = 'n/a'
     para = 'n/a'
     for event, element in etree.iterparse(self.files.abspath(path),
                                           events=('start', 'end')):
         if event == 'start':
             # remember the current context / paragraph while descending
             if element.tag == 'context':
                 filename = element.get('filename')
             elif element.tag == 'p':
                 para = element.get('pnum')
         if event == 'end':
             if element.tag == 's':
                 # found a complete sentence
                 snum = element.get('snum')
                 tokens = _collect_tokens(element)
                 element.clear()
                 yield {
                     'para': para,
                     'filename': filename,
                     'snum': snum,
                     'sid': "{}-{}-{}".format(filename, para, snum),
                     'tokens': tokens,
                 }
             elif element.tag == 'p':
                 para = 'n/a'
                 element.clear()
             elif element.tag == 'context':
                 filename = 'n/a'
                 element.clear()
コード例 #9
0
ファイル: models.py プロジェクト: letuananh/yawlib
 def __init__(self, gloss, tag, lemma, pos, cat, coll, rdf, origid, sep=None, text=None, itemid=-1):
     ''' One token (gloss item) inside a gloss. '''
     # identity & linkage
     self.itemid = itemid
     self.gloss = gloss
     self.order = -1  # filled in later by the owning gloss
     # all text fields are normalised via StringTool.strip
     strip = StringTool.strip
     self.tag = strip(tag)
     self.lemma = strip(lemma)
     self.pos = strip(pos)
     self.cat = strip(cat)
     self.coll = strip(coll)
     self.rdf = strip(rdf)
     self.sep = strip(sep)
     self.text = strip(text)
     self.origid = strip(origid)
コード例 #10
0
ファイル: xmldao.py プロジェクト: letuananh/yawlib
    def tag_glossitem(self, id_node, glossitem, tag_obj):
        ''' Parse an <id> element and tag a gloss item with its sense info.
        '''
        get = id_node.get
        sk = StringTool.strip(get('sk'))
        origid = StringTool.strip(get('id'))
        coll = StringTool.strip(get('coll'))
        lemma = StringTool.strip(get('lemma'))

        if tag_obj is None:
            # no existing tag: create a fresh one on the owning gloss
            tag_obj = glossitem.gloss.tag_item(glossitem, '', '', '', '', '',
                                               coll, origid, '', sk, lemma)
        else:
            # update the existing tag object in place
            tag_obj.sk = sk
            tag_obj.origid = origid
            tag_obj.coll = coll
            tag_obj.lemma = lemma

        # WEIRD STUFF: lemma="purposefully ignored" sk="purposefully_ignored%0:00:00::"
        if lemma == 'purposefully ignored' and sk == "purposefully_ignored%0:00:00::":
            tag_obj.cat = 'PURPOSEFULLY_IGNORED'
コード例 #11
0
ファイル: semcorxml.py プロジェクト: letuananh/pysemcor
 def iterparse(self, path):
     ''' Lazily parse a semcor XML file, yielding a dict per sentence. '''
     context_name = 'n/a'
     para_num = 'n/a'
     parser = etree.iterparse(self.files.abspath(path),
                              events=('start', 'end'))
     for event, elem in parser:
         if event == 'start':
             # track which file/paragraph we are inside
             if elem.tag == 'context':
                 context_name = elem.get('filename')
             elif elem.tag == 'p':
                 para_num = elem.get('pnum')
         if event == 'end':
             if elem.tag == 's':
                 # a complete sentence: collect its tokens
                 sent_num = elem.get('snum')
                 token_list = []
                 for tk in elem:
                     info = dict(tk.attrib)
                     info['tag'] = tk.tag
                     surface = fix_token_text(tk.text)
                     if tk.tag == 'wf':
                         # sensekey = lemma%lexsn (only when both present)
                         lemma = StringTool.strip(tk.get('lemma'))
                         lexsn = StringTool.strip(tk.get('lexsn'))
                         sk = lemma + '%' + lexsn if lemma and lexsn else ''
                         sk = StringTool.strip(
                             sk.replace('\t', ' ').replace('|', ' '))
                         if sk:
                             info['sk'] = sk
                         token_list.append(TokenInfo(surface, **info))
                     elif tk.tag == 'punc':
                         token_list.append(TokenInfo(surface, **info))
                 elem.clear()
                 yield {'para': para_num,
                        'filename': context_name,
                        'snum': sent_num,
                        'sid': "{}-{}-{}".format(context_name, para_num,
                                                 sent_num),
                        'tokens': token_list}
             elif elem.tag == 'p':
                 para_num = 'n/a'
                 elem.clear()
             elif elem.tag == 'context':
                 context_name = 'n/a'
                 elem.clear()
コード例 #12
0
ファイル: xmldao.py プロジェクト: hoangducchinh/yawlib
    def tag_glossitem(self, id_node, glossitem, tag_obj):
        ''' Read an <id> element and attach its sense data to a gloss item.
        '''
        sk = StringTool.strip(id_node.get('sk'))
        origid = StringTool.strip(id_node.get('id'))
        coll = StringTool.strip(id_node.get('coll'))
        lemma = StringTool.strip(id_node.get('lemma'))

        if tag_obj is not None:
            # reuse the tag created earlier and link it back to the item
            tag_obj.itemid = glossitem.origid
            tag_obj.sk = sk
            tag_obj.origid = origid
            tag_obj.coll = coll
            tag_obj.lemma = lemma
        else:
            tag_obj = glossitem.gloss.tag_item(glossitem, '', '', '', '', '',
                                               coll, origid, '', sk, lemma)

        # WEIRD STUFF: lemma="purposefully ignored" sk="purposefully_ignored%0:00:00::"
        if lemma == 'purposefully ignored' and sk == "purposefully_ignored%0:00:00::":
            tag_obj.cat = 'PURPOSEFULLY_IGNORED'
コード例 #13
0
ファイル: xmldao.py プロジェクト: hoangducchinh/yawlib
 def parse_synset(self, element):
     ''' Convert a <synset> XML element into a GlossedSynset. '''
     synset = GlossedSynset(element.get('id'))
     for child in element:
         if child.tag == 'terms':
             # term is a lemma
             for node in child:
                 if node.tag == 'term':
                     synset.add_lemma(StringTool.strip(node.text))
         elif child.tag == 'keys':
             # <sk> children are sense keys
             for node in child:
                 if node.tag == 'sk':
                     synset.add_key(StringTool.strip(node.text))
         elif child.tag == 'gloss':
             desc = child.get('desc')
             if desc == 'orig' and not self.memory_save:
                 if child[0].tag == 'orig':
                     synset.add_raw_gloss(GlossRaw.ORIG,
                                          StringTool.strip(child[0].text))
             elif desc == 'text' and not self.memory_save:
                 if child[0].tag == 'text':
                     synset.add_raw_gloss(GlossRaw.TEXT,
                                          StringTool.strip(child[0].text))
             elif desc == 'wsd':
                 # [2016-02-12 LTA] aux should be parsed as well
                 for node in child:
                     if node.tag in ('def', 'ex', 'aux'):
                         gloss = synset.add_gloss(node.get('id'),
                                                  StringTool.strip(node.tag))
                         self.parse_gloss(node, gloss)
     return synset
コード例 #14
0
ファイル: xmldao.py プロジェクト: letuananh/yawlib
 def parse_wf(self, wf_node, gloss):
     ''' Parse a <wf> (word feature) node and add it to the gloss object.
     '''
     if self.memory_save:
         tag, lemma = '', ''
     else:
         tag = wf_node.get('tag')
         lemma = wf_node.get('lemma')
     pos = wf_node.get('pos')
     cat = wf_node.get('type') # if wf_node.get('type') else 'wf'
     rdf = wf_node.get('rdf')
     origid = wf_node.get('id')
     sep = wf_node.get('sep')
     # XML mixed content, don't use text attr here
     text = StringTool.strip(wf_node.xpath("string()"))
     # <wf> nodes carry no collocation, so coll is always None
     wf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, None, rdf,
                                   origid, sep, text)
     # Then parse id tag if available
     for id_node in wf_node:
         if id_node.tag == 'id':
             self.tag_glossitem(id_node, wf_obj, None)
     return wf_obj
コード例 #15
0
ファイル: xmldao.py プロジェクト: hoangducchinh/yawlib
 def parse_wf(self, wf_node, gloss):
     ''' Turn a <wf> node into a gloss item on the given gloss object.
     '''
     memory_save = self.memory_save
     tag = '' if memory_save else wf_node.get('tag')
     lemma = '' if memory_save else wf_node.get('lemma')
     # surface text via string(): <wf> can hold mixed XML content,
     # so the plain text attribute is not enough
     surface = StringTool.strip(wf_node.xpath("string()"))
     origid = wf_node.get('id')
     wf_obj = gloss.add_gloss_item(tag, lemma,
                                   wf_node.get('pos'),
                                   wf_node.get('type'),
                                   None,  # wf_node.get('coll')
                                   wf_node.get('rdf'),
                                   origid,
                                   wf_node.get('sep'),
                                   surface,
                                   origid)
     # parse <id> children (sense tags) if present
     for child in wf_node:
         if child.tag == 'id':
             self.tag_glossitem(child, wf_obj, None)
     return wf_obj
コード例 #16
0
ファイル: ttl.py プロジェクト: letuananh/intsem.fx
def semeval_to_ttl(cli, args):
    """Convert a Semeval XML file (plus optional key file) into TTL format.

    :param cli: CLI application object (passed by the command framework)
    :param args: parsed arguments; uses args.input (Semeval XML),
        args.keys (optional TSV key file), args.output and args.ttl_format
    """
    print("Semeval file: {}".format(args.input))
    print("Semeval key file: {}".format(args.keys))
    print("TTL file: {}".format(args.output))
    print("TTL format: {}".format(args.ttl_format))
    # Read document data
    tree = etree.iterparse(args.input)
    doc = ttl.Document()
    sent_id_map = {}  # original sentence ident -> ttl sentence object
    for event, element in tree:
        if event == 'end' and element.tag == 'sentence':
            # do some processing here
            sent_ident = element.get('id')
            tokens = []
            tids = []
            # docID & sentID
            # NOTE(review): the slices assume fixed-width sentence idents
            # (doc number at chars 1-3, sentence number at 6-8) -- confirm
            # against the actual Semeval data format
            docID = sent_ident[1:4]
            sent_id = sent_ident[6:9]
            wfs = []
            # collect word-form info for every <wf> child
            for wf in element:
                wident, lemma, pos, text = wf.get('id'), wf.get('lemma'), wf.get('pos'), wf.text
                wfs.append((wident, lemma, pos, text))
                wid = wident[11:]
                tokens.append(text)
                tids.append('{}/{}'.format(wid, lemma))
            sent_text = StringTool.detokenize(tokens)
            print("Doc: {} - Sent: {} - {}".format(docID, sent_id, sent_text))
            sent_obj = doc.new_sent(text=sent_text)
            sent_obj.new_tag(label=sent_ident, tagtype='origid')
            sent_id_map[sent_ident] = sent_obj
            sent_obj.tokens = tokens  # add original token in
            # copy POS / lemma information onto the TTL tokens
            for (sent_token, (wident, lemma, pos, text)) in zip(sent_obj, wfs):
                sent_token.new_tag(label=wident, tagtype='origid')
                if pos:
                    sent_token.pos = pos
                if lemma:
                    sent_token.lemma = lemma
            element.clear()  # release the parsed XML element
    # Read tag data
    if args.keys:
        keys = chio.read_tsv(args.keys)
        wn = get_wn()
        not_found = 0
        mwe_count = 0
        # TODO Add option to split a semeval file into several documents
        for line in keys:
            # each key line: from-token, to-token, bbss, then sense candidates
            from_token = line[0]
            from_token_idx = int(from_token[-3:]) - 1  # 1-based -> 0-based
            sent_id = from_token[:9]
            to_token = line[1]
            to_token_idx = int(to_token[-3:]) - 1
            if from_token != to_token:
                # span covers more than one token: a multi-word expression
                mwe_count += 1
                print("MWE: {}".format(line))
            bbss = line[2]
            # keep only WordNet sense keys (entries prefixed with 'wn:')
            wn_keys = [x[3:] for x in line[3:] if x.startswith('wn:')]
            found_ss = None
            # use the first candidate key that resolves to a synset
            for wn_key in wn_keys:
                ss = wn.get_by_key(wn_key)
                if ss is not None:
                    # print("{} => {}".format(" ".join(wn_keys), ss))
                    sent_id_map[sent_id].new_concept(tag=str(ss.ID), tokens=range(from_token_idx, to_token_idx + 1))
                    found_ss = ss
                    break
            if found_ss is None:
                getLogger().warning("Not found: {}".format(line))
                not_found += 1
        print("Total: {} - Not found: {} - MWE: {}".format(len(keys), not_found, mwe_count))
    ttl.write(args.output, doc, mode=args.ttl_format)
    print("Output file: {}".format(args.output))
コード例 #17
0
def join(token, *items):
    ''' Join items with the given separator token, converting each item
    to a string via StringTool.to_str first. '''
    return token.join(StringTool.to_str(piece) for piece in items)
コード例 #18
0
ファイル: models.py プロジェクト: hoangducchinh/yawlib
 def __init__(self, synset, cat, gloss):
     ''' Store a synset reference plus stripped category and gloss text. '''
     self.synset = synset
     # both text fields are normalised through StringTool.strip
     self.cat, self.gloss = StringTool.strip(cat), StringTool.strip(gloss)
コード例 #19
0
ファイル: models.py プロジェクト: letuananh/yawlib
 def __init__(self, synset, cat, gloss):
     ''' Keep a reference to the synset and normalised cat/gloss text. '''
     self.synset = synset
     for attr, value in (('cat', cat), ('gloss', gloss)):
         setattr(self, attr, StringTool.strip(value))