Code example #1
File: xmldao.py  Project: hoangducchinh/yawlib
 def parse_cf(self, cf_node, gloss):
     ''' Parse a word feature node and then add to gloss object
     '''
     tag = cf_node.get('tag') if not self.memory_save else ''
     lemma = StringTool.strip(
         cf_node.get('lemma')) if not self.memory_save else ''
     pos = cf_node.get('pos')
     cat = cf_node.get('type')  # if cf_node.get('type') else 'cf'
     coll = cf_node.get('coll')
     rdf = cf_node.get('rdf')
     origid = cf_node.get('id')
     sep = cf_node.get('sep')
     text = StringTool.strip(cf_node.xpath("string()"))
     cf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf, origid,
                                   sep, text, 'coll:' + coll)
     # Parse glob info if it's available
     for child_node in cf_node:
         if child_node.tag == 'glob':
             glob_tag = child_node.get('tag')
             glob_glob = child_node.get('glob')
             glob_lemma = child_node.get('lemma')
             glob_coll = child_node.get('coll')
             glob_id = child_node.get('id')
             #            def tag_item(self, item,   cat,  tag,      glob,      glemma,     gid,     coll,      origid, sid, sk, lemma):
             tag_obj = cf_obj.gloss.tag_item(cf_obj, 'cf', glob_tag,
                                             glob_glob, glob_lemma, glob_id,
                                             glob_coll, '', '', '', '')
             for grandchild in child_node:
                 if grandchild.tag == 'id':
                     self.tag_glossitem(grandchild, cf_obj, tag_obj)
     return cf_obj
Code example #2
File: xmldao.py  Project: letuananh/yawlib
 def parse_cf(self, cf_node, gloss):
     ''' Parse a word feature node and then add to gloss object
     '''
     tag = cf_node.get('tag') if not self.memory_save else ''
     lemma = StringTool.strip(cf_node.get('lemma')) if not self.memory_save else ''
     pos = cf_node.get('pos')
     cat = cf_node.get('type') # if cf_node.get('type') else 'cf'
     coll = cf_node.get('coll')
     rdf = cf_node.get('rdf')
     origid = cf_node.get('id')
     sep = cf_node.get('sep')
     text = StringTool.strip(cf_node.xpath("string()"))
     cf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf, origid, sep, text)
     # Parse glob info if it's available
     for child_node in cf_node:
         if child_node.tag == 'glob':
             glob_tag = child_node.get('tag')
             glob_glob = child_node.get('glob')
             glob_lemma = child_node.get('lemma')
             glob_coll = child_node.get('coll')
             glob_id = child_node.get('id')
             #            def tag_item(self, item,   cat,  tag,      glob,      glemma,     gid,     coll,      origid, sid, sk, lemma):
             tag_obj = cf_obj.gloss.tag_item(cf_obj, 'cf', glob_tag, glob_glob, glob_lemma, glob_id, glob_coll, '', '', '', '')
             for grandchild in child_node:
                 if grandchild.tag == 'id':
                     self.tag_glossitem(grandchild, cf_obj, tag_obj)
     return cf_obj
Code example #3
File: xmldao.py  Project: letuananh/yawlib
 def parse_synset(self, element):
     synset = Synset(element.get('id'),element.get('ofs'),element.get('pos')) if not self.memory_save else Synset(element.get('id'), '', '')
     for child in element:
         if child.tag == 'terms':
             for grandchild in child:
                 if grandchild.tag == 'term':
                     synset.add_term(StringTool.strip(grandchild.text))
         elif child.tag == 'keys':
             for grandchild in child:
                 if grandchild.tag == 'sk':
                     synset.add_sensekey(StringTool.strip(grandchild.text))
         elif child.tag == 'gloss' and child.get('desc') == 'orig' and not self.memory_save:
             if child[0].tag == 'orig':
                 synset.add_raw_gloss(GlossRaw.ORIG, StringTool.strip(child[0].text))
         elif child.tag == 'gloss' and child.get('desc') == 'text' and not self.memory_save:
             if child[0].tag == 'text':
                 synset.add_raw_gloss(GlossRaw.TEXT, StringTool.strip(child[0].text))
         elif child.tag == 'gloss' and child.get('desc') == 'wsd':
             for grandchild in child:
                 if grandchild.tag in ('def', 'ex'):
                     gloss = synset.add_gloss(grandchild.get('id'), StringTool.strip(grandchild.tag))
                     self.parse_gloss(grandchild, gloss)
                     # rip definition
                     pass
     #print("A synset")
     # print len(element)
     #print ','.join([ '%s (%s)' % (x.tag, ','.join([y.tag for y in x])) for x in element ])
     return synset
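
The XML layout that parse_synset() expects can be read off its branches: a synset element carrying id/ofs/pos attributes, with terms/term, keys/sk, and three gloss variants (desc="orig", desc="text", desc="wsd"). A minimal, hypothetical input and driver call under that assumption might look like the sketch below; the ids, offset, sensekey, text, and the dao variable are illustrative, not taken from the source.

from lxml import etree

# Hypothetical minimal input reconstructed from the branches of parse_synset();
# attribute values and text are illustrative only.
xml = '''<synset id="n00001740" ofs="00001740" pos="n">
    <terms><term>entity</term></terms>
    <keys><sk>entity%1:03:00::</sk></keys>
    <gloss desc="orig"><orig>that which is perceived to have its own distinct existence</orig></gloss>
    <gloss desc="text"><text>that which is perceived to have its own distinct existence</text></gloss>
    <gloss desc="wsd"><def id="n00001740_d">that which is perceived ...</def></gloss>
</synset>'''
synset = dao.parse_synset(etree.fromstring(xml))  # dao: an assumed instance of the surrounding DAO class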
Code example #4
File: models.py  Project: hoangducchinh/yawlib
 def __init__(self,
              gloss,
              tag,
              lemma,
              pos,
              cat,
              coll,
              rdf,
              origid,
              sep=None,
              text=None,
              itemid=-1):
     self.itemid = itemid
     self.gloss = gloss
     self.order = -1
     self.tag = StringTool.strip(tag)
     self.lemma = StringTool.strip(lemma)
     self.pos = StringTool.strip(pos)
     self.cat = StringTool.strip(cat)
     self.coll = StringTool.strip(coll)
     self.rdf = StringTool.strip(rdf)
     self.sep = StringTool.strip(sep)
     self.text = StringTool.strip(text)
     self.origid = StringTool.strip(origid)
     pass
Code example #5
File: test_leutile.py  Project: letuananh/chirptext
 def test_string_tool(self):
     self.assertEqual(StringTool.strip(None), '')
     self.assertEqual(StringTool.strip(' '), '')
     self.assertEqual(StringTool.to_str(None), '')
     # detokenize
     words = ["I", "'ll", "go", "home", "."]
     self.assertEqual(StringTool.detokenize(words), "I'll go home.")
     self.assertEqual(StringTool.detokenize(["This", "(", "thing", ")", "is", "a", "comment", "!"]), "This (thing) is a comment!")
     self.assertEqual(StringTool.detokenize("He said `` why ? '' .".split()), "He said “why?”.")
     self.assertEqual(StringTool.detokenize("Where are you ?".split()), "Where are you?")
     self.assertEqual(StringTool.detokenize("Note : It works .".split()), "Note: It works.")
     self.assertEqual(StringTool.detokenize("( A ) ; ".split()), "(A);")
     self.assertEqual(StringTool.detokenize("( A ) ; B ".split()), "(A); B")
Code example #6
def fix_token_text(tk):
    tk = StringTool.strip(tk).replace('\t',
                                      ' ').replace('|', ' ').replace('_', ' ')
    tk = tk.replace(" ' nuff", " 'nuff")
    tk = tk.replace("Ol ' ", "Ol' ")
    tk = tk.replace("O ' ", "O' ")
    tk = tk.replace("ma ' am", "ma'am")
    tk = tk.replace("Ma ' am", "Ma'am")
    tk = tk.replace("probl ' y", "probl'y")
    tk = tk.replace("ai n't", "ain't")
    tk = tk.replace("holdin '", "holdin'")
    tk = tk.replace("hangin '", "hangin'")
    tk = tk.replace("dryin ' ", "dryin' ")
    tk = tk.replace("Y ' all", "Y'all")
    tk = tk.replace("y ' know", "y'know")
    tk = tk.replace("c ' n", "c'n")
    tk = tk.replace("l ' identite", "l'identite")
    tk = tk.replace("Rue de L ' Arcade", "Rue de l'Arcade")
    tk = tk.replace("p ' lite", "p'lite")
    tk = tk.replace("rev ' rend", "rev'rend")
    tk = tk.replace("coup d ' etat", "coup d'etat")
    tk = tk.replace("t ' gethuh", "t'gethuh")
    tk = tk.replace('``', "“")
    tk = tk.replace("''", "”")
    tk = tk.replace(" ,", ",")
    tk = tk.replace("( ", "(")
    tk = tk.replace(" )", ")")
    tk = tk.replace(" ”", "”")
    tk = tk.replace(" 's", "'s")
    tk = tk.replace("o '", "o'")
    tk = tk.replace("s ' ", "s' ")
    tk = tk.replace(" , ", ", ")
    # tk = tk.replace(" ' ", "' ")
    return tk
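
As a quick illustration, feeding a SemCor-style space-tokenized string through fix_token_text() collapses the split contractions and the spaced punctuation it has rules for. The input below is made up for the example:

# Hypothetical input; contractions and spacing mimic SemCor token text.
print(fix_token_text("Y ' all ai n't holdin ' it , are you ?"))
# -> "Y'all ain't holdin' it, are you ?"
# Note that " ?" is left untouched: the function has no rule for spaced question marks.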
Code example #7
File: semcorxml.py  Project: letuananh/pysemcor
def fix_token_text(tk):
    tk = StringTool.strip(tk).replace('\t', ' ').replace('|', ' ').replace('_', ' ')
    tk = tk.replace(" ' nuff", " 'nuff")
    tk = tk.replace("Ol ' ", "Ol' ")
    tk = tk.replace("O ' ", "O' ")
    tk = tk.replace("ma ' am", "ma'am")
    tk = tk.replace("Ma ' am", "Ma'am")
    tk = tk.replace("probl ' y", "probl'y")
    tk = tk.replace("ai n't", "ain't")
    tk = tk.replace("holdin '", "holdin'")
    tk = tk.replace("hangin '", "hangin'")
    tk = tk.replace("dryin ' ", "dryin' ")
    tk = tk.replace("Y ' all", "Y'all")
    tk = tk.replace("y ' know", "y'know")
    tk = tk.replace("c ' n", "c'n")
    tk = tk.replace("l ' identite", "l'identite")
    tk = tk.replace("Rue de L ' Arcade", "Rue de l'Arcade")
    tk = tk.replace("p ' lite", "p'lite")
    tk = tk.replace("rev ' rend", "rev'rend")
    tk = tk.replace("coup d ' etat", "coup d'etat")
    tk = tk.replace("t ' gethuh", "t'gethuh")
    tk = tk.replace('``', "“")
    tk = tk.replace("''", "”")
    tk = tk.replace(" ,", ",")
    tk = tk.replace("( ", "(")
    tk = tk.replace(" )", ")")
    tk = tk.replace(" ”", "”")
    tk = tk.replace(" 's", "'s")
    tk = tk.replace("o '", "o'")
    tk = tk.replace("s ' ", "s' ")
    tk = tk.replace(" , ", ", ")
    # tk = tk.replace(" ' ", "' ")
    return tk
Code example #8
 def iterparse(self, path):
     tree = etree.iterparse(self.files.abspath(path),
                            events=('start', 'end'))
     filename = 'n/a'
     para = 'n/a'
     for event, element in tree:
         if event == 'start':
             if element.tag == 'context':
                 filename = element.get('filename')
             elif element.tag == 'p':
                 para = element.get('pnum')
         if event == 'end':
             if element.tag == 's':
                 # found a sentence
                 snum = element.get('snum')
                 tokens = []
                 for token in element:
                     token_data = dict(token.attrib)
                     token_data['tag'] = token.tag
                     text = fix_token_text(token.text)
                     if token.tag == 'wf':
                         # create sensekey
                         lemma = StringTool.strip(token.get('lemma'))
                         lexsn = StringTool.strip(token.get('lexsn'))
                         sk = lemma + '%' + lexsn if lemma and lexsn else ''
                         sk = StringTool.strip(
                             sk.replace('\t', ' ').replace('|', ' '))
                         if sk:
                             token_data['sk'] = sk
                         tokens.append(TokenInfo(text, **token_data))
                     elif token.tag == 'punc':
                         tokens.append(TokenInfo(text, **token_data))
                 element.clear()
                 s = {
                     'para': para,
                     'filename': filename,
                     'snum': snum,
                     'sid': "{}-{}-{}".format(filename, para, snum),
                     'tokens': tokens
                 }
                 yield s
             elif element.tag == 'p':
                 para = 'n/a'
                 element.clear()
             elif element.tag == 'context':
                 filename = 'n/a'
                 element.clear()
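
iterparse() yields one dict per sentence with the keys assembled in the 's' branch: 'para', 'filename', 'snum', 'sid', and 'tokens'. A usage sketch follows; the SemcorReader name, its constructor argument, and the file path are assumptions for illustration and are not taken from the source.

# Hypothetical driver; SemcorReader is an assumed name for the class that owns iterparse().
reader = SemcorReader('path/to/semcor')
for sent in reader.iterparse('brown1/tagfiles/br-a01.xml'):
    # each 'sent' is the dict yielded above
    print(sent['sid'], len(sent['tokens']))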
Code example #9
File: models.py  Project: letuananh/yawlib
 def __init__(self, gloss, tag, lemma, pos, cat, coll, rdf, origid, sep=None, text=None, itemid=-1):
     self.itemid = itemid
     self.gloss = gloss
     self.order = -1
     self.tag = StringTool.strip(tag)
     self.lemma = StringTool.strip(lemma)
     self.pos = StringTool.strip(pos)
     self.cat = StringTool.strip(cat)
     self.coll = StringTool.strip(coll)
     self.rdf = StringTool.strip(rdf)
     self.sep = StringTool.strip(sep)
     self.text = StringTool.strip(text)
     self.origid = StringTool.strip(origid)
     pass
Code example #10
File: xmldao.py  Project: letuananh/yawlib
    def tag_glossitem(self, id_node, glossitem, tag_obj):
        ''' Parse ID element and tag a glossitem
        '''
        sk = StringTool.strip(id_node.get('sk'))
        origid = StringTool.strip(id_node.get('id'))
        coll = StringTool.strip(id_node.get('coll'))
        lemma = StringTool.strip(id_node.get('lemma'))

        if tag_obj is None:
            tag_obj = glossitem.gloss.tag_item(glossitem, '', '', '', '', '', coll, origid, '', sk, lemma)
        else:
            tag_obj.sk     = sk
            tag_obj.origid = origid
            tag_obj.coll   = coll
            tag_obj.lemma  = lemma

        # WEIRD STUFF: lemma="purposefully ignored" sk="purposefully_ignored%0:00:00::"
        if lemma == 'purposefully ignored' and sk == "purposefully_ignored%0:00:00::":
            tag_obj.cat = 'PURPOSEFULLY_IGNORED'
Code example #11
File: semcorxml.py  Project: letuananh/pysemcor
 def iterparse(self, path):
     tree = etree.iterparse(self.files.abspath(path), events=('start', 'end'))
     filename = 'n/a'
     para = 'n/a'
     for event, element in tree:
         if event == 'start':
             if element.tag == 'context':
                 filename = element.get('filename')
             elif element.tag == 'p':
                 para = element.get('pnum')
         if event == 'end':
             if element.tag == 's':
                 # found a sentence
                 snum = element.get('snum')
                 tokens = []
                 for token in element:
                     token_data = dict(token.attrib)
                     token_data['tag'] = token.tag
                     text = fix_token_text(token.text)
                     if token.tag == 'wf':
                         # create sensekey
                         lemma = StringTool.strip(token.get('lemma'))
                         lexsn = StringTool.strip(token.get('lexsn'))
                         sk = lemma + '%' + lexsn if lemma and lexsn else ''
                         sk = StringTool.strip(sk.replace('\t', ' ').replace('|', ' '))
                         if sk:
                             token_data['sk'] = sk
                         tokens.append(TokenInfo(text, **token_data))
                     elif token.tag == 'punc':
                         tokens.append(TokenInfo(text, **token_data))
                 element.clear()
                 s = {'para': para,
                      'filename': filename,
                      'snum': snum,
                      'sid': "{}-{}-{}".format(filename, para, snum),
                      'tokens': tokens}
                 yield s
             elif element.tag == 'p':
                 para = 'n/a'
                 element.clear()
             elif element.tag == 'context':
                 filename = 'n/a'
                 element.clear()
Code example #12
File: xmldao.py  Project: hoangducchinh/yawlib
    def tag_glossitem(self, id_node, glossitem, tag_obj):
        ''' Parse ID element and tag a glossitem
        '''
        sk = StringTool.strip(id_node.get('sk'))
        origid = StringTool.strip(id_node.get('id'))
        coll = StringTool.strip(id_node.get('coll'))
        lemma = StringTool.strip(id_node.get('lemma'))

        if tag_obj is None:
            tag_obj = glossitem.gloss.tag_item(glossitem, '', '', '', '', '',
                                               coll, origid, '', sk, lemma)
        else:
            tag_obj.itemid = glossitem.origid
            tag_obj.sk = sk
            tag_obj.origid = origid
            tag_obj.coll = coll
            tag_obj.lemma = lemma

        # WEIRD STUFF: lemma="purposefully ignored" sk="purposefully_ignored%0:00:00::"
        if lemma == 'purposefully ignored' and sk == "purposefully_ignored%0:00:00::":
            tag_obj.cat = 'PURPOSEFULLY_IGNORED'
Code example #13
File: xmldao.py  Project: hoangducchinh/yawlib
 def parse_synset(self, element):
     synset = GlossedSynset(element.get('id'))
     for child in element:
         if child.tag == 'terms':
             for grandchild in child:
                 # term is a lemma
                 if grandchild.tag == 'term':
                     synset.add_lemma(StringTool.strip(grandchild.text))
         elif child.tag == 'keys':
             for grandchild in child:
                 if grandchild.tag == 'sk':
                     synset.add_key(StringTool.strip(grandchild.text))
         elif child.tag == 'gloss' and child.get(
                 'desc') == 'orig' and not self.memory_save:
             if child[0].tag == 'orig':
                 synset.add_raw_gloss(GlossRaw.ORIG,
                                      StringTool.strip(child[0].text))
         elif child.tag == 'gloss' and child.get(
                 'desc') == 'text' and not self.memory_save:
             if child[0].tag == 'text':
                 synset.add_raw_gloss(GlossRaw.TEXT,
                                      StringTool.strip(child[0].text))
         elif child.tag == 'gloss' and child.get('desc') == 'wsd':
             for grandchild in child:
                 # [2016-02-12 LTA] aux should be parsed as well
                 if grandchild.tag in ('def', 'ex', 'aux'):
                     gloss = synset.add_gloss(
                         grandchild.get('id'),
                         StringTool.strip(grandchild.tag))
                     self.parse_gloss(grandchild, gloss)
                     # rip definition
                     pass
     #print("A synset")
     # print len(element)
     #print ','.join([ '%s (%s)' % (x.tag, ','.join([y.tag for y in x])) for x in element ])
     return synset
Code example #14
File: xmldao.py  Project: letuananh/yawlib
 def parse_wf(self, wf_node, gloss):
     ''' Parse a word feature node and then add to gloss object
     '''
     tag = wf_node.get('tag') if not self.memory_save else ''
     lemma = wf_node.get('lemma') if not self.memory_save else ''
     pos = wf_node.get('pos')
     cat = wf_node.get('type') # if wf_node.get('type') else 'wf'
     coll = None # wf_node.get('coll')
     rdf = wf_node.get('rdf')
     origid = wf_node.get('id')
     sep = wf_node.get('sep')
     text = StringTool.strip(wf_node.xpath("string()")) # XML mixed content, don't use text attr here
     wf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf, origid, sep, text)
     # Then parse id tag if available
     for child in wf_node:
         if child.tag == 'id':
             self.tag_glossitem(child, wf_obj, None)
     return wf_obj
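
The comment on the text line above is worth unpacking: wf elements hold XML mixed content, so the .text attribute only returns the text that appears before the first child element, while xpath("string()") concatenates every text node under the element. A small standalone illustration (the element content is made up):

from lxml import etree

# Made-up mixed-content element, similar in shape to a glosstag <wf> node.
node = etree.fromstring('<wf lemma="turn">turn<id sk="turn%2:38:00::"/>ing</wf>')
print(node.text)               # 'turn'     -> only the text before the first child
print(node.xpath("string()"))  # 'turning'  -> the full string value of the element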
Code example #15
File: xmldao.py  Project: hoangducchinh/yawlib
 def parse_wf(self, wf_node, gloss):
     ''' Parse a word feature node and then add to gloss object
     '''
     tag = wf_node.get('tag') if not self.memory_save else ''
     lemma = wf_node.get('lemma') if not self.memory_save else ''
     pos = wf_node.get('pos')
     cat = wf_node.get('type')  # if wf_node.get('type') else 'wf'
     coll = None  # wf_node.get('coll')
     rdf = wf_node.get('rdf')
     origid = wf_node.get('id')
     sep = wf_node.get('sep')
     text = StringTool.strip(wf_node.xpath(
         "string()"))  # XML mixed content, don't use text attr here
     wf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf, origid,
                                   sep, text, origid)
     # Then parse id tag if available
     for child in wf_node:
         if child.tag == 'id':
             self.tag_glossitem(child, wf_obj, None)
     return wf_obj
Code example #16
File: models.py  Project: hoangducchinh/yawlib
 def __init__(self, synset, cat, gloss):
     self.synset = synset
     self.cat = StringTool.strip(cat)
     self.gloss = StringTool.strip(gloss)
Code example #17
File: models.py  Project: letuananh/yawlib
 def __init__(self, synset, cat, gloss):
     self.synset = synset
     self.cat = StringTool.strip(cat)
     self.gloss = StringTool.strip(gloss)