def parse_cf(self, cf_node, gloss):
    ''' Parse a cf node, add it to the gloss object and apply its glob tags

    Arguments:
        cf_node -- XML element of the cf item (attributes are read with .get())
        gloss   -- gloss object that receives the item via add_gloss_item()

    Returns the object created by gloss.add_gloss_item().
    '''
    # In memory-saving mode tag and lemma are not read from the node at all
    if self.memory_save:
        tag = ''
        lemma = ''
    else:
        tag = cf_node.get('tag')
        lemma = StringTool.strip(cf_node.get('lemma'))
    pos = cf_node.get('pos')
    cat = cf_node.get('type')
    coll = cf_node.get('coll')
    rdf = cf_node.get('rdf')
    origid = cf_node.get('id')
    sep = cf_node.get('sep')
    # string() collects the text of the node together with its descendants
    text = StringTool.strip(cf_node.xpath("string()"))
    # NOTE(review): the last argument is built by concatenation, so a node
    # without a coll attribute would fail here with TypeError if .get()
    # returns None -- confirm that every cf node carries coll
    cf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf,
                                  origid, sep, text, 'coll:' + coll)

    # Glob info is optional: each <glob> child becomes one tag on the item
    for glob_node in cf_node:
        if glob_node.tag != 'glob':
            continue
        # tag_item(item, cat, tag, glob, glemma, gid, coll, origid, sid, sk, lemma)
        tag_obj = cf_obj.gloss.tag_item(cf_obj, 'cf',
                                        glob_node.get('tag'),
                                        glob_node.get('glob'),
                                        glob_node.get('lemma'),
                                        glob_node.get('id'),
                                        glob_node.get('coll'),
                                        '', '', '', '')
        # <id> children of the glob fill in the details of that same tag
        for id_node in glob_node:
            if id_node.tag == 'id':
                self.tag_glossitem(id_node, cf_obj, tag_obj)
    return cf_obj
def parse_cf(self, cf_node, gloss):
    ''' Parse a cf node, add it to the gloss object and apply its glob tags

    Arguments:
        cf_node -- XML element of the cf item (attributes are read with .get())
        gloss   -- gloss object that receives the item via add_gloss_item()

    Returns the object created by gloss.add_gloss_item().
    '''
    # In memory-saving mode tag and lemma are not read from the node at all
    if self.memory_save:
        tag = ''
        lemma = ''
    else:
        tag = cf_node.get('tag')
        lemma = StringTool.strip(cf_node.get('lemma'))
    pos = cf_node.get('pos')
    cat = cf_node.get('type')
    coll = cf_node.get('coll')
    rdf = cf_node.get('rdf')
    origid = cf_node.get('id')
    sep = cf_node.get('sep')
    # string() collects the text of the node together with its descendants
    text = StringTool.strip(cf_node.xpath("string()"))
    cf_obj = gloss.add_gloss_item(tag, lemma, pos, cat, coll, rdf,
                                  origid, sep, text)

    # Glob info is optional: each <glob> child becomes one tag on the item
    for glob_node in cf_node:
        if glob_node.tag != 'glob':
            continue
        # tag_item(item, cat, tag, glob, glemma, gid, coll, origid, sid, sk, lemma)
        tag_obj = cf_obj.gloss.tag_item(cf_obj, 'cf',
                                        glob_node.get('tag'),
                                        glob_node.get('glob'),
                                        glob_node.get('lemma'),
                                        glob_node.get('id'),
                                        glob_node.get('coll'),
                                        '', '', '', '')
        # <id> children of the glob fill in the details of that same tag
        for id_node in glob_node:
            if id_node.tag == 'id':
                self.tag_glossitem(id_node, cf_obj, tag_obj)
    return cf_obj
def parse_synset(self, element):
    ''' Build a Synset from a synset XML element

    Child elements handled:
        terms/term         -- added with synset.add_term()
        keys/sk            -- added with synset.add_sensekey()
        gloss desc="orig"  -- raw gloss GlossRaw.ORIG (skipped in memory-saving mode)
        gloss desc="text"  -- raw gloss GlossRaw.TEXT (skipped in memory-saving mode)
        gloss desc="wsd"   -- every def/ex child is parsed with self.parse_gloss()

    In memory-saving mode the synset is created with empty ofs and pos.
    Returns the populated Synset.
    '''
    keep_raw = not self.memory_save
    if keep_raw:
        synset = Synset(element.get('id'), element.get('ofs'), element.get('pos'))
    else:
        synset = Synset(element.get('id'), '', '')

    for child in element:
        if child.tag == 'terms':
            for term_node in child:
                if term_node.tag == 'term':
                    synset.add_term(StringTool.strip(term_node.text))
        elif child.tag == 'keys':
            for key_node in child:
                if key_node.tag == 'sk':
                    synset.add_sensekey(StringTool.strip(key_node.text))
        elif child.tag == 'gloss':
            desc = child.get('desc')
            if desc == 'orig' and keep_raw:
                # Guard against an empty gloss element: indexing child[0]
                # without children raises IndexError
                if len(child) > 0 and child[0].tag == 'orig':
                    synset.add_raw_gloss(GlossRaw.ORIG,
                                         StringTool.strip(child[0].text))
            elif desc == 'text' and keep_raw:
                if len(child) > 0 and child[0].tag == 'text':
                    synset.add_raw_gloss(GlossRaw.TEXT,
                                         StringTool.strip(child[0].text))
            elif desc == 'wsd':
                for part in child:
                    if part.tag in ('def', 'ex'):
                        # The element tag itself ('def' or 'ex') is the gloss category
                        gloss = synset.add_gloss(part.get('id'),
                                                 StringTool.strip(part.tag))
                        self.parse_gloss(part, gloss)
    return synset
def __init__(self, gloss, tag, lemma, pos, cat, coll, rdf, origid, sep=None, text=None, itemid=-1):
    ''' Create an item that belongs to the given gloss

    gloss and itemid are stored as given and order starts at -1.
    Every other field is passed through StringTool.strip() before it is stored.
    '''
    clean = StringTool.strip

    # Identity and position
    self.itemid = itemid
    self.gloss = gloss
    self.order = -1

    # Fields normalised with StringTool.strip()
    self.tag = clean(tag)
    self.lemma = clean(lemma)
    self.pos = clean(pos)
    self.cat = clean(cat)
    self.coll = clean(coll)
    self.rdf = clean(rdf)
    self.sep = clean(sep)
    self.text = clean(text)
    self.origid = clean(origid)
def test_string_tool(self):
    ''' Check StringTool.strip, to_str and detokenize against known inputs '''
    # None and whitespace-only input are both expected to come back empty
    for raw in (None, ' '):
        self.assertEqual(StringTool.strip(raw), '')
    self.assertEqual(StringTool.to_str(None), '')

    # detokenize: (token list, expected sentence)
    cases = [
        (["I", "'ll", "go", "home", "."],
         "I'll go home."),
        (["This", "(", "thing", ")", "is", "a", "comment", "!"],
         "This (thing) is a comment!"),
        ("He said `` why ? '' .".split(),
         "He said “why?”."),
        ("Where are you ?".split(),
         "Where are you?"),
        ("Note : It works .".split(),
         "Note: It works."),
        ("( A ) ; ".split(),
         "(A);"),
        ("( A ) ; B ".split(),
         "(A); B"),
    ]
    for tokens, expected in cases:
        self.assertEqual(StringTool.detokenize(tokens), expected)
def fix_token_text(tk):
    ''' Normalise the text of a token

    The text is passed through StringTool.strip() and then a fixed list of
    literal replacements is applied, strictly in the order listed below.
    Returns the resulting string.
    '''
    # Order matters: later rules operate on the output of earlier ones
    replacements = (
        # separator characters become plain spaces
        ('\t', ' '),
        ('|', ' '),
        ('_', ' '),
        # specific contractions / apostrophe forms
        (" ' nuff", " 'nuff"),
        ("Ol ' ", "Ol' "),
        ("O ' ", "O' "),
        ("ma ' am", "ma'am"),
        ("Ma ' am", "Ma'am"),
        ("probl ' y", "probl'y"),
        ("ai n't", "ain't"),
        ("holdin '", "holdin'"),
        ("hangin '", "hangin'"),
        ("dryin ' ", "dryin' "),
        ("Y ' all", "Y'all"),
        ("y ' know", "y'know"),
        ("c ' n", "c'n"),
        ("l ' identite", "l'identite"),
        ("Rue de L ' Arcade", "Rue de l'Arcade"),
        ("p ' lite", "p'lite"),
        ("rev ' rend", "rev'rend"),
        ("coup d ' etat", "coup d'etat"),
        ("t ' gethuh", "t'gethuh"),
        # quote marks
        ('``', "“"),
        ("''", "”"),
        # spacing around punctuation
        (" ,", ","),
        ("( ", "("),
        (" )", ")"),
        (" ”", "”"),
        (" 's", "'s"),
        ("o '", "o'"),
        ("s ' ", "s' "),
        (" , ", ", "),
        # a generic (" ' " -> "' ") rule is intentionally not applied
    )
    fixed = StringTool.strip(tk)
    for old, new in replacements:
        fixed = fixed.replace(old, new)
    return fixed
def iterparse(self, path):
    ''' Incrementally parse an XML file and yield one dict per <s> element

    path is resolved with self.files.abspath(). Each yielded dict has the keys
    'para', 'filename', 'snum', 'sid' and 'tokens', where tokens is a list of
    TokenInfo objects built from the <wf> and <punc> children of the sentence.
    '''
    filename = 'n/a'
    para = 'n/a'
    events = etree.iterparse(self.files.abspath(path), events=('start', 'end'))
    for event, element in events:
        if event == 'start':
            # Remember the enclosing context / paragraph as soon as it opens
            if element.tag == 'context':
                filename = element.get('filename')
            elif element.tag == 'p':
                para = element.get('pnum')
            continue
        if event != 'end':
            continue

        if element.tag == 's':
            # A complete sentence is available
            snum = element.get('snum')
            tokens = []
            for token in element:
                token_data = dict(token.attrib)
                token_data['tag'] = token.tag
                text = fix_token_text(token.text)
                if token.tag == 'wf':
                    # Sensekey is lemma%lexsn, only when both parts are present
                    lemma = StringTool.strip(token.get('lemma'))
                    lexsn = StringTool.strip(token.get('lexsn'))
                    if lemma and lexsn:
                        sk = lemma + '%' + lexsn
                    else:
                        sk = ''
                    sk = StringTool.strip(sk.replace('\t', ' ').replace('|', ' '))
                    if sk:
                        token_data['sk'] = sk
                # Children with any other tag are ignored
                if token.tag in ('wf', 'punc'):
                    tokens.append(TokenInfo(text, **token_data))
            # Everything needed has been copied out of the element by now
            element.clear()
            yield {'para': para,
                   'filename': filename,
                   'snum': snum,
                   'sid': "{}-{}-{}".format(filename, para, snum),
                   'tokens': tokens}
        elif element.tag == 'p':
            para = 'n/a'
            element.clear()
        elif element.tag == 'context':
            filename = 'n/a'
            element.clear()
def tag_glossitem(self, id_node, glossitem, tag_obj):
    ''' Parse an ID element and tag a glossitem with it

    Arguments:
        id_node   -- element whose sk, id, coll and lemma attributes are read
        glossitem -- item to tag; a new tag is created through glossitem.gloss
        tag_obj   -- existing tag to update, or None to create a new one
    '''
    sk, origid, coll, lemma = (StringTool.strip(id_node.get(attr))
                               for attr in ('sk', 'id', 'coll', 'lemma'))

    if tag_obj is not None:
        # Fill in the details of the tag that was passed in
        tag_obj.sk = sk
        tag_obj.origid = origid
        tag_obj.coll = coll
        tag_obj.lemma = lemma
    else:
        tag_obj = glossitem.gloss.tag_item(glossitem, '', '', '', '', '',
                                           coll, origid, '', sk, lemma)

    # Special case in the data:
    # lemma="purposefully ignored" sk="purposefully_ignored%0:00:00::"
    is_ignored = (lemma == 'purposefully ignored'
                  and sk == "purposefully_ignored%0:00:00::")
    if is_ignored:
        tag_obj.cat = 'PURPOSEFULLY_IGNORED'
def iterparse(self, path):
    ''' Incrementally parse an XML file and yield one dict per <s> element

    path is resolved with self.files.abspath(). Each yielded dict has the keys
    'para', 'filename', 'snum', 'sid' and 'tokens', where tokens is a list of
    TokenInfo objects built from the <wf> and <punc> children of the sentence.
    '''
    filename = 'n/a'
    para = 'n/a'
    source = self.files.abspath(path)
    for event, node in etree.iterparse(source, events=('start', 'end')):
        opening = (event == 'start')
        closing = (event == 'end')

        # Opening tags only update the current file name / paragraph number
        if opening and node.tag == 'context':
            filename = node.get('filename')
        elif opening and node.tag == 'p':
            para = node.get('pnum')

        if not closing:
            continue

        if node.tag == 'p':
            para = 'n/a'
            node.clear()
        elif node.tag == 'context':
            filename = 'n/a'
            node.clear()
        elif node.tag == 's':
            # A complete sentence is available
            snum = node.get('snum')
            tokens = []
            for token in node:
                token_data = dict(token.attrib)
                token_data['tag'] = token.tag
                text = fix_token_text(token.text)
                if token.tag == 'punc':
                    tokens.append(TokenInfo(text, **token_data))
                elif token.tag == 'wf':
                    # Sensekey is lemma%lexsn, only when both parts are present
                    lemma = StringTool.strip(token.get('lemma'))
                    lexsn = StringTool.strip(token.get('lexsn'))
                    raw_sk = ''
                    if lemma and lexsn:
                        raw_sk = lemma + '%' + lexsn
                    sk = StringTool.strip(raw_sk.replace('\t', ' ').replace('|', ' '))
                    if sk:
                        token_data['sk'] = sk
                    tokens.append(TokenInfo(text, **token_data))
            # Everything needed has been copied out of the element by now
            node.clear()
            sentence = {
                'para': para,
                'filename': filename,
                'snum': snum,
                'sid': "{}-{}-{}".format(filename, para, snum),
                'tokens': tokens,
            }
            yield sentence
def tag_glossitem(self, id_node, glossitem, tag_obj):
    ''' Parse an ID element and tag a glossitem with it

    Arguments:
        id_node   -- element whose sk, id, coll and lemma attributes are read
        glossitem -- item to tag; a new tag is created through glossitem.gloss
        tag_obj   -- existing tag to update, or None to create a new one
    '''
    sk, origid, coll, lemma = (StringTool.strip(id_node.get(attr))
                               for attr in ('sk', 'id', 'coll', 'lemma'))

    if tag_obj is not None:
        # Link the existing tag to the item and fill in its details
        tag_obj.itemid = glossitem.origid
        tag_obj.sk = sk
        tag_obj.origid = origid
        tag_obj.coll = coll
        tag_obj.lemma = lemma
    else:
        tag_obj = glossitem.gloss.tag_item(glossitem, '', '', '', '', '',
                                           coll, origid, '', sk, lemma)

    # Special case in the data:
    # lemma="purposefully ignored" sk="purposefully_ignored%0:00:00::"
    is_ignored = (lemma == 'purposefully ignored'
                  and sk == "purposefully_ignored%0:00:00::")
    if is_ignored:
        tag_obj.cat = 'PURPOSEFULLY_IGNORED'
def parse_synset(self, element):
    ''' Build a GlossedSynset from a synset XML element

    Child elements handled:
        terms/term         -- each term is a lemma, added with synset.add_lemma()
        keys/sk            -- added with synset.add_key()
        gloss desc="orig"  -- raw gloss GlossRaw.ORIG (skipped in memory-saving mode)
        gloss desc="text"  -- raw gloss GlossRaw.TEXT (skipped in memory-saving mode)
        gloss desc="wsd"   -- every def/ex/aux child is parsed with self.parse_gloss()

    Returns the populated GlossedSynset.
    '''
    keep_raw = not self.memory_save
    synset = GlossedSynset(element.get('id'))

    for child in element:
        if child.tag == 'terms':
            for term_node in child:
                if term_node.tag == 'term':
                    synset.add_lemma(StringTool.strip(term_node.text))
        elif child.tag == 'keys':
            for key_node in child:
                if key_node.tag == 'sk':
                    synset.add_key(StringTool.strip(key_node.text))
        elif child.tag == 'gloss':
            desc = child.get('desc')
            if desc == 'orig' and keep_raw:
                # Guard against an empty gloss element: indexing child[0]
                # without children raises IndexError
                if len(child) > 0 and child[0].tag == 'orig':
                    synset.add_raw_gloss(GlossRaw.ORIG,
                                         StringTool.strip(child[0].text))
            elif desc == 'text' and keep_raw:
                if len(child) > 0 and child[0].tag == 'text':
                    synset.add_raw_gloss(GlossRaw.TEXT,
                                         StringTool.strip(child[0].text))
            elif desc == 'wsd':
                for part in child:
                    # [2016-02-12 LTA] aux is parsed as well as def and ex
                    if part.tag in ('def', 'ex', 'aux'):
                        # The element tag itself is used as the gloss category
                        gloss = synset.add_gloss(part.get('id'),
                                                 StringTool.strip(part.tag))
                        self.parse_gloss(part, gloss)
    return synset
def parse_wf(self, wf_node, gloss):
    ''' Parse a wf node, add it to the gloss object and tag it with its <id> children

    Arguments:
        wf_node -- XML element of the wf item (attributes are read with .get())
        gloss   -- gloss object that receives the item via add_gloss_item()

    Returns the object created by gloss.add_gloss_item().
    '''
    # In memory-saving mode tag and lemma are not read from the node at all
    if self.memory_save:
        tag = ''
        lemma = ''
    else:
        tag = wf_node.get('tag')
        lemma = wf_node.get('lemma')
    # The node is mixed content, so the text attribute alone is not enough:
    # string() collects the text of the node together with its descendants
    text = StringTool.strip(wf_node.xpath("string()"))
    wf_obj = gloss.add_gloss_item(
        tag,
        lemma,
        wf_node.get('pos'),
        wf_node.get('type'),
        None,  # coll is deliberately not read from the wf node
        wf_node.get('rdf'),
        wf_node.get('id'),
        wf_node.get('sep'),
        text)

    # Each <id> child, if any, creates a new tag on the item
    for id_node in wf_node:
        if id_node.tag == 'id':
            self.tag_glossitem(id_node, wf_obj, None)
    return wf_obj
def parse_wf(self, wf_node, gloss):
    ''' Parse a wf node, add it to the gloss object and tag it with its <id> children

    Arguments:
        wf_node -- XML element of the wf item (attributes are read with .get())
        gloss   -- gloss object that receives the item via add_gloss_item()

    Returns the object created by gloss.add_gloss_item().
    '''
    # In memory-saving mode tag and lemma are not read from the node at all
    if self.memory_save:
        tag = ''
        lemma = ''
    else:
        tag = wf_node.get('tag')
        lemma = wf_node.get('lemma')
    origid = wf_node.get('id')
    # The node is mixed content, so the text attribute alone is not enough:
    # string() collects the text of the node together with its descendants
    text = StringTool.strip(wf_node.xpath("string()"))
    wf_obj = gloss.add_gloss_item(
        tag,
        lemma,
        wf_node.get('pos'),
        wf_node.get('type'),
        None,  # coll is deliberately not read from the wf node
        wf_node.get('rdf'),
        origid,
        wf_node.get('sep'),
        text,
        origid)  # the node id is passed a second time as the final argument

    # Each <id> child, if any, creates a new tag on the item
    for id_node in wf_node:
        if id_node.tag == 'id':
            self.tag_glossitem(id_node, wf_obj, None)
    return wf_obj
def __init__(self, synset, cat, gloss):
    ''' Store a gloss text of the given category for a synset

    synset is kept as given; cat and gloss are passed through
    StringTool.strip() before they are stored.
    '''
    clean = StringTool.strip
    self.synset = synset
    self.cat = clean(cat)
    self.gloss = clean(gloss)