def load_lexunits(germanet, tree):
    """
    Takes the XML tree and walks through it to create the Lexunit objects.

    Every child of the tree's root is read as one synset: a Synset is built
    from the child's attributes and stored in ``germanet.synsets``. Each
    sub-element tagged LEXUNIT is converted with ``create_lexunit``, indexed
    on the germanet object (by id, word category and word class) and attached
    to its synset. Afterwards all lexical units of the same synset are linked
    to one another through the ``LexRel.has_synonym`` relation.

    :param germanet: the germanet object
    :param tree: XML tree
    """
    for synset_node in tree.getroot():
        attributes = synset_node.attrib
        synset_id = attributes[SYNID]
        category = get_attribute_element(attributes, WORDCATEGORY, WordCategory)
        word_class = get_attribute_element(attributes, WORDCLASS, WordClass)

        synset = Synset(synset_id, category, word_class)
        germanet.synsets[synset.id] = synset

        for node in synset_node:
            # Only LEXUNIT sub-elements describe lexical units.
            if node.tag != LEXUNIT:
                continue
            lexunit = create_lexunit(germanet, node.attrib, node, synset)
            germanet.lexunits[lexunit.id] = lexunit
            germanet.wordcat2lexid[category.name].add(lexunit.id)
            germanet.wordclass2lexid[word_class.name].add(lexunit.id)
            synset.add_lexunit(lexunit)

        # Every unit of the synset is a synonym of every *other* unit.
        for unit in synset.lexunits:
            for other in synset.lexunits:
                if other is unit:
                    continue
                unit.relations[LexRel.has_synonym].add(other)
def __init__(self):
    """Initialise the handler's parsing state.

    Calls the ``ContentHandler`` initialiser, installs a fresh ``Locator``
    and resets every counter, flag and temporary used while synsets are
    being read.
    """
    ContentHandler.__init__(self)

    # Dummy setDocumentLocator does the same!
    self._locator = Locator()
    self.setDocumentLocator(self._locator)

    # input line number
    self.m_lcnt = 0
    # contains the XML path to the current node (names of the ancestors)
    self.m_ppath = []
    # -1: not started synset yet, 0: inside synset, 1: done with synset
    self.m_done = -1

    # output structs: the synset being filled and the list of results
    self.m_syns = Synset()
    self.m_syns_list = []

    # Temp vars for Tuples (std::pair in C++): the two halves of each pair
    # are kept here until the pair is complete.
    self.m_ilrs0_temp = self.m_ilrs1_temp = ''
    self.m_sumolinks0_temp = self.m_sumolinks1_temp = ''
    self.m_elrs0_temp = self.m_elrs1_temp = ''
    self.m_elrs30_temp = self.m_elrs31_temp = ''
    self.m_ekszlinks0_temp = self.m_ekszlinks1_temp = ''
    self.m_vframelinks0_temp = self.m_vframelinks1_temp = ''

    # was there a starting / an end root tag?
    self.m_startroot = False
    self.m_endroot = False
def synsets_get_generator(self):
    '''
    lazily yield one Synset object per synset element of the document

    The elements are looked up with self.doc.iterfind using the xml path
    stored in self.path_to_synset_els; each one is wrapped in a Synset
    together with self.reltypes and self.syn_ids.

    @rtype: generator
    @return: generator of Synset objects wrapping the synset XML elements
    '''
    matching_els = self.doc.iterfind(self.path_to_synset_els)
    for el in matching_els:
        yield Synset(el, self.reltypes, self.syn_ids)
class Link:
    """
    A class that represents two linked synsets

    Args:
        synset1id (int): the synsetid of the first synset in wn_db
        synset2id (int): the synsetid of the second synset in wn_db
        linktyp (str): what kind of link it is
        wn_db (sqlite cursor): a sqlite cursor that can access WordNet information
        with_gloss (bool): passed on to both Synset objects as ``get_glosss``

    Attributes:
        linktyp (str): what kind of link it is
        synset1 (Synset): obj of the first synset
        synset2 (Synset): obj of the second synset
    """

    def __init__(self, synset1id, synset2id, linktyp, wn_db, with_gloss=False):
        gloss_option = {"get_glosss": with_gloss}
        self.synset1 = Synset(synset1id, wn_db, **gloss_option)
        self.synset2 = Synset(synset2id, wn_db, **gloss_option)
        self.linktyp = linktyp

    def __repr__(self):
        """Return ``<repr of synset1>--><repr of synset2>``."""
        return "{!r}-->{!r}".format(self.synset1, self.synset2)

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return "{!r}-->{!r}".format(self.synset1, self.synset2)

    def dif_vector(self, word_vectors):
        """Return ``synset1`` vector minus ``synset2`` vector.

        Returns None unless both synsets report ``is_in(word_vectors)``.
        """
        first, second = self.synset1, self.synset2
        if not first.is_in(word_vectors) or not second.is_in(word_vectors):
            return None
        return first.vectorize1(word_vectors) - second.vectorize1(word_vectors)

    def as_list(self):
        """Return the link as a flat list.

        The list holds both synset ids, both parts of speech, the
        space-joined words of each synset and both glosses; all text
        fields have their non-ASCII characters dropped.
        """
        def ascii_only(text):
            # Drop every character that cannot be encoded as ASCII.
            return text.encode('ascii', 'ignore').decode('ascii')

        first, second = self.synset1, self.synset2
        return [
            first.synsetid,
            second.synsetid,
            first.pos,
            second.pos,
            ascii_only(" ".join(first.words)),
            ascii_only(" ".join(second.words)),
            ascii_only(first.gloss),
            ascii_only(second.gloss),
        ]
def __init__(self, synset1id, synset2id, linktyp, wn_db, with_gloss=False):
    """Build the two Synset objects of the link and remember its type.

    Both synsets are created from their id and ``wn_db``; ``with_gloss``
    is handed to each of them as the ``get_glosss`` keyword.
    """
    gloss_option = {"get_glosss": with_gloss}
    self.synset1 = Synset(synset1id, wn_db, **gloss_option)
    self.synset2 = Synset(synset2id, wn_db, **gloss_option)
    self.linktyp = linktyp
def endElement(self, name):
    """Handle the end tag of element ``name``.

    - 'WNXML': records that the root end tag was seen.
    - 'SYNSET': stores the current synset together with ``m_lcnt`` in
      ``m_syns_list`` and starts a fresh Synset; raises
      WNXMLParserException if ``m_done`` is not 0.
    - 'ILR', 'SUMO', 'ELR', 'ELR3', 'EKSZ', 'VFRAME' directly below
      'SYNSET': appends the pair collected in the matching temp fields to
      the synset's list and clears both temps.

    In every case the last entry of ``m_ppath`` is removed afterwards.
    """
    if DEBUG:
        print('(', self._locator.getLineNumber(), ', ',
              self._locator.getColumnNumber(), '): /',
              '/'.join(self.m_ppath), '/END: ', name, sep='')

    # Second-to-last path entry; presumably the last one is the element
    # being closed (pushed by the start handler, not shown here).
    parent = self.m_ppath[-2] if len(self.m_ppath) >= 2 else ''

    # element name -> (list on the synset, first temp field, second temp field)
    pair_elements = {
        'ILR': ('ilrs', 'm_ilrs0_temp', 'm_ilrs1_temp'),
        'SUMO': ('sumolinks', 'm_sumolinks0_temp', 'm_sumolinks1_temp'),
        'ELR': ('elrs', 'm_elrs0_temp', 'm_elrs1_temp'),
        'ELR3': ('elrs3', 'm_elrs30_temp', 'm_elrs31_temp'),
        'EKSZ': ('ekszlinks', 'm_ekszlinks0_temp', 'm_ekszlinks1_temp'),
        'VFRAME': ('vframelinks', 'm_vframelinks0_temp',
                   'm_vframelinks1_temp'),
    }

    if name == 'WNXML':
        self.m_endroot = True
    elif name == 'SYNSET':
        if self.m_done != 0:
            raise WNXMLParserException(
                'This is impossible!\nThe parser should\'ve caught this error:'
                ' \'SYNSET\' end tag without previous begin tag')
        self.m_done = 1
        self.m_syns_list.append((self.m_syns, self.m_lcnt))
        self.m_syns = Synset()
    elif parent == 'SYNSET' and name in pair_elements:
        target, first_temp, second_temp = pair_elements[name]
        getattr(self.m_syns, target).append(
            (getattr(self, first_temp), getattr(self, second_temp)))
        setattr(self, first_temp, '')
        setattr(self, second_temp, '')

    self.m_ppath.pop()
def synsets_add_synset(self, sy_id, synset_provenance, definition, rels):
    '''
    create a new synset element and add it to the lexicon

    the synset is added if it has a hypernym relation to an existing synset.

    WARNING not added if:
    (1) no ili identifier is known for sy_id
    (2) sy_id already exists
    (3) no succesful hypernym relation added (except for adjectives,
        i.e. identifiers ending in 'a')

    sy_id, the ili identifier, definition and synset_provenance are
    inserted as XML attribute values. They are treated as plain text and
    XML-escaped, so a definition may contain characters such as
    double quotes, '&' or '<'.

    @type  sy_id: str
    @param sy_id: synset identifier

    @type  synset_provenance: str
    @param synset_provenance: origin english synset: 'pwn', else 'odwn'

    @type  definition: str
    @param definition: definition

    @type  rels: list
    @param rels: list of tuples (reltype,target)

    @rtype: tuple
    @return: (succes,message)
    '''
    from xml.sax.saxutils import escape

    # The ili mapping is loaded on first use only.
    if not hasattr(self, 'ili_dict'):
        ili_nt_path = os.path.join(self.cwd, 'resources', 'ili.nt.gz')
        # 'with' guarantees the gzip handle is closed again.
        with gzip.GzipFile(ili_nt_path) as infile:
            self.set_ili_dict(infile)

    # get ili
    if sy_id not in self.ili_dict:
        return (False, 'no ili identifier found for %s' % sy_id)
    ili = self.ili_dict[sy_id]

    # check if sy_id already exists
    if sy_id in self.syn_ids:
        return (False, 'synset exists already: %s' % sy_id)

    template = (
        '<Synset id="{sy_id}" ili="{ili}"> '
        '<Definitions> '
        '<Definition gloss="{definition}" language="en" '
        'provenance="{synset_provenance}"/> '
        '</Definitions> '
        '<SynsetRelations/> '
        '<MonolingualExternalRefs/> '
        '</Synset>'
    )

    def as_attribute(value):
        # Escape &, <, > and the double quote that delimits the attribute.
        return escape(str(value), {'"': '&quot;'})

    base = template.format(
        sy_id=as_attribute(sy_id),
        ili=as_attribute(ili),
        definition=as_attribute(definition),
        synset_provenance=as_attribute(synset_provenance),
    )

    synset_el = etree.fromstring(base)
    sy_obj = Synset(synset_el, self.reltypes, self.syn_ids)

    added_hypernym_rel = False
    for reltype, target in rels:
        succes, _message = sy_obj.add_relation(reltype, target)
        if reltype == 'has_hyperonym' and succes:
            added_hypernym_rel = True

    # Adjectives (identifier ending in 'a') are accepted without a hypernym.
    if added_hypernym_rel or sy_id.endswith('a'):
        self.lexicon_el.append(sy_obj.synset_el)
        return (True, 'succes')
    return (False, 'no hypernym rel added')
def _load_from_xml(self, filename: str) -> None:
    """Rebuild the in-memory wordnet from an XML file.

    The current state is dropped with ``self._clean()``. The file is parsed
    with a UTF-8 XML parser and every child of the root element is read as
    one synset record whose sub-elements are handled by tag name (ID, POS,
    SYNONYM, STAMP, ILR, DEF, DOMAIN, SUMO, SENTIWN). Each finished synset
    is stored in ``self._synsets`` under its id; literals, relation types
    and relation edges go to ``self._literal2synset``,
    ``self._relation_types`` and ``self._graph``.

    Args:
        filename: path of the XML file to load.
    """
    self._clean()
    parser = et.XMLParser(encoding="utf-8")
    root = et.parse(filename, parser).getroot()
    for child in root:
        # Created when the ID element is met.
        # NOTE(review): every other tag dereferences `synset`, so ID is
        # presumably always the first sub-element - confirm with the format.
        synset = None
        for element in child:
            if element.tag == 'ID':
                synset = Synset(element.text)
            if element.tag == 'POS':
                # one-character code -> Synset.Pos member
                # (an unlisted code raises KeyError)
                dic_chr2pos = {
                    'n': Synset.Pos.NOUN,
                    'v': Synset.Pos.VERB,
                    'r': Synset.Pos.ADVERB,
                    'a': Synset.Pos.ADJECTIVE
                }
                pos = dic_chr2pos[element.text]
                synset.pos = pos
            if element.tag == 'SYNONYM':
                # The text of each child element is one literal. On a
                # TypeError only the synset id is printed and loading goes on.
                try:
                    synset.literals = [literal.text for literal in element]
                except TypeError as e:
                    print(synset.id)
                # The first child of each literal element supplies the
                # matching entry of literals_senses; missing text becomes "".
                literals_senses = []
                for literal in element:
                    literals_senses.append(
                        literal[0].text if literal[0].text is not None else "")
                synset.literals_senses = literals_senses
                # Literals containing '_' are split and every part that is
                # not yet a literal of the synset is added as well.
                # NOTE(review): add_literal is called while synset.literals
                # is being iterated - whether that is the same list depends
                # on Synset; confirm.
                for literal in synset.literals:
                    literal_parts = literal.split('_')
                    if len(literal_parts) > 1:
                        for literal_part in literal_parts:
                            if literal_part not in synset.literals:
                                synset.add_literal(literal_part)
                # index: literal -> ids of the synsets containing it
                for literal in synset.literals:
                    self._literal2synset[literal].append(synset.id)
            if element.tag == 'STAMP':
                synset.stamp = element.text
            if element.tag == 'ILR':
                # The first child's text is recorded as relation type and
                # used as edge label; element.text is the other end of the
                # edge (presumably the target synset id - confirm).
                self._relation_types.add(element[0].text)
                self._graph.add_edge(synset.id,
                                     element.text,
                                     label=element[0].text)
            if element.tag == 'DEF':
                synset.definition = element.text
            if element.tag == 'DOMAIN':
                synset.domain = element.text
            if element.tag == 'SUMO':
                synset.sumo = element.text
                # one-character code in the first child -> Synset.SumoType
                # member (an unlisted code raises KeyError)
                dic_chr2sumotype = {
                    '+': Synset.SumoType.HYPERNYM,
                    '=': Synset.SumoType.EQUIVALENT,
                    '@': Synset.SumoType.INSTANCE,
                    '[': Synset.SumoType.BRACKET,
                    ':': Synset.SumoType.POINTS
                }
                sumotype = dic_chr2sumotype[element[0].text]
                synset.sumotype = sumotype
            if element.tag == 'SENTIWN':
                # every sub-element's text is converted to a float
                synset.sentiwn = [
                    float(subelement.text) for subelement in element
                ]
        # register the finished synset under its id
        self._synsets[synset.id] = synset
def loader(pos=self.pos, offset=offset, dataFile=self.dataFile):
    """Read the line at ``offset`` of ``dataFile`` and wrap it in a Synset.

    The defaults are evaluated when the function is defined, so the loader
    can be called later without arguments. ``Synset`` is imported inside
    the function body.
    """
    from synset import Synset
    dataFile.seek(offset)
    return Synset(pos, offset, dataFile.readline())
def demo_create_and_edit_synsets():
    """Walk through creating and editing synsets and relations.

    The demo first edits a stand-alone Synset (pos, literals, senses,
    definition, sumo), then creates a RoWordNet, generates ids, adds
    synsets to it, edits the literals of an existing synset and finally
    adds and removes a relation. Every step is reported with print().
    """
    print("\n\nThis demo shows how to create and edit synsets & relations.\n"
          + "_" * 70)

    # Message reused after every change to the synset's literals.
    literal_count_msg = "\tNumber of literals for synset with id '{}': {}"

    # Create a synset from a hand-written id. It is recommended to obtain
    # the id from 'generate_synset_id' of the rowordnet class instead; see
    # the function 'demo_basic_rowordnet_operations' for more details.
    manual_id = "my_id"
    synset = Synset(manual_id)
    print("\n\tSynset with id '{}' has been created.".format(manual_id))

    # Print the synset.
    print("\n\tPrint this synset:")
    print(synset)

    # Make it a verb.
    synset.pos = Synset.Pos.VERB
    print("\tSynset's pos has been changed to '{}'".format(synset.pos))

    # Add one literal together with its sense.
    literal = "tigru"
    sense = "1"
    synset.add_literal(literal=literal, sense=sense)
    print(
        "\n\tA new literal '{}' with sense '{}' has been added to the synset with id '{}'"
        .format(literal, sense, synset.id))
    print(literal_count_msg.format(synset.id, len(synset.literals)))

    # Remove that literal again.
    literal = "tigru"
    synset.remove_literal(literal=literal)
    print("\n\tThe literal '{}' has been removed from the synset with id '{}'"
          .format(literal, synset.id))
    print(literal_count_msg.format(synset.id, len(synset.literals)))

    # Set several literals at once.
    print("\n\tAdding literals to a synset. Initially we create them:")
    literals = ['lup', 'vuple', 'caine']
    print("\tDirect addition of {} literals to synset with id '{}'".format(
        len(literals), synset.id))
    synset.literals = literals
    print(literal_count_msg.format(synset.id, len(synset.literals)))

    # Set several senses at once.
    print(
        "\n\tAdding senses to a synset's literals. Initially we create them:")
    literals_senses = ['1', '2', 'x']
    print("\tDirect addition of {} senses to synset with id '{}'".format(
        len(literals_senses), synset.id))
    synset.literals_senses = literals_senses
    print("\tNumber of senses for synset '{}': {}".format(
        synset.id, len(synset.literals_senses)))

    # Set a definition.
    synset.definition = "Animal carnivor"
    print("\tSynset's defition has been changed to '{}'".format(
        synset.definition))

    # Set a sumo.
    synset.sumo = "Animal"
    print("\tSynset's sumo has been changed to '{}'".format(synset.sumo))

    # Set a sumotype.
    synset.sumotype = Synset.SumoType.INSTANCE
    print("\tSynset's sumotype has been changed to '{}'".format(
        synset.sumotype))

    # Generate a new id with the default prefix and suffix.
    wordnet = rowordnet.RoWordNet()
    generated_id = wordnet.generate_synset_id()
    print(
        "\n\tNew id '{}' generated with default prefix 'ENG30-' and suffix '-n'"
        .format(generated_id))

    # Generate a new id with a custom prefix and suffix.
    prefix = 'ENG31-'
    suffix = '-v'
    custom_id = wordnet.generate_synset_id(prefix=prefix, suffix=suffix)
    print("\tNew id '{}' generated with prefix '{}' and suffix '{}'".format(
        custom_id, prefix, suffix))

    # Create a synset with the id generated with the default settings.
    synset = Synset(generated_id)
    print("\n\tSynset with id '{}' has been created".format(synset.id))

    # Add the synset to the rowordnet.
    wordnet.add_synset(synset)
    print("\n\tAdded synset with id '{}' to the rowordnet".format(synset.id))

    # Add a literal to the first synset listed by the rowordnet.
    literal = 'iepure'
    sense = '1'
    first_id = wordnet.synsets()[0]
    synset = wordnet(first_id)
    synset.add_literal(literal, sense)
    # Tell the rowordnet that the synset's literals have been changed. This
    # step is necessary for a correct internal representation.
    wordnet.reindex_literals()
    print(
        "\n\tAdded literal with literal '{}' and sense '{}' to the synset '{}'. "
        "Number of synsets containing literal '{}': {}".format(
            literal, sense, synset.id, literal,
            len(wordnet.synsets(literal))))

    # Remove the literal again; the rowordnet has to be told once more that
    # the synset's literals have been changed.
    synset.remove_literal(literal)
    wordnet.reindex_literals()
    print(
        "\tRemoved literal with literal '{}' from the synset '{}'. Number of synsets containing literal '{}': {}"
        .format(literal, synset.id, literal, len(wordnet.synsets(literal))))

    # Generate a new synset and add it to the rowordnet.
    prefix = 'ENG31-'
    suffix = '-n'
    new_synset = Synset(wordnet.generate_synset_id(prefix, suffix))
    wordnet.add_synset(new_synset)
    print("\n\tAdded new synset with id '{}' to the rowordnet".format(
        new_synset.id))

    # Add a relation of type 'hypernym' from 'synset' to 'new_synset'.
    relation = 'hypernym'
    wordnet.add_relation(synset.id, new_synset.id, relation)
    print(
        "\n\tAdded '{}' relation from synset with id '{}' to synset with id '{}'"
        .format(relation, synset.id, new_synset.id))

    # Remove the relation from 'synset' to 'new_synset' again.
    wordnet.remove_relation(synset.id, new_synset.id)
    print("\tRemoved relation from synset with id '{}' to synset with id '{}'"
          .format(synset.id, new_synset.id))