def load_lexunits(germanet, tree):
    """
    Walk the XML tree and register its synsets and lexical units.

    Every child of the root is treated as one synset node. A Synset is
    built from its attributes and stored in ``germanet.synsets``; each
    LEXUNIT sub-node becomes a Lexunit that is stored in
    ``germanet.lexunits``, indexed by word category and word class, and
    attached to its synset. Finally all lexunits sharing a synset are
    linked to each other through the ``has_synonym`` relation.

    :param germanet: the germanet object
    :param tree: XML tree
    """
    for synset_node in tree.getroot():
        attrs = synset_node.attrib
        syn_id = attrs[SYNID]
        category = get_attribute_element(attrs, WORDCATEGORY, WordCategory)
        word_class = get_attribute_element(attrs, WORDCLASS, WordClass)
        synset = Synset(syn_id, category, word_class)
        germanet.synsets[synset.id] = synset

        # Only LEXUNIT sub-nodes describe lexical units; skip everything else.
        lexunit_nodes = (node for node in synset_node if node.tag == LEXUNIT)
        for node in lexunit_nodes:
            lexunit = create_lexunit(germanet, node.attrib, node, synset)
            germanet.lexunits[lexunit.id] = lexunit
            germanet.wordcat2lexid[category.name].add(lexunit.id)
            germanet.wordclass2lexid[word_class.name].add(lexunit.id)
            synset.add_lexunit(lexunit)

        # Members of one synset are synonyms of each other (but not of
        # themselves).
        members = synset.lexunits
        for unit in members:
            for other in members:
                if other is unit:
                    continue
                unit.relations[LexRel.has_synonym].add(other)
Beispiel #2
0
    def __init__(self):
        """Set up the handler state used while parsing a WNXML document."""
        ContentHandler.__init__(self)

        # Dummy setDocumentLocator does the same!
        locator = Locator()
        self._locator = locator
        self.setDocumentLocator(locator)

        # input line number
        self.m_lcnt = 0
        # contains the XML path to the current node (names of the ancestors)
        self.m_ppath = []
        # -1: not started synset yet, 0: inside synset, 1: done with synset
        self.m_done = -1
        # points to the output struct
        self.m_syns = Synset()
        # points to the output struct
        self.m_syns_list = []

        # Temp vars for Tuples (std::pair in C++): one pair per link kind.
        self.m_ilrs0_temp = self.m_ilrs1_temp = ''
        self.m_sumolinks0_temp = self.m_sumolinks1_temp = ''
        self.m_elrs0_temp = self.m_elrs1_temp = ''
        self.m_elrs30_temp = self.m_elrs31_temp = ''
        self.m_ekszlinks0_temp = self.m_ekszlinks1_temp = ''
        self.m_vframelinks0_temp = self.m_vframelinks1_temp = ''

        # was there a starting root tag? / was there an end root tag?
        self.m_startroot = False
        self.m_endroot = False
 def synsets_get_generator(self):
     """
     Lazily wrap every synset XML element in a Synset object.

     The elements are looked up with
     self.doc.iterfind(self.path_to_synset_els); each one is passed to
     Synset together with self.reltypes and self.syn_ids.

     @rtype: generator
     @return: generator of Synset objects, one per synset XML element
     """
     synset_elements = self.doc.iterfind(self.path_to_synset_els)
     for element in synset_elements:
         wrapped = Synset(element, self.reltypes, self.syn_ids)
         yield wrapped
class Link:
    """ A class that represents two linked synsets

    Args:
        synset1id (int): the synsetid of the first synset in wn_db
        synset2id (int): the synsetid of the second synset in wn_db
        linktyp (str): what kind of link it is
        wn_db (sqlite cursor): a sqlite cursor that can access WordNet information
        with_gloss (bool): forwarded to both Synset constructors as
            ``get_glosss``; presumably controls whether the glosses are
            loaded (confirm against Synset)

    Attributes:
        linktyp (str): what kind of link it is
        synset1 (Synset): obj of the first synset
        synset2 (Synset): obj of the second synset
    """
    def __init__(self, synset1id, synset2id, linktyp, wn_db, with_gloss=False):
        self.synset1 = Synset(synset1id, wn_db, get_glosss=with_gloss)
        self.synset2 = Synset(synset2id, wn_db, get_glosss=with_gloss)
        self.linktyp = linktyp

    def __repr__(self):
        """Return ``<repr of synset1>--><repr of synset2>``."""
        return repr(self.synset1) + "-->" + repr(self.synset2)

    # The string form is deliberately the same as the repr.
    __str__ = __repr__

    @staticmethod
    def _ascii_only(text):
        """Return ``text`` with every non-ASCII character dropped."""
        return text.encode('ascii', 'ignore').decode('ascii')

    def dif_vector(self, word_vectors):
        """Return the vector of synset1 minus the vector of synset2.

        Returns None unless both synsets report ``is_in(word_vectors)``.
        """
        both_present = (self.synset1.is_in(word_vectors)
                        and self.synset2.is_in(word_vectors))
        if not both_present:
            return None
        return (self.synset1.vectorize1(word_vectors)
                - self.synset2.vectorize1(word_vectors))

    def as_list(self):
        """Return the link as a flat list.

        Layout: ``[id1, id2, pos1, pos2, words1, words2, gloss1, gloss2]``
        where the words of each synset are joined with a single space and
        all text fields are stripped of non-ASCII characters.

        NOTE(review): ``gloss`` must be a string on both synsets; confirm it
        is populated when the link was created with ``with_gloss=False``.
        """
        first, second = self.synset1, self.synset2
        return [
            first.synsetid,
            second.synsetid,
            first.pos,
            second.pos,
            self._ascii_only(" ".join(first.words)),
            self._ascii_only(" ".join(second.words)),
            self._ascii_only(first.gloss),
            self._ascii_only(second.gloss),
        ]
 def __init__(self, synset1id, synset2id, linktyp, wn_db, with_gloss=False):
     """Create the two Synset objects of the link and store the link type.

     ``with_gloss`` is forwarded to both Synset constructors as
     ``get_glosss``.
     """
     gloss_option = {'get_glosss': with_gloss}
     self.synset1 = Synset(synset1id, wn_db, **gloss_option)
     self.synset2 = Synset(synset2id, wn_db, **gloss_option)
     self.linktyp = linktyp
Beispiel #6
0
    def endElement(self, name):
        """Handle a closing tag.

        * ``WNXML``: remember that the root end tag was seen.
        * ``SYNSET``: store the finished synset together with the current
          line counter and start a fresh one; raises WNXMLParserException
          if no synset was open.
        * ``ILR``, ``SUMO``, ``ELR``, ``ELR3``, ``EKSZ``, ``VFRAME`` directly
          below ``SYNSET``: append the collected pair to the matching list
          of the current synset and clear the two temp fields.

        In every case the last entry of the XML path is removed at the end.
        """
        if DEBUG:
            locator = self._locator
            print('(', locator.getLineNumber(), ', ',
                  locator.getColumnNumber(), '): /',
                  '/'.join(self.m_ppath), '/END: ', name,
                  sep='')

        parent = self.m_ppath[-2] if len(self.m_ppath) >= 2 else ''

        # tag name -> (list on the synset, first temp field, second temp field)
        pair_targets = {
            'ILR': ('ilrs', 'm_ilrs0_temp', 'm_ilrs1_temp'),
            'SUMO': ('sumolinks', 'm_sumolinks0_temp', 'm_sumolinks1_temp'),
            'ELR': ('elrs', 'm_elrs0_temp', 'm_elrs1_temp'),
            'ELR3': ('elrs3', 'm_elrs30_temp', 'm_elrs31_temp'),
            'EKSZ': ('ekszlinks', 'm_ekszlinks0_temp', 'm_ekszlinks1_temp'),
            'VFRAME': ('vframelinks', 'm_vframelinks0_temp',
                       'm_vframelinks1_temp'),
        }

        if name == 'WNXML':
            self.m_endroot = True

        elif name == 'SYNSET':
            if self.m_done != 0:
                raise WNXMLParserException(
                    'This is impossible!\nThe parser should\'ve caught this error:'
                    ' \'SYNSET\' end tag without previous begin tag')
            self.m_done = 1
            self.m_syns_list.append((self.m_syns, self.m_lcnt))
            self.m_syns = Synset()

        elif parent == 'SYNSET' and name in pair_targets:
            list_attr, first_attr, second_attr = pair_targets[name]
            getattr(self.m_syns, list_attr).append(
                (getattr(self, first_attr), getattr(self, second_attr)))
            setattr(self, first_attr, '')
            setattr(self, second_attr, '')

        self.m_ppath.pop()
    def synsets_add_synset(self,
                           sy_id,
                           synset_provenance,
                           definition,
                           rels):
        '''
        synset is added if it has a hypernym relation to an existing 
        synset.
        
        WARNING not added if:
        (1) sy_id already exists
        (2) no succesful hypernym relation added (except for adjectives)
        
        @type  sy_id: str
        @param sy_id: synset identifier
        
        @type  synset_provenance: str
        @param synset_provenance: origin english synset: 'pwn', else 'odwn'
        
        @type  definition: str
        @param definition: definition
        
        @type  rels: list
        @param rels: list of tuples (reltype,target)
        
        @return: tuple
        @returun: (succes,message)
        '''    
        if not hasattr(self, 'ili_dict'):
            ili_nt_path = os.path.join(self.cwd,'resources','ili.nt.gz')  
            infile = gzip.GzipFile(ili_nt_path)
            self.set_ili_dict(infile)
        
        #get ili
        if sy_id not in self.ili_dict:
            return (False,'no ili identifier found for %s' % sy_id)
        
        ili = self.ili_dict[sy_id] 
        
        #check if sy_id already exists
        if sy_id in self.syn_ids:
            return (False,'synset exists already: %s' % sy_id)
        
        added_hypernym_rel = False
        
        base = '''<Synset id="{sy_id}" ili="{ili}">
<Definitions>
    <Definition gloss="{definition}" language="en" provenance="{synset_provenance}"/>
</Definitions>
<SynsetRelations/>
<MonolingualExternalRefs/>
</Synset>'''.format(**locals())
        synset_el = etree.fromstring(base)
        
        sy_obj = Synset(synset_el,self.reltypes,self.syn_ids)
        
        for reltype,target in rels:
            succes,message = sy_obj.add_relation(reltype,target)
            if all([reltype == 'has_hyperonym',
                    succes]):
                added_hypernym_rel = True
                
        if any([added_hypernym_rel,
                sy_id.endswith('a')]):
            self.lexicon_el.append(sy_obj.synset_el)
            return (True,'succes')
        else:
            return (False,'no hypernym rel added')
Beispiel #8
0
    def _load_from_xml(self, filename: str):
        """Replace the current contents with the synsets of an XML file.

        The file is parsed as UTF-8. Every child of the root describes one
        synset; its sub-elements (ID, POS, SYNONYM, STAMP, ILR, DEF, DOMAIN,
        SUMO, SENTIWN) fill the Synset, the literal index, the set of
        relation types and the relation graph. An ID element must precede
        the other elements of a synset, because it creates the Synset the
        others write to.

        :param filename: path of the XML file to load
        :raises KeyError: on an unknown POS or SUMO type character
        """
        self._clean()

        # The lookup tables do not depend on the element, so build them once
        # instead of once per POS/SUMO element.
        pos_by_char = {
            'n': Synset.Pos.NOUN,
            'v': Synset.Pos.VERB,
            'r': Synset.Pos.ADVERB,
            'a': Synset.Pos.ADJECTIVE,
        }
        sumotype_by_char = {
            '+': Synset.SumoType.HYPERNYM,
            '=': Synset.SumoType.EQUIVALENT,
            '@': Synset.SumoType.INSTANCE,
            '[': Synset.SumoType.BRACKET,
            ':': Synset.SumoType.POINTS,
        }

        parser = et.XMLParser(encoding="utf-8")
        root = et.parse(filename, parser).getroot()

        for child in root:
            synset = None

            for element in child:
                tag = element.tag

                if tag == 'ID':
                    synset = Synset(element.text)

                elif tag == 'POS':
                    synset.pos = pos_by_char[element.text]

                elif tag == 'SYNONYM':
                    try:
                        synset.literals = [literal.text for literal in element]
                    except TypeError:
                        # Best effort: report the offending synset and go on.
                        # NOTE(review): presumably raised by the literals
                        # setter for a literal without text - confirm.
                        print(synset.id)

                    # The first child of each literal carries its sense; a
                    # missing text is stored as the empty string.
                    synset.literals_senses = [
                        literal[0].text if literal[0].text is not None else ""
                        for literal in element
                    ]

                    # Also register the parts of multi-word literals
                    # ('a_b' -> 'a', 'b'). Iterate over a snapshot, because
                    # add_literal() may extend the list being walked.
                    for literal in list(synset.literals):
                        literal_parts = literal.split('_')
                        if len(literal_parts) > 1:
                            for literal_part in literal_parts:
                                if literal_part not in synset.literals:
                                    synset.add_literal(literal_part)

                    for literal in synset.literals:
                        self._literal2synset[literal].append(synset.id)

                elif tag == 'STAMP':
                    synset.stamp = element.text

                elif tag == 'ILR':
                    # Element text is the target synset id, the first child
                    # holds the relation type.
                    relation_type = element[0].text
                    self._relation_types.add(relation_type)
                    self._graph.add_edge(synset.id,
                                         element.text,
                                         label=relation_type)

                elif tag == 'DEF':
                    synset.definition = element.text

                elif tag == 'DOMAIN':
                    synset.domain = element.text

                elif tag == 'SUMO':
                    synset.sumo = element.text
                    synset.sumotype = sumotype_by_char[element[0].text]

                elif tag == 'SENTIWN':
                    synset.sentiwn = [
                        float(subelement.text) for subelement in element
                    ]

            self._synsets[synset.id] = synset
Beispiel #9
0
 def loader(pos=self.pos, offset=offset, dataFile=self.dataFile):
     """Read the line at ``offset`` in ``dataFile`` and build its Synset.

     The defaults capture the values of the enclosing scope at definition
     time, so the loader can be called later without arguments.
     """
     from synset import Synset
     dataFile.seek(offset)
     return Synset(pos, offset, dataFile.readline())
Beispiel #10
0
def demo_create_and_edit_synsets():
    """Demonstrate how to create and edit synsets and relations.

    The demo builds a stand-alone Synset, edits its fields (pos, literals,
    senses, definition, sumo, sumotype), then creates a RoWordNet instance
    to generate ids, add synsets, edit the literals of an existing synset
    and add/remove a relation. Every step is reported on stdout.
    """
    print("\n\nThis demo shows how to create and edit synsets & relations.\n" +
          "_" * 70)

    # Create a synset. It is recommended to obtain the id from the function
    # 'generate_synset_id' of the rowordnet class; see the function
    # 'demo_basic_rowordnet_operations' for more details.
    custom_id = "my_id"
    synset = Synset(custom_id)
    print("\n\tSynset with id '{}' has been created.".format(custom_id))

    # printing the synset
    print("\n\tPrint this synset:")
    print(synset)

    # set a pos of type verb
    pos = Synset.Pos.VERB
    synset.pos = pos
    print("\tSynset's pos has been changed to '{}'".format(synset.pos))

    # add a literal
    literal = "tigru"
    sense = "1"
    synset.add_literal(literal=literal, sense=sense)
    print(
        "\n\tA new literal '{}' with sense '{}' has been added to the synset with id '{}'"
        .format(literal, sense, synset.id))
    print("\tNumber of literals for synset with id '{}': {}".format(
        synset.id, len(synset.literals)))

    # remove the literal that was just added
    synset.remove_literal(literal=literal)
    print("\n\tThe literal '{}' has been removed from the synset with id '{}'".
          format(literal, synset.id))
    print("\tNumber of literals for synset with id '{}': {}".format(
        synset.id, len(synset.literals)))

    # add more literals at once
    print("\n\tAdding literals to a synset. Initially we create them:")
    literals = ['lup', 'vuple', 'caine']
    print("\tDirect addition of {} literals to synset with id '{}'".format(
        len(literals), synset.id))
    synset.literals = literals
    print("\tNumber of literals for synset with id '{}': {}".format(
        synset.id, len(synset.literals)))

    # add more senses at once
    print(
        "\n\tAdding senses to a synset's literals. Initially we create them:")
    literals_senses = ['1', '2', 'x']
    print("\tDirect addition of {} senses to synset with id '{}'".format(
        len(literals_senses), synset.id))
    synset.literals_senses = literals_senses
    print("\tNumber of senses for synset '{}': {}".format(
        synset.id, len(synset.literals_senses)))

    # set a definition
    definition = "Animal carnivor"
    synset.definition = definition
    print("\tSynset's defition has been changed to '{}'".format(
        synset.definition))

    # set a sumo
    sumo = "Animal"
    synset.sumo = sumo
    print("\tSynset's sumo has been changed to '{}'".format(synset.sumo))

    # set a sumotype
    sumotype = Synset.SumoType.INSTANCE
    synset.sumotype = sumotype
    print("\tSynset's sumotype has been changed to '{}'".format(
        synset.sumotype))

    # generate a new id with default prefix and suffix
    wn = rowordnet.RoWordNet()
    generated_id = wn.generate_synset_id()
    print(
        "\n\tNew id '{}' generated with default prefix 'ENG30-' and suffix '-n'"
        .format(generated_id))
    # generate a new id with custom prefix and suffix
    prefix = 'ENG31-'
    suffix = '-v'
    new_id = wn.generate_synset_id(prefix=prefix, suffix=suffix)
    print("\tNew id '{}' generated with prefix '{}' and suffix '{}'".format(
        new_id, prefix, suffix))

    # create a synset with the id generated with the default prefix/suffix
    synset = Synset(generated_id)
    print("\n\tSynset with id '{}' has been created".format(synset.id))
    # add the synset to the rowordnet
    wn.add_synset(synset)
    print("\n\tAdded synset with id '{}' to the rowordnet".format(synset.id))

    # add a literal to synset
    literal = 'iepure'
    sense = '1'
    # get a synset
    synset_id = wn.synsets()[0]
    synset = wn(synset_id)
    # add a literal to the synset
    synset.add_literal(literal, sense)
    # tell the rowordnet that synsets's literals have been changed. This step is
    # necessary for a correct internal representation.
    wn.reindex_literals()
    print(
        "\n\tAdded literal with literal '{}' and sense '{}' to the synset '{}'. "
        "Number of synsets containing literal '{}': {}".format(
            literal, sense, synset.id, literal, len(wn.synsets(literal))))

    # remove the previous literal from synset.
    synset.remove_literal(literal)
    # again, we have to tell the rowordnet that synset's literals have been
    # changed.
    wn.reindex_literals()
    print(
        "\tRemoved literal with literal '{}' from the synset '{}'. Number of synsets containing literal '{}': {}"
        .format(literal, synset.id, literal, len(wn.synsets(literal))))

    # generate a new synset
    prefix = 'ENG31-'
    suffix = '-n'
    new_id = wn.generate_synset_id(prefix, suffix)
    new_synset = Synset(new_id)
    wn.add_synset(new_synset)
    print("\n\tAdded new synset with id '{}' to the rowordnet".format(
        new_synset.id))

    # add a relation of type 'hypernym' from 'synset' to 'new_synset'
    relation = 'hypernym'
    wn.add_relation(synset.id, new_synset.id, relation)
    print(
        "\n\tAdded '{}' relation from synset with id '{}' to synset with id '{}'"
        .format(relation, synset.id, new_synset.id))

    # remove relation of type 'hypernym' from 'synset' to 'new_synset'
    wn.remove_relation(synset.id, new_synset.id)
    print("\tRemoved relation from synset with id '{}' to synset with id '{}'".
          format(synset.id, new_synset.id))