Ejemplo n.º 1
0
def hazmtoalpheios(word,uri):
    """Analyse Persian text with hazm and return Alpheios-style analyses.

    The input is normalized, split into sentences and then into tokens.
    One analysis dict is produced per token, carrying the token text, its
    stem (used both as headword and as inflection stem) and, when
    available, its part of speech.

    :param word: the text to analyse (one or more words/sentences)
    :type word: str
    :param uri: a URI identifying the text; copied into every analysis
    :type uri: str
    :return: one analysis dict per token, in token order; an empty list
        when the input contains no tokens
    :rtype: list
    """
    data = Normalizer().normalize(word)

    # list.append returns None and would also nest the per-sentence lists;
    # extend keeps one flat list of tokens across all sentences.
    words = []
    for sentence in sent_tokenize(data):
        words.extend(word_tokenize(sentence))
    if not words:
        # Nothing to analyse: skip loading the POS model altogether.
        return []

    # These objects do not depend on the token, so build them once instead
    # of reloading the tagger model on every iteration.
    stemmer = Stemmer()
    tagger = POSTagger(model=os.path.join(model_path,"postagger.model"))

    analyses = []
    for item in words:
        wordstem = stemmer.stem(item)
        # The tagger expects a sequence of tokens; a bare string would be
        # iterated character by character.
        wordtagged = tagger.tag([item])
        # maptohazm is expected to return a (text, order) pair or a falsy
        # value when the tag has no mapping.
        wordpofs = maptohazm(wordtagged[0][1])

        pofs = {}
        if wordpofs:
            pofs['order'] = str(wordpofs[1])
            pofs['text'] = wordpofs[0]

        # a better way to do this would be to create a Python class
        # to formalize the abstraction
        entry = {
            'dict': {'hdwd': {'lang': 'per', 'text': wordstem}},
            'infls': [
                {
                    'stem': {'text': wordstem, 'lang': 'per'},
                    'pofs': pofs,
                },
            ],
        }
        analyses.append({
            'engine': 'hazm',
            'uri': uri,
            'form': {'text': item, 'lang': 'per'},
            'entries': [entry],
        })
    return analyses
Ejemplo n.º 2
0
 def lookup(self,
            word=None,
            word_uri=None,
            language=None,
            request_args=None,
            **kwargs):
     """ Look up a single word and return its morphological analysis.

     The word is normalized, stemmed and POS-tagged; the tag of the first
     token is mapped through ``self.maptohazm`` and the resulting analysis
     is serialized by ``self.toalpheiosxml``.

     :param word: the word to lookup
     :type word: str
     :param word_uri: a uri for the word (not used by this engine)
     :type word_uri: str
     :param language: the language code for the word (not used here)
     :type language: str
     :param request_args: dict of engine specific request arguments
         (not used here)
     :type request_args: dict
     :return: the analysis, as produced by ``self.toalpheiosxml``
     :rtype: str
     """
     normalized = Normalizer().normalize(word)
     stem_text = Stemmer().stem(normalized)

     # Only the tag of the first token is considered.
     tagged = self.tagger.tag(word_tokenize(normalized))
     pofs_info = self.maptohazm(tagged[0][1])

     # pofs stays empty when the tag could not be mapped.
     pofs = {}
     if pofs_info:
         pofs = {'order': str(pofs_info[1]), 'text': pofs_info[0]}

     inflection = {
         'stem': {'text': stem_text, 'lang': 'per'},
         'pofs': pofs,
     }
     entry = {
         'dict': {'hdwd': {'lang': 'per', 'text': stem_text}},
         'infls': [inflection],
     }
     return self.toalpheiosxml([{'entries': [entry]}])
Ejemplo n.º 3
0
def hazmtoalpheios(word,uri):
    """Analyse a single Persian word with hazm, Alpheios-style.

    The word is normalized, stemmed and POS-tagged. The stem is used both
    as the dictionary headword and as the inflection stem; the part of
    speech comes from the tag of the first token.

    :param word: the word to analyse
    :type word: str
    :param uri: a URI identifying the word; copied into the analysis
    :type uri: str
    :return: a list holding one analysis dict, or an empty list when the
        normalized input contains no tokens
    :rtype: list
    """
    item = Normalizer().normalize(word)
    tokens = word_tokenize(item)
    if not tokens:
        # Without a token there is no tag to read; report "no analyses"
        # instead of failing with an IndexError on the tagger output.
        return []

    wordstem = Stemmer().stem(item)
    tagger = POSTagger(model=os.path.join(model_path,"postagger.model"))
    # maptohazm is expected to return a (text, order) pair or a falsy
    # value when the tag has no mapping.
    wordpofs = maptohazm(tagger.tag(tokens)[0][1])

    pofs = {}
    if wordpofs:
        pofs['order'] = str(wordpofs[1])
        pofs['text'] = wordpofs[0]

    # a better way to do this would be to create a Python class
    # to formalize the abstraction
    entry = {
        'dict': {'hdwd': {'lang': 'per', 'text': wordstem}},
        'infls': [
            {
                'stem': {'text': wordstem, 'lang': 'per'},
                'pofs': pofs,
            },
        ],
    }
    analysis = {
        'engine': 'hazm',
        'uri': uri,
        'form': {'text': item, 'lang': 'per'},
        'entries': [entry],
    }
    return [analysis]
Ejemplo n.º 4
0
def hazmtoalpheiosfile(data,uri):
    """Build an RDF/OA annotation holding hazm morphology for a text.

    The returned tree has an ``oa:Annotation`` with a ``hasBody`` element
    (containing a ``words`` list, one ``word`` per token), a ``hasTarget``
    description pointing at ``uri`` and a title.

    :param data: the Persian text to analyse
    :type data: str
    :param uri: a URI identifying the text
    :type uri: str
    :return: the root ``rdf:RDF`` element
    """
    rdf = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
    oa = '{http://www.w3.org/ns/oa#}'
    dcterms = '{http://purl.org/dc/terms/}'
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'

    root = etree.Element(rdf + 'RDF')
    oaannotation = etree.SubElement(root, oa + 'Annotation', {rdf + 'about': 'http://services.projectbamboo.org/morphology' + uri})
    oahasbody = etree.SubElement(oaannotation, oa + 'hasBody')
    oahastarget = etree.SubElement(oaannotation, oa + 'hasTarget')
    hasbodydesc = etree.SubElement(oahastarget, rdf + 'Description', {rdf + 'about': uri})
    etree.SubElement(hasbodydesc, dcterms + 'isPartOf', {rdf + 'about': uri})
    etree.SubElement(hasbodydesc, dcterms + 'source', {rdf + 'resource': uri})
    title = etree.SubElement(oaannotation, '{http://purl.org/dc/elements/1.1/}title', {xml_lang: 'eng'})
    title.text = "Morphology of " + uri

    # SubElement needs a parent; calling it with only a tag raises
    # TypeError. NOTE(review): the annotation body is assumed to be the
    # intended parent of the word list -- confirm against the consumer.
    wordslist = etree.SubElement(oahasbody, "words")

    normalized = Normalizer().normalize(data)
    # list.append returns None and would nest the per-sentence lists;
    # extend keeps one flat list of tokens across all sentences.
    words = []
    for sentence in sent_tokenize(normalized):
        words.extend(word_tokenize(sentence))
    if not words:
        # No tokens: return the annotation with an empty word list and
        # skip loading the POS model.
        return root

    # Token-independent objects are built once, not once per token.
    stemmer = Stemmer()
    tagger = POSTagger(model=os.path.join(model_path,"postagger.model"))

    for item in words:
        wordstem = stemmer.stem(item)
        # The tagger expects a sequence of tokens; a bare string would be
        # iterated character by character.
        wordpofs = tagger.tag([item])[0][1]

        word = etree.SubElement(wordslist, 'word')
        form = etree.SubElement(word, 'form', {xml_lang: 'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        # Element name corrected from the misspelled 'inlf'.
        infl = etree.SubElement(entry, 'infl')
        term = etree.SubElement(infl, 'term', {xml_lang: 'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        pofs.text = wordpofs
    return root
Ejemplo n.º 5
0
def hazmtoalpheiosfile(data,uri):
    """Build an RDF/OA annotation holding hazm morphology for a text.

    The returned tree has an ``oa:Annotation`` with a ``hasBody`` element
    (containing a ``words`` list, one ``word`` per token), a ``hasTarget``
    description pointing at ``uri`` and a title.

    :param data: the Persian text to analyse
    :type data: str
    :param uri: a URI identifying the text
    :type uri: str
    :return: the root ``rdf:RDF`` element
    """
    rdf = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
    oa = '{http://www.w3.org/ns/oa#}'
    dcterms = '{http://purl.org/dc/terms/}'
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'

    root = etree.Element(rdf + 'RDF')
    oaannotation = etree.SubElement(root, oa + 'Annotation', {rdf + 'about': 'http://services.projectbamboo.org/morphology' + uri})
    oahasbody = etree.SubElement(oaannotation, oa + 'hasBody')
    oahastarget = etree.SubElement(oaannotation, oa + 'hasTarget')
    hasbodydesc = etree.SubElement(oahastarget, rdf + 'Description', {rdf + 'about': uri})
    etree.SubElement(hasbodydesc, dcterms + 'isPartOf', {rdf + 'about': uri})
    etree.SubElement(hasbodydesc, dcterms + 'source', {rdf + 'resource': uri})
    title = etree.SubElement(oaannotation, '{http://purl.org/dc/elements/1.1/}title', {xml_lang: 'eng'})
    title.text = "Morphology of " + uri

    # SubElement needs a parent; calling it with only a tag raises
    # TypeError. NOTE(review): the annotation body is assumed to be the
    # intended parent of the word list -- confirm against the consumer.
    wordslist = etree.SubElement(oahasbody, "words")

    normalized = Normalizer().normalize(data)
    # list.append returns None and would nest the per-sentence lists;
    # extend keeps one flat list of tokens across all sentences.
    words = []
    for sentence in sent_tokenize(normalized):
        words.extend(word_tokenize(sentence))
    if not words:
        # No tokens: return the annotation with an empty word list and
        # skip loading the POS model.
        return root

    # Token-independent objects are built once, not once per token.
    stemmer = Stemmer()
    tagger = POSTagger(model=os.path.join(model_path,"postagger.model"))

    for item in words:
        wordstem = stemmer.stem(item)
        # The tagger expects a sequence of tokens; a bare string would be
        # iterated character by character.
        wordpofs = tagger.tag([item])[0][1]

        word = etree.SubElement(wordslist, 'word')
        form = etree.SubElement(word, 'form', {xml_lang: 'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        # Element name corrected from the misspelled 'inlf'.
        infl = etree.SubElement(entry, 'infl')
        term = etree.SubElement(infl, 'term', {xml_lang: 'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        pofs.text = wordpofs
    return root