def hazmtoalpheios(word, uri):
    """Analyse Persian text with hazm and return Alpheios-style analyses.

    The input is normalized, split into sentences and then into tokens;
    one analysis dict is produced per token.

    NOTE: this file defines ``hazmtoalpheios`` a second time further down
    (a single-word variant).  If both live in the same scope, the later
    definition replaces this one when the module is loaded.

    Args:
        word: the text to analyse (may contain several words/sentences).
        uri: identifier stored verbatim under the ``'uri'`` key of every
            analysis.

    Returns:
        A list of dicts, one per token, each shaped like::

            {'engine': 'hazm', 'uri': uri,
             'form': {'text': token, 'lang': 'per'},
             'entries': [{'dict': {'hdwd': {'lang': 'per', 'text': stem}},
                          'infls': [{'stem': {'text': stem, 'lang': 'per'},
                                     'pofs': {...}}]}]}

        ``pofs`` holds ``'order'`` and ``'text'`` only when ``maptohazm``
        returns a truthy value for the tag.  An input without tokens
        yields an empty list.
    """
    normalizer = Normalizer()
    data = normalizer.normalize(word)

    # Flatten every sentence into one token list.  ``extend`` is required
    # here: ``list.append`` returns None, so assigning its result would
    # discard the tokens collected so far.
    words = []
    for sentence in sent_tokenize(data):
        words.extend(word_tokenize(sentence))

    if not words:
        # Nothing to analyse; skip loading the tagger model.
        return []

    # These helpers do not depend on the token, so build them once
    # instead of once per token.
    stemmer = Stemmer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))

    analyses = []
    for item in words:
        # NOTE: the headword is the hazm stem; no lemma is computed
        # because nothing in the output consumes one.
        wordstem = stemmer.stem(item)

        # The tagger is given a list of tokens (as the single-word
        # variant of this function does); a bare string would be
        # iterated character by character.
        wordtagged = tagger.tag([item])
        wordpofs = maptohazm(wordtagged[0][1])

        # TODO: a better way to do this would be to create a Python class
        # to formalize the abstraction.
        infl = {
            'stem': {'text': wordstem, 'lang': 'per'},
            'pofs': {},
        }
        if wordpofs:
            # wordpofs is indexed as (text, order).
            infl['pofs']['order'] = str(wordpofs[1])
            infl['pofs']['text'] = wordpofs[0]

        entry = {
            'dict': {'hdwd': {'lang': 'per', 'text': wordstem}},
            'infls': [infl],
        }
        analyses.append({
            'engine': 'hazm',
            'uri': uri,
            'form': {'text': item, 'lang': 'per'},
            'entries': [entry],
        })
    return analyses
def hazmtoalpheios(word, uri):
    """Analyse a single Persian word with hazm, Alpheios-style.

    The word is normalized, stemmed and part-of-speech tagged, and the
    result is wrapped in the nested dict layout shown below.

    NOTE: an earlier definition with the same name appears above this
    one in the file; if both live in the same scope, this later
    definition is the one that takes effect.

    Args:
        word: the word to analyse.
        uri: identifier stored verbatim under the ``'uri'`` key.

    Returns:
        A list holding one analysis dict::

            {'engine': 'hazm', 'uri': uri,
             'form': {'text': normalized_word, 'lang': 'per'},
             'entries': [{'dict': {'hdwd': {'lang': 'per', 'text': stem}},
                          'infls': [{'stem': {'text': stem, 'lang': 'per'},
                                     'pofs': {...}}]}]}

        ``pofs`` holds ``'order'`` and ``'text'`` only when ``maptohazm``
        returns a truthy value for the tag.  If the normalized word
        yields no tokens, an empty list is returned.
    """
    normalizer = Normalizer()
    item = normalizer.normalize(word)

    tokens = word_tokenize(item)
    if not tokens:
        # Nothing to tag; indexing the tagger output below would
        # otherwise raise IndexError.
        return []

    # NOTE: the headword is the hazm stem; no lemma is computed because
    # nothing in the output consumes one.
    stemmer = Stemmer()
    wordstem = stemmer.stem(item)

    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))
    wordtagged = tagger.tag(tokens)
    # Only the tag of the first token is used.
    wordpofs = maptohazm(wordtagged[0][1])

    # TODO: a better way to do this would be to create a Python class
    # to formalize the abstraction.
    infl = {
        'stem': {'text': wordstem, 'lang': 'per'},
        'pofs': {},
    }
    if wordpofs:
        # wordpofs is indexed as (text, order).
        infl['pofs']['order'] = str(wordpofs[1])
        infl['pofs']['text'] = wordpofs[0]

    entry = {
        'dict': {'hdwd': {'lang': 'per', 'text': wordstem}},
        'infls': [infl],
    }
    analysis = {
        'engine': 'hazm',
        'uri': uri,
        'form': {'text': item, 'lang': 'per'},
        'entries': [entry],
    }
    return [analysis]
def hazmtoalpheiosfile(data, uri):
    """Build an RDF / Open Annotation XML tree describing the words of a text.

    The tree has an ``rdf:RDF`` root with one ``oa:Annotation`` carrying
    ``oa:hasBody``, ``oa:hasTarget`` (with an ``rdf:Description`` of
    ``uri``) and a ``dc:title``.  The text is normalized and tokenized
    with hazm, and each token contributes one ``word`` element::

        <word>
          <form xml:lang="per">token</form>
          <entry>
            <infl>
              <term xml:lang="per"><stem>stem</stem></term>
              <pofs>tag</pofs>
            </infl>
          </entry>
        </word>

    Args:
        data: the text to analyse.
        uri: identifier of the analysed resource; used in the annotation's
            ``rdf:about``, the target description and the title.

    Returns:
        The root ``rdf:RDF`` element.
    """
    rdf = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
    oa = '{http://www.w3.org/ns/oa#}'
    dcterms = '{http://purl.org/dc/terms/}'
    dc = '{http://purl.org/dc/elements/1.1/}'
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'

    # --- annotation envelope -------------------------------------------
    root = etree.Element(rdf + 'RDF')
    oaannotation = etree.SubElement(
        root, oa + 'Annotation',
        {rdf + 'about': 'http://services.projectbamboo.org/morphology' + uri})
    oahasbody = etree.SubElement(oaannotation, oa + 'hasBody')
    oahastarget = etree.SubElement(oaannotation, oa + 'hasTarget')
    hasbodydesc = etree.SubElement(
        oahastarget, rdf + 'Description', {rdf + 'about': uri})
    etree.SubElement(hasbodydesc, dcterms + 'isPartOf', {rdf + 'about': uri})
    etree.SubElement(hasbodydesc, dcterms + 'source', {rdf + 'resource': uri})
    title = etree.SubElement(oaannotation, dc + 'title', {xml_lang: 'eng'})
    title.text = "Morphology of " + uri

    # SubElement needs a parent element as its first argument.
    # NOTE(review): the word list is attached under oa:hasBody, which is
    # otherwise empty -- confirm this is the intended parent.
    wordslist = etree.SubElement(oahasbody, 'words')

    # --- tokenization ---------------------------------------------------
    normalizer = Normalizer()
    data = normalizer.normalize(data)

    # ``extend`` rather than ``append``: list.append returns None, so
    # assigning its result would discard the tokens collected so far.
    words = []
    for sentence in sent_tokenize(data):
        words.extend(word_tokenize(sentence))

    if not words:
        # No tokens: return the envelope without loading the tagger model.
        return root

    # Token-independent helpers are built once, not once per token.
    stemmer = Stemmer()
    tagger = POSTagger(model=os.path.join(model_path, "postagger.model"))

    # --- one <word> element per token -----------------------------------
    for item in words:
        wordstem = stemmer.stem(item)
        # The tagger is given a list of tokens; a bare string would be
        # iterated character by character.
        wordtagged = tagger.tag([item])
        wordpofs = wordtagged[0][1]

        word = etree.SubElement(wordslist, 'word')
        form = etree.SubElement(word, 'form', {xml_lang: 'per'})
        form.text = item
        entry = etree.SubElement(word, 'entry')
        infl = etree.SubElement(entry, 'infl')
        term = etree.SubElement(infl, 'term', {xml_lang: 'per'})
        stem = etree.SubElement(term, 'stem')
        stem.text = wordstem
        pofs = etree.SubElement(infl, 'pofs')
        # The raw tagger tag is emitted as-is (no mapping applied here).
        pofs.text = wordpofs
    return root