class SpellErrors(QMultiTerm):
    """Query that ignores common spelling errors of Arabic letters.

    Two words are considered the same term when they are equal after being
    normalized by a ``QArabicSymbolsFilter`` created with ``shaping=True``,
    ``tashkil=False``, ``spellerrors=True`` and ``hamza=True``.  According to
    the original description this covers confusions such as:

    - ta' marbuta and ha'
    - alef maqsura and ya'
    - hamza forms
    """

    def __init__(self, fieldname, text, boost=1.0):
        """Remember the query parameters and build the normalizing filter.

        :param fieldname: name of the indexed field whose terms are scanned.
        :param text: the word typed by the user.
        :param boost: boost factor stored on the query (default ``1.0``).
        """
        self.fieldname = fieldname
        self.text = text
        self.boost = boost
        # Starts with the query word itself; every matching indexed term is
        # appended by _compare().
        self.words = [text]
        self.ASF = QArabicSymbolsFilter(shaping=True, tashkil=False,
                                        spellerrors=True, hamza=True)

    def _words(self, ixreader):
        """Yield the terms of ``self.fieldname`` that match ``self.text``.

        ``ixreader.all_terms()`` is expected to yield ``(field, term)`` pairs;
        terms belonging to other fields are skipped.
        """
        for field, indexed_text in ixreader.all_terms():
            if field == self.fieldname and self._compare(self.text, indexed_text):
                yield indexed_text

    def _compare(self, first, second):
        """Return whether ``first`` and ``second`` are equal once normalized.

        Side effect: when they match, ``second`` is appended to
        ``self.words``.
        """
        # NOTE: a leftover debug ``print first`` (triggered for one hard-coded
        # prefix) was removed here; it polluted stdout while terms were being
        # expanded and could fail on consoles unable to encode Arabic text.
        eqiv = (self.ASF.normalize_all(first)
                == self.ASF.normalize_all(second))
        if eqiv:
            self.words.append(second)
        return eqiv
class SpellErrors(QMultiTerm):
    """Query that ignores the spelling errors of Arabic letters.

    An indexed term matches the query word when both are equal after
    normalization with a ``QArabicSymbolsFilter`` configured with
    ``shaping=True``, ``tashkil=False``, ``spellerrors=True`` and
    ``hamza=True``.  Per the original description, the tolerated confusions
    are:

    - ta' marbuta and ha'
    - alef maqsura and ya'
    - hamza forms
    """

    def __init__(self, fieldname, text, boost=1.0):
        """Store the query parameters and create the normalizing filter.

        :param fieldname: name of the indexed field to look terms up in.
        :param text: the query word.
        :param boost: boost factor kept on the query (default ``1.0``).
        """
        self.fieldname = fieldname
        self.text = text
        self.boost = boost
        # Seeded with the query word; _compare() appends each matching term.
        self.words = [text]
        self.ASF = QArabicSymbolsFilter(shaping=True, tashkil=False,
                                        spellerrors=True, hamza=True)

    def _words(self, ixreader):
        """Generate the indexed terms of ``self.fieldname`` matching the query.

        Iterates over the ``(field, term)`` pairs produced by
        ``ixreader.all_terms()`` and yields every term of the right field
        for which :meth:`_compare` returns true.
        """
        for field, indexed_text in ixreader.all_terms():
            if field != self.fieldname:
                continue
            if self._compare(self.text, indexed_text):
                yield indexed_text

    def _compare(self, first, second):
        """Normalize both words and report whether they are equal.

        When they are, ``second`` is also recorded in ``self.words``.
        """
        # The debug statement that printed ``first`` for one hard-coded
        # two-letter prefix has been dropped: it wrote to stdout from inside
        # query expansion and could raise on non-UTF-8 consoles.
        eqiv = (self.ASF.normalize_all(first)
                == self.ASF.normalize_all(second))
        if eqiv:
            self.words.append(second)
        return eqiv
def __init__(self, fieldname, text, boost=1.0):
    """Store the query parameters and the normalized form of each word.

    ``text`` is iterated item by item; every item goes through
    ``normalize_all`` of a ``QArabicSymbolsFilter`` created with
    ``tashkil=True`` and the other options switched off.  The results are
    kept, in the same order, in ``self.words``.

    :param fieldname: name of the field the query applies to.
    :param text: iterable of words.
    :param boost: boost factor stored on the query (default ``1.0``).
    """
    self.fieldname, self.text, self.boost = fieldname, text, boost

    normalizer = QArabicSymbolsFilter(
        shaping=False,
        tashkil=True,
        spellerrors=False,
        hamza=False,
    )

    normalized_words = []
    for item in text:
        normalized_words.append(normalizer.normalize_all(item))
    self.words = normalized_words
def __init__(self, fieldname, text, boost=1.0):
    """Keep the query parameters and pre-compute the normalized words.

    Each element of ``text`` is normalized with a ``QArabicSymbolsFilter``
    whose only enabled option is ``tashkil``; the normalized elements are
    stored as the list ``self.words``.

    :param fieldname: name of the field the query applies to.
    :param text: iterable of words.
    :param boost: boost factor stored on the query (default ``1.0``).
    """
    self.fieldname = fieldname
    self.text = text
    self.boost = boost

    filter_options = dict(shaping=False, tashkil=True,
                          spellerrors=False, hamza=False)
    symbols_filter = QArabicSymbolsFilter(**filter_options)
    self.words = list(map(symbols_filter.normalize_all, text))
# coding: utf-8 """ This is a test module for alfanous.TextProcessing """ from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_ if __name__ == "__main__": ASF = QArabicSymbolsFilter() TEXT = u"عاصِمٌ" TEXT = ASF.normalize_all(TEXT) print TEXT WORD1 = unicode_(u"عَاصِمُ") WORD2 = unicode_(u"عَاصمُ") LIST_HARAKAT1 = WORD1.list_harakat() LIST_HARAKAT2 = WORD2.list_harakat() WORD3 = unicode_(u"فاعل") PHRASE = unicode_(u"كانَ") print WORD3.apply_harakat_list(LIST_HARAKAT1) print LIST_HARAKAT1, "\n", LIST_HARAKAT2 print unicode_.compare_harakat(LIST_HARAKAT1, LIST_HARAKAT2) print WORD1.shakl_compare(WORD1, WORD2) for i in PHRASE.tokenize_shakl(): print i, WORD4 = unicode_(u"عاصم") WORD5 = unicode_(u"عاصِم") print WORD4 == WORD5
# coding: utf-8 """ This is a test module for alfanous.TextProcessing """ from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_ if __name__ == "__main__": ASF = QArabicSymbolsFilter() TEXT = u"عاصِمٌ" TEXT = ASF.normalize_all( TEXT ) print TEXT WORD1 = unicode_( u"عَاصِمُ" ) WORD2 = unicode_( u"عَاصمُ" ) LIST_HARAKAT1 = WORD1.list_harakat() LIST_HARAKAT2 = WORD2.list_harakat() WORD3 = unicode_( u"فاعل" ) PHRASE = unicode_( u"كانَ" ) print WORD3.apply_harakat_list( LIST_HARAKAT1 ) print LIST_HARAKAT1, "\n", LIST_HARAKAT2 print unicode_.compare_harakat( LIST_HARAKAT1, LIST_HARAKAT2 ) print WORD1.shakl_compare( WORD1, WORD2 ) for i in PHRASE.tokenize_shakl(): print i, WORD4 = unicode_( u"عاصم" ) WORD5 = unicode_( u"عاصِم" )
def __init__( self, QC_PATH = "../../store/quranic-corpus-morpology.xml", DB = "main.db" ): """ make word table """ import sqlite3 print "connecting to database ...", maindb = sqlite3.connect( DB ) cur = maindb.cursor() print "OK" print "creating tables:" cur.execute( """ drop table if exists wordQC""" ) cur.execute( """ create table if not exists wordQC( gid int unique, word_gid int, word_id int, aya_id int, sura_id int, word varchar(25), normalised varchar(25), spelled varchar(25), 'order' int, token varchar(25), arabictoken varchar(25), prefixes varchar(25), suffixes varchar(25), pos varchar(25), type varchar(25), arabicpos varchar(25), mood varchar(25), arabicmood varchar(25), 'case' varchar(25), arabiccase varchar(25), root varchar(25), arabicroot varchar(25), lemma varchar(25), arabiclemma varchar(25), special varchar(25), arabicspecial varchar(25), derivation varchar(25), form varchar(25), gender varchar(25), person varchar(25), number varchar(25), voice varchar(25), state varchar(25), aspect varchar(25), primary key(gid) ) """ ) print ">wordQC table ... 
OK" print ">loading Qurany Corpus...", from PyCorpus.QuranyCorpus import API as QC A = QC( source = QC_PATH ) print ".OK\n" IFEXIST = lambda d, attrib: d[attrib].encode( "utf-8" ) if attrib in d else "" gid, word_gid = 0, 0 print ">inserting values of gid...", for iteration in A.all_words_generator(): QASF = QArabicSymbolsFilter( shaping = True, tashkil = True, spellerrors = False, hamza = False, uthmani_symbols = True ) QASF_spelled = QArabicSymbolsFilter( shaping = True, tashkil = True, spellerrors = True, hamza = True, uthmani_symbols = True ) QUERY = lambda d, glob: """insert into wordQC(gid,word_gid,word_id,aya_id,sura_id,'order',token,arabictoken,prefixes, suffixes,type,pos,arabicpos,mood, arabicmood, 'case', arabiccase, root ,arabicroot, lemma ,arabiclemma, special, arabicspecial, word,normalised,spelled, derivation, form ,gender, person, number,voice, state, aspect) values ("%(gid)d","%(word_gid)d","%(word_id)d","%(aya_id)d","%(sura_id)d","%(order)d","%(token)s","%(arabictoken)s", "%(prefixes)s", "%(suffixes)s", "%(type)s","%(pos)s","%(arabicpos)s","%(mood)s","%(arabicmood)s", "%(case)s","%(arabiccase)s","%(root)s","%(arabicroot)s","%(lemma)s","%(arabiclemma)s","%(special)s","%(arabicspecial)s","%(word)s","%(normalised)s","%(spelled)s", "%(derivation)s","%(form)s","%(gender)s","%(person)s","%(number)s","%(voice)s","%(state)s","%(aspect)s")""" % { "gid":gid, "word_gid":word_gid, "word_id":iteration["word_id"], "aya_id":iteration["aya_id"], "sura_id":iteration["sura_id"], "order":order, "token":IFEXIST( d, "token" ), "arabictoken":IFEXIST( d, "arabictoken" ), "prefixes":";".join([prefix["arabictoken"] for prefix in glob["prefixes"] ]).encode( "utf-8" ), "suffixes":";".join([suffix["arabictoken"] for suffix in glob["suffixes"] ]).encode( "utf-8" ), "type":IFEXIST( d, "type" ), "pos":IFEXIST( d, "pos" ), "arabicpos":IFEXIST( d, "arabicpos" ), "mood":IFEXIST( d, "mood" ), "arabicmood":IFEXIST( d, "arabicmood" ), "case":IFEXIST( d, "case" ), "arabiccase":IFEXIST( 
d, "arabiccase" ), "root":IFEXIST( d, "root" ), "arabicroot":IFEXIST( d, "arabicroot" ), "lemma":IFEXIST( d, "lemma" ), "arabiclemma":IFEXIST( d, "arabiclemma" ), "special":IFEXIST( d, "special" ), "arabicspecial":IFEXIST( d, "arabicspecial" ), "word":iteration["word"].encode( "utf-8" ), "normalised": QASF.normalize_all( iteration["word"] ).encode( "utf-8" ), "spelled": QASF_spelled.normalize_all( iteration["word"] ).encode( "utf-8" ), "derivation":IFEXIST( d, "derivation" ), "form":IFEXIST( d, "form" ), "gender":IFEXIST( d, "gender" ), "person":IFEXIST( d, "person" ), "number":IFEXIST( d, "number" ), "voice":IFEXIST( d, "voice" ), "state":IFEXIST( d, "state" ), "aspect":IFEXIST( d, "aspect" ) } word_gid += 1 if word_gid % 1000 == 0: print word_gid, print("\n") order = 0 for d in iteration["morphology"]["base"]: gid += 1 order += 1 cur.execute( QUERY( d, iteration["morphology"] ) ) print("OK") maindb.commit()