Example #1
    class SpellErrors( QMultiTerm ):
        """
        query that ignores the spelling errors of Arabic letters:
            - ta' marbuta and ha'
            - alef maqsura and ya'
            - hamza forms
        """


        def __init__( self, fieldname, text, boost = 1.0 ):
            self.fieldname = fieldname
            self.text = text
            self.boost = boost
            self.words = [text]
            self.ASF = QArabicSymbolsFilter( shaping = True, tashkil = False, spellerrors = True, hamza = True )


        def _words( self, ixreader ):
            for field, indexed_text in ixreader.all_terms():
                if field == self.fieldname:
                    if self._compare( self.text, indexed_text ):
                        yield indexed_text

        def _compare( self, first, second ):
            """ normalize and compare """
            if first[:2] == u"مو": print first  # debug trace: dump terms starting with "مو"
            eqiv = ( self.ASF.normalize_all( first ) == self.ASF.normalize_all( second ) )
            if eqiv:
                self.words.append( second )
            return eqiv
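The class above plugs into a Whoosh-style query tree: _words scans every indexed term of the field and yields those whose normalized form equals the normalized query text, so spelling variants of the same word all match, and each match is remembered in self.words. A minimal usage sketch, assuming QMultiTerm wraps whoosh.query.MultiTerm and that a Whoosh index with an "aya" field exists at the illustrative path below:

    # Hypothetical usage -- the index path and field name are assumptions.
    from whoosh.index import open_dir

    ix = open_dir("indexes/main")
    q = SpellErrors("aya", u"صلاة")  # should also match variants like صلاه
    with ix.searcher() as searcher:
        for hit in searcher.search(q, limit=10):
            print hit["aya"]
    print q.words  # the query text plus every indexed variant that matched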
Example #2
    class SpellErrors(QMultiTerm):
        """
        query that ignores the spelling errors of Arabic letters such as:
            - ta' marbuta and ha'
            - alef maqsura and ya'
            - hamza forms
        """
        def __init__(self, fieldname, text, boost=1.0):
            self.fieldname = fieldname
            self.text = text
            self.boost = boost
            self.words = [text]
            self.ASF = QArabicSymbolsFilter(shaping=True,
                                            tashkil=False,
                                            spellerrors=True,
                                            hamza=True)

        def _words(self, ixreader):
            for field, indexed_text in ixreader.all_terms():
                if field == self.fieldname:
                    if self._compare(self.text, indexed_text):
                        yield indexed_text

        def _compare(self, first, second):
            """ normalize and compare """
            if first[:2] == u"مو": print first  # debug trace: dump terms starting with "مو"
            eqiv = (self.ASF.normalize_all(first) == self.ASF.normalize_all(
                second))
            if eqiv:
                self.words.append(second)
            return eqiv
Example #3
 def __init__(self, fieldname, text, boost=1.0):
     self.fieldname = fieldname
     self.text = text
     self.boost = boost
     ASF = QArabicSymbolsFilter(shaping=False,
                                tashkil=True,
                                spellerrors=False,
                                hamza=False)
     self.words = [ASF.normalize_all(word) for word in text]
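This variant normalizes only the tashkil (diacritics), so a query built from it matches any vocalization of the same letter skeleton. A rough sketch of the effect, assuming the filter semantics demonstrated in the test modules near the end of this section (outputs are expected values, not verified):

    # Sketch: tashkil-only normalization collapses vocalized variants.
    ASF = QArabicSymbolsFilter(shaping=False, tashkil=True,
                               spellerrors=False, hamza=False)
    print ASF.normalize_all(u"عَاصِمُ")  # expected: عاصم (harakat stripped)
    print ASF.normalize_all(u"عاصِم")   # expected: عاصم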
Example #4
 def __init__(self, fieldname, text, boost=1.0):
     self.fieldname = fieldname
     self.text = text
     self.boost = boost
     self.words = [text]
     self.ASF = QArabicSymbolsFilter(shaping=True,
                                     tashkil=False,
                                     spellerrors=True,
                                     hamza=True)
Example #5
 def __init__(self, fieldname, text, boost=1.0):
     self.fieldname = fieldname
     self.text = text
     self.boost = boost
     ASF = QArabicSymbolsFilter(shaping=False,
                                tashkil=True,
                                spellerrors=False,
                                hamza=False)
     self.words = [ASF.normalize_all(word) for word in text]
Example #6
    def transfer_vocalizations(self):
        """ load indexed vocalized words from the main index and save them in a dynamic Python module """
        QSE = QuranicSearchEngine(self.__ixpath)

        if QSE.OK:
            mfw = QSE.most_frequent_words(9999999, "aya_")
        else:
            mfw = []

        V = QArabicSymbolsFilter(shaping=False,
                                 tashkil=True,
                                 spellerrors=False,
                                 hamza=False).normalize_all

        vocalization_dict = {}
        for w in mfw:
            word = w[1]
            if vocalization_dict.has_key(V(word)):
                vocalization_dict[V(word)].append(word)
            else:
                vocalization_dict[V(word)] = [word]

        raw_str = self.dheader + u"\nvocalization_dict=" + str(
            vocalization_dict).replace(",", ",\n")

        fich = open(self.__dypypath + "vocalizations_dyn.py", "w+")
        fich.write(raw_str)
        fich.close()

        return raw_str
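The generated vocalizations_dyn.py is a plain Python module holding a single global: a dict that maps each unvocalized skeleton to the vocalized forms found in the index. A sketch of its shape (the entries are illustrative, not actual output):

    # vocalizations_dyn.py (generated) -- illustrative content
    vocalization_dict = {
        u"عاصم": [u"عَاصِم", u"عَاصِمٌ"],
        u"كتب": [u"كَتَبَ", u"كُتُبٌ"],
    }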
Example #7
    def make_spellerrors_dict(self):
        """ make the spell errors dictionary
        @deprecated: forget this!
        """

        D = QseDocIndex()
        R = QReader(D)
        nor = QArabicSymbolsFilter(True, True, True, True).normalize_all
        spell_err = {}
        for term in R.reader.all_terms():
            if term[0] in ["aya"]:
                normalized = nor(term[1])
                if spell_err.has_key(normalized):
                    spell_err[normalized].append(term[1])
                else:
                    spell_err[normalized] = [term[1]]

        print "\n".join([
            unicode(key) + u":" + ",".join(value)
            for key, value in spell_err.items()
        ])

        raw_str = self.dheader + u"\nspell_err=" + str(spell_err)

        fich = open(self.__dypypath + "spellerrors_dyn.py", "w+")
        fich.write(raw_str)
        fich.close()
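Despite the @deprecated note, the generated spellerrors_dyn.py follows the same pattern: a dict keyed by the fully normalized form, listing the spelling variants found in the aya field. An illustrative sketch of its shape:

    # spellerrors_dyn.py (generated) -- illustrative content
    spell_err = {
        u"صلاه": [u"صلاة", u"صلاه"],  # ta' marbuta vs. ha'
        u"موسي": [u"موسى", u"موسي"],  # alef maqsura vs. ya'
    }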
Example #8
    def __init__(self, fieldname, text, boost=1.0):
        self.fieldname = fieldname
        self.text = text
        self.boost = boost
        self.words = [text]
        self.ASF = QArabicSymbolsFilter(shaping=True,
                                        tashkil=False,
                                        spellerrors=True,
                                        hamza=True)
Example #9
	def _search_word( self, flags ):
		"""
		return the results of word search as a dictionary data structure
		"""
		#flags
		query = flags["query"] if flags.has_key( "query" ) \
				else self._defaults["flags"]["query"]
		sortedby = flags["sortedby"] if flags.has_key( "sortedby" ) \
				   else self._defaults["flags"]["sortedby"]
		range = int( flags["perpage"] ) if  flags.has_key( "perpage" )  \
				else flags["range"] if flags.has_key( "range" ) \
									else self._defaults["flags"]["range"]
		## offset = (page-1) * perpage  --  paging mode
		offset = ( ( int( flags["page"] ) - 1 ) * range ) + 1 if flags.has_key( "page" ) \
				 else int( flags["offset"] ) if flags.has_key( "offset" ) \
					  else self._defaults["flags"]["offset"]
		romanization = flags["romanization"] if flags.has_key( "romanization" ) \
					  else self._defaults["flags"]["romanization"]
		highlight = flags["highlight"] if flags.has_key( "highlight" ) \
					else self._defaults["flags"]["highlight"]
		script = flags["script"] if flags.has_key( "script" ) \
				 else self._defaults["flags"]["script"]
		vocalized = TRUE_FALSE( flags["vocalized"] ) if flags.has_key( "vocalized" ) \
					else self._defaults["flags"]["vocalized"]
		view = flags["view"] if flags.has_key( "view" ) \
				else self._defaults["flags"]["view"]

		# pre-defined views
		if view == "minimal":
			vocalized = False
		elif view == "normal":
			pass
		elif view == "full":
			romanization = "iso"
		elif view == "statistic":
			pass
		elif view == "linguistic":
			romanization = "buckwalter"
		elif view == "recitation":
			script = "uthmani"
		else: # if view == custom or undefined
			pass

		#preprocess query
		query = query.replace( "\\", "" )
		if not isinstance( query, unicode ):
			query = unicode( query , 'utf8' )

		if ":" not in query:
			query = unicode( transliterate( "buckwalter", query, ignore = "'_\"%*?#~[]{}:>+-|" ) )


		#Search
		SE = self.WSE
		res, termz = SE.search_all( query  , self._defaults["results_limit"]["word"], sortedby = sortedby )
		terms = [term[1] for term in list( termz )[:self._defaults["maxkeywords"]]]

		#pagination
		offset = 1 if offset < 1 else offset
		range = self._defaults["minrange"] if range < self._defaults["minrange"] else range
		range = self._defaults["maxrange"] if range > self._defaults["maxrange"] else range
		interval_end = offset + range - 1
		end = interval_end if interval_end < len( res ) else len( res )
		start = offset if offset <= len( res ) else -1
		reslist = [] if end == 0 or start == -1 else list( res )[start - 1:end]
		output = {}


		## strip vocalization when vocalized = false
		V = QArabicSymbolsFilter( \
								shaping = False, \
								tashkil = not vocalized, \
								spellerrors = False, \
								hamza = False \
								).normalize_all
		# highlight function that handles None values and undefined fields
		H = lambda X:  SE.highlight( X, terms, highlight ) if highlight != "none" and X else X if X else u"-----"
		# Numbers are 0 if not defined
		N = lambda X:X if X else 0
		# parse keyword lists, used for Sura names
		kword = re.compile( u"[^,،]+" )
		keywords = lambda phrase: kword.findall( phrase )
		#####################################################
		extend_runtime = res.runtime
		# Words & Annotations
		words_output = {"individual":{}}
		if True:
			matches = 0
			docs = 0
			cpt = 1
			for term in termz :
				if True: #term[0] == "normalized" or term[0] == "word":
					if term[2]:
						matches += term[2]
					docs += term[3]
					words_output[ "individual" ][ cpt ] = {
															 "field": term[0],
															 "word":term[1],
															 "romanization": transliterate( romanization, term[1], ignore = "" , reverse = True ) if romanization in self.DOMAINS["romanization"] else None,
															 "nb_matches":term[2],
															 "nb_docs":term[3],
														 }
					cpt += 1
			words_output["global"] = {"nb_words":cpt - 1, "nb_matches":matches}
		output["keywords"] = words_output

		output["runtime"] = round( extend_runtime, 5 )
		output["interval"] = {
							"start":start,
							"end":end,
							"total": len( res ),
							"page": ( ( start - 1 ) / range ) + 1,
							"nb_pages": ( ( len( res ) - 1 ) / range ) + 1
							}

		### Words
		cpt = start - 1
		output["words"] = {}
		for r in reslist :
			cpt += 1
			output["words"][ cpt ] = {

					  "identifier": {
									 "gid":r["gid"],
									 "word_gid": r["word_gid"],
									 "aya_id":r["aya_id"],
									 "sura_id":r["sura_id"],
									 "word_id":r["word_id"],
									},

		              "word":{
		              		"text":  H( V( r["word"] ) ),
		                	"part": r["part"],
		                	"part_order": r["order"],
		                	"token": r["arabictoken"],
		                	"POS": {
										  	"english": r["pos"],
										  	"arabic": r["arabicpos"],
									},
							"mood": {
										"english": r["mood"],
										"arabic": r["arabicmood"],
									},
							"case": {
										"english": r["case"],
										"arabic": r["arabiccase"],
									},
							"root": {
										#"english": r["root"],
										"arabic": r["arabicroot"],
									},
							"lemma": {
										#"english": r["lemma"],
										"arabic": r["arabiclemma"],
									},

							"special": {
										#"english": r["special"],
										"arabic": r["arabicspecial"],
									},
							"derivation": r["derivation"],
							"form": r["form"],
							"gender": r["gender"],
							"person": r["person"],
							"number": r["number"],
							"voice": r["voice"],
							"state": r["state"],
							"aspect": r["aspect"],
		              },
		    		}
		return output
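_search_word takes a flat flags dict (values typically arrive as strings from the web layer) and returns nested dicts. A hypothetical call, assuming an initialized wrapper instance named engine:

    # Hypothetical call -- engine and the flag values are assumptions.
    flags = {
        "query": u"رحمة",
        "perpage": "10",
        "page": "1",
        "view": "linguistic",  # per the code above, selects Buckwalter romanization
    }
    out = engine._search_word(flags)
    print out["interval"]["total"], "matching word occurrences"
    for idx, w in out["words"].items():
        print idx, w["word"]["text"], w["word"]["POS"]["english"]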
Example #10
	def _search_aya( self, flags ):
		"""
		return the results of aya search as a dictionary data structure
		"""
		#flags
		query = flags["query"] if flags.has_key( "query" ) \
				else self._defaults["flags"]["query"]
		sortedby = flags["sortedby"] if flags.has_key( "sortedby" ) \
				   else self._defaults["flags"]["sortedby"]
		range = int( flags["perpage"] ) if  flags.has_key( "perpage" )  \
				else flags["range"] if flags.has_key( "range" ) \
									else self._defaults["flags"]["range"]
		## offset = (page-1) * perpage  --  paging mode
		offset = ( ( int( flags["page"] ) - 1 ) * range ) + 1 if flags.has_key( "page" ) \
				 else int( flags["offset"] ) if flags.has_key( "offset" ) \
					  else self._defaults["flags"]["offset"]
		recitation = flags["recitation"] if flags.has_key( "recitation" ) \
					 else self._defaults["flags"]["recitation"]
		translation = flags["translation"] if flags.has_key( "translation" ) \
					  else self._defaults["flags"]["translation"]
		romanization = flags["romanization"] if flags.has_key( "romanization" ) \
					  else self._defaults["flags"]["romanization"]
		highlight = flags["highlight"] if flags.has_key( "highlight" ) \
					else self._defaults["flags"]["highlight"]
		script = flags["script"] if flags.has_key( "script" ) \
				 else self._defaults["flags"]["script"]
		vocalized = TRUE_FALSE( flags["vocalized"] ) if flags.has_key( "vocalized" ) \
					else self._defaults["flags"]["vocalized"]
		fuzzy = TRUE_FALSE( flags["fuzzy"] ) if flags.has_key( "fuzzy" ) \
				else self._defaults["flags"]["fuzzy"]
		view = flags["view"] if flags.has_key( "view" ) \
				else self._defaults["flags"]["view"]

		# pre-defined views
		if view == "minimal":
			#fuzzy = True
			#page = 25
			vocalized = False
			recitation = None
			translation = None
			prev_aya = next_aya = False
			sura_info = False
			word_info = False
			word_synonyms = False
			word_derivations = False
			word_vocalizations = False
			aya_position_info = aya_theme_info = aya_sajda_info = False
			aya_stat_info = False
			sura_stat_info = False
			annotation_aya = annotation_word = False
		elif view == "normal":
			prev_aya = next_aya = False
			sura_info = True
			word_info = True
			word_synonyms = False
			word_derivations = True
			word_vocalizations = True
			aya_position_info = aya_theme_info = aya_sajda_info = True
			aya_stat_info = True
			sura_stat_info = False
			annotation_aya = annotation_word = False
		elif view == "full":
			prev_aya = next_aya = True
			sura_info = True
			word_info = True
			word_synonyms = True
			word_derivations = True
			word_vocalizations = True
			aya_position_info = aya_theme_info = aya_sajda_info = True
			aya_stat_info = sura_stat_info = True
			annotation_aya = annotation_word = False
			romanization = "iso"
		elif view == "statistic":
			prev_aya = next_aya = False
			sura_info = True
			word_info = True
			word_synonyms = False
			word_derivations = True
			word_vocalizations = True
			aya_position_info = True
			aya_theme_info = aya_sajda_info = False
			aya_stat_info = True
			sura_stat_info = True
			annotation_aya = False
			annotation_word = False
		elif view == "linguistic":
			prev_aya = next_aya = False
			sura_info = False
			word_info = True
			word_synonyms = True
			word_derivations = True
			word_vocalizations = True
			aya_position_info = False
			aya_theme_info = aya_sajda_info = True
			aya_stat_info = False
			sura_stat_info = False
			annotation_aya = False
			annotation_word = False
			romanization = "buckwalter"
		elif view == "recitation":
			script = "uthmani"
			prev_aya = next_aya = True
			sura_info = True
			word_info = False
			word_synonyms = False
			word_derivations = False
			word_vocalizations = False
			aya_position_info = True
			aya_theme_info = False
			aya_sajda_info = True
			aya_stat_info = False
			sura_stat_info = False
			annotation_aya = False
			annotation_word = False
		else: # if view == custom or undefined
			prev_aya = TRUE_FALSE( flags["prev_aya"] ) if flags.has_key( "prev_aya" ) \
						else self._defaults["flags"]["prev_aya"]
			next_aya = TRUE_FALSE( flags["next_aya"] ) if flags.has_key( "next_aya" ) \
						else self._defaults["flags"]["next_aya"]
			sura_info = TRUE_FALSE( flags["sura_info"] ) if flags.has_key( "sura_info" ) \
						else self._defaults["flags"]["sura_info"]
			sura_stat_info = TRUE_FALSE( flags["sura_stat_info"] ) if flags.has_key( "sura_stat_info" ) \
						else self._defaults["flags"]["sura_stat_info"]
			word_info = TRUE_FALSE( flags["word_info"] ) if flags.has_key( "word_info" ) \
						else self._defaults["flags"]["word_info"]
			word_synonyms = TRUE_FALSE( flags["word_synonyms"] ) if flags.has_key( "word_synonyms" ) \
						else self._defaults["flags"]["word_synonyms"]
			word_derivations = TRUE_FALSE( flags["word_derivations"] ) if flags.has_key( "word_derivations" ) \
						else self._defaults["flags"]["word_derivations"]
			word_vocalizations = TRUE_FALSE( flags["word_vocalizations"] ) if flags.has_key( "word_vocalizations" ) \
						else self._defaults["flags"]["word_vocalizations"]

			aya_position_info = TRUE_FALSE( flags["aya_position_info"] ) if flags.has_key( "aya_position_info" ) \
								else self._defaults["flags"]["aya_position_info"]
			aya_theme_info = TRUE_FALSE( flags["aya_theme_info"] ) if flags.has_key( "aya_theme_info" ) \
							 else self._defaults["flags"]["aya_theme_info"]
			aya_stat_info = TRUE_FALSE( flags["aya_stat_info"] ) if flags.has_key( "aya_stat_info" ) \
							else self._defaults["flags"]["aya_stat_info"]
			aya_sajda_info = TRUE_FALSE( flags["aya_sajda_info"] ) if flags.has_key( "aya_sajda_info" ) \
							 else self._defaults["flags"]["aya_sajda_info"]
			annotation_aya = TRUE_FALSE( flags["annotation_aya"] ) if flags.has_key( "annotation_aya" ) \
							 else self._defaults["flags"]["annotation_aya"]
			annotation_word = TRUE_FALSE( flags["annotation_word"] ) if flags.has_key( "annotation_word" ) \
							 else self._defaults["flags"]["annotation_word"]

		#print query
		#preprocess query
		query = query.replace( "\\", "" )
		if not isinstance( query, unicode ):
			query = unicode( query , 'utf8' )

		if ":" not in query:
			query = unicode( transliterate( "buckwalter", query, ignore = "'_\"%*?#~[]{}:>+-|" ) )


		#Search
		SE = self.FQSE if fuzzy else self.QSE
		res, termz = SE.search_all( query  , self._defaults["results_limit"]["aya"], sortedby = sortedby )
		terms = [term[1] for term in list( termz )[:self._defaults["maxkeywords"]]]
		terms_uthmani = map( STANDARD2UTHMANI, terms )
		#pagination
		offset = 1 if offset < 1 else offset
		range = self._defaults["minrange"] if range < self._defaults["minrange"] else range
		range = self._defaults["maxrange"] if range > self._defaults["maxrange"] else range
		interval_end = offset + range - 1
		end = interval_end if interval_end < len( res ) else len( res )
		start = offset if offset <= len( res ) else -1
		reslist = [] if end == 0 or start == -1 else list( res )[start - 1:end]
		output = {}

		## disable annotations for aya words if there is more than one result
		if annotation_aya and len ( res ) > 1:
			annotation_aya = False

		## strip vocalization when vocalized = false
		V = QArabicSymbolsFilter( \
								shaping = False, \
								tashkil = not vocalized, \
								spellerrors = False, \
								hamza = False \
								).normalize_all
		strip_vocalization = QArabicSymbolsFilter( \
								shaping = False, \
								tashkil = True, \
								spellerrors = False, \
								hamza = False \
								).normalize_all
		# highlight function that handles None values and undefined fields
		H = lambda X:  self.QSE.highlight( X, terms, highlight ) if highlight != "none" and X else X if X else u"-----"
		# Numbers are 0 if not defined
		N = lambda X:X if X else 0
		# parse keyword lists, used for Sura names
		kword = re.compile( u"[^,،]+" )
		keywords = lambda phrase: kword.findall( phrase )
		##########################################
		extend_runtime = res.runtime
		# Words & Annotations
		words_output = {"individual":{}}
		if word_info:
			matches = 0
			docs = 0
			nb_vocalizations_globale = 0
			cpt = 1
			annotation_word_query = u"( 0 "
			for term in termz :
				if term[0] == "aya" or term[0] == "aya_":
					if term[2]:
						matches += term[2]
					docs += term[3]
					if term[0] == "aya_":
						annotation_word_query += u" OR word:%s " % term[1]
					else: #if aya
						annotation_word_query += u" OR normalized:%s " % STANDARD2UTHMANI( term[1] )
					if word_vocalizations:
						vocalizations = vocalization_dict[ strip_vocalization( term[1] ) ] if vocalization_dict.has_key( strip_vocalization( term[1] ) ) \
										   else []
						nb_vocalizations_globale += len( vocalizations )
					if word_synonyms:
						synonyms = syndict[term[1]] if syndict.has_key( term[1] ) \
										   else []
					if word_derivations:
						lemma = LOCATE( derivedict["word_"], derivedict["lemma"], term[1] )
						root = LOCATE( derivedict["word_"], derivedict["root"], term[1] )
						if lemma:  # if different of none
							derivations = FILTER_DOUBLES( FIND( derivedict["lemma"], derivedict["word_"], lemma ) )
						else:
							derivations = []

					words_output[ "individual" ][ cpt ] = {
															 "word":term[1],
															 "romanization": transliterate( romanization, term[1], ignore = "" , reverse = True ) if romanization in self.DOMAINS["romanization"] else None,
															 "nb_matches":term[2],
															 "nb_ayas":term[3],
															 "nb_vocalizations": len( vocalizations ) if word_vocalizations else 0,#unneeded
															 "vocalizations": vocalizations if word_vocalizations else [],
															 "nb_synonyms": len( synonyms ) if word_synonyms else 0,#unneeded
															 "synonyms": synonyms if word_synonyms else [],
															 "lemma": lemma if word_derivations else "",
															 "root": root if word_derivations else "",
															 "nb_derivations": len( derivations ) if word_derivations else 0, #unneeded
															 "derivations": derivations if word_derivations else []
														 }
					cpt += 1
			annotation_word_query += u" ) "
			words_output["global"] = {"nb_words":cpt - 1, "nb_matches":matches, "nb_vocalizations": nb_vocalizations_globale}
		output["words"] = words_output
		# Magic loop to build the adjacents, translations and annotations queries at the same time
		if prev_aya or next_aya or translation or  annotation_aya:
			adja_query = trad_query = annotation_aya_query = u"( 0"

			for r in reslist :
				if prev_aya: adja_query += u" OR gid:%s " % unicode( r["gid"] - 1 )
				if next_aya: adja_query += u" OR gid:%s " % unicode( r["gid"] + 1 )
				if translation: trad_query += u" OR gid:%s " % unicode( r["gid"] )
				if annotation_aya: annotation_aya_query += u" OR  ( aya_id:%s AND  sura_id:%s ) " % ( unicode( r["aya_id"] ) , unicode( r["sura_id"] ) )

			adja_query += u" )"
			trad_query += u" )" + u" AND id:%s " % unicode( translation )
			annotation_aya_query += u" )"


		# Adjacents
		if prev_aya or next_aya:
			adja_res = self.QSE.find_extended( adja_query, "gid" )
			adja_ayas = {0:{"aya_":u"----", "uth_":u"----", "sura":u"---", "aya_id":0}, 6237:{"aya_":u"----", "uth_":u"----", "sura":u"---", "aya_id":9999}}
			extend_runtime += adja_res.runtime
			for adja in adja_res:
				adja_ayas[adja["gid"]] = {"aya_":adja["aya_"], "uth_":adja["uth_"], "aya_id":adja["aya_id"], "sura":adja["sura"]}

		#translations
		if translation:
			trad_res = self.TSE.find_extended( trad_query, "gid" )
			extend_runtime += trad_res.runtime
			trad_text = {}
			for tr in trad_res:
				trad_text[tr["gid"]] = tr["text"]

		#annotations for aya words
		if annotation_aya or ( annotation_word and word_info ) :
			annotation_word_query = annotation_word_query if annotation_word and word_info else u"()"
			annotation_aya_query = annotation_aya_query if annotation_aya else u"()"
			annotation_query = annotation_aya_query + u" OR  " + annotation_word_query
			#print annotation_query.encode( "utf-8" )
			annot_res = self.WSE.find_extended( annotation_query, "gid" )
			extend_runtime += annot_res.runtime
			## prepare annotations for use
			annotations_by_word = {}
			annotations_by_position = {}
			for annot in annot_res:
				if ( annotation_word and word_info ) :
					if annot["normalized"] in terms_uthmani:
						if annotations_by_word.has_key( annot["normalized"] ):
							if annotations_by_word[annot["normalized"]].has_key( annot["word"] ):
								annotations_by_word[annot["normalized"]][annot["word"]][annot["order"]] = annot;
							else:
								annotations_by_word[annot["normalized"]][annot["word"]] = { annot["order"]: annot} ;
						else:
							annotations_by_word[annot["normalized"]] = { annot["word"]: { annot["order"]: annot}}
				if annotation_aya:
					if annotations_by_position.has_key( ( annot["sura_id"], annot["aya_id"] ) ):
						annotations_by_position[( annot["sura_id"], annot["aya_id"] )][annot["word_id"]] = annot
					else:
						annotations_by_position[( annot["sura_id"], annot["aya_id"] )] = { annot["word_id"]: annot }

		## merge word annotations to word output
		if ( annotation_word and word_info ):
			for cpt in xrange( 1, len( output["words"]["individual"] ) + 1 ):
				current_word = STANDARD2UTHMANI( output["words"]["individual"][cpt]["word"] )
				#print current_word.encode( "utf-8" ), "=>", annotations_by_word, "=>", list( annot_res )
				if annotations_by_word.has_key( current_word ):
					current_word_annotations = annotations_by_word[ current_word ]
					output["words"]["individual"][cpt]["annotations"] = current_word_annotations
					output["words"]["individual"][cpt]["nb_annotations"] = len ( current_word_annotations )

		output["runtime"] = round( extend_runtime, 5 )
		output["interval"] = {
							"start":start,
							"end":end,
							"total": len( res ),
							"page": ( ( start - 1 ) / range ) + 1,
							"nb_pages": ( ( len( res ) - 1 ) / range ) + 1
							}
		output["translation_info"] = {}
		### Ayas
		cpt = start - 1
		output["ayas"] = {}
		for r in reslist :
			cpt += 1
			output["ayas"][ cpt ] = {

					  "identifier": {"gid":r["gid"],
									 "aya_id":r["aya_id"],
									 "sura_id":r["sura_id"],
									 "sura_name":keywords( r["sura"] )[0],
									},

		              "aya":{
		              		"id":r["aya_id"],
		              		"text":   H( V( r["aya_"] ) )  if script == "standard"
		              			else   H( r["uth_"] ) ,
                            "text_no_highlight": V( r["aya_"] )   if script == "standard"
                                  else   r["uth_"],
						"translation": trad_text[r["gid"]] if ( translation != "None" and translation and trad_text.has_key( r["gid"] ) ) else None,
		                	"recitation": None if not recitation or not self._recitations.has_key( recitation ) \
		                				  else u"http://www.everyayah.com/data/" + self._recitations[recitation]["subfolder"].encode( "utf-8" ) + "/%03d%03d.mp3" % ( r["sura_id"], r["aya_id"] ),
		                	"prev_aya":{
						    "id":adja_ayas[r["gid"] - 1]["aya_id"],
						    "sura":adja_ayas[r["gid"] - 1]["sura"],
						    "text": V( adja_ayas[r["gid"] - 1]["aya_"] )  if script == "standard"
		              			else  adja_ayas[r["gid"] - 1]["uth_"] ,
						    } if prev_aya else None
						    ,
		                	"next_aya":{
						    "id":adja_ayas[r["gid"] + 1]["aya_id"],
						    "sura":adja_ayas[r["gid"] + 1]["sura"],
						    "text":  V( adja_ayas[r["gid"] + 1]["aya_"] )  if script == "standard"
		              			else   adja_ayas[r["gid"] + 1]["uth_"] ,
						    } if next_aya else None
						    ,

		              },

		    		"sura": {} if not sura_info
					  else  {
						  "name":keywords( r["sura"] )[0] ,
							  "id":r["sura_id"],
							  "type": r["sura_type"] ,
							  "order":r["sura_order"],
							  "ayas":r["s_a"],
						    "stat":{} if not sura_stat_info
							  	  else	{
										  "words":N( r["s_w"] ),
										  "godnames":N( r["s_g"] ),
										  "letters":N( r["s_l"] )
								      }

		    		},

		                "position": {} if not aya_position_info
		                else {
		                	"manzil":r["manzil"],
		                	"juz":r["juz"],
		                	"hizb":r["hizb"],
		                	"rub":r["rub"] % 4,
		                	"page":r["page"],
		                	"page_IN":r["page_IN"],
		                	"ruku":r["ruku"],
		           	},

		           	"theme":{} if not aya_theme_info
		                else	{
				    		"chapter": r["chapter"],
				    		"topic":  r["topic"] ,
				   		 "subtopic": r["subtopic"]
				 	   },

				"stat":  {} if not aya_stat_info
		                else {
						"words":N( r["a_w"] ),
		    				"letters":N( r["a_l"] ),
		    				"godnames":N( r["a_g"] )
				}       ,

				"sajda":{} if not aya_sajda_info
		                else    {
		    				"exist":( r["sajda"] == u"نعم" ),
		    				"type": r["sajda_type"]  if ( r["sajda"] == u"نعم" ) else None,
		    				"id":N( r["sajda_id"] ) if ( r["sajda"] == u"نعم" ) else None,
		    			},

				"annotations": {} if not annotation_aya or not annotations_by_position.has_key( ( r["sura_id"], r["aya_id"] ) )
							else annotations_by_position[( r["sura_id"], r["aya_id"] )]
		    		}
		return output
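_search_aya follows the same flag pattern but adds the view presets listed above. A hypothetical call using the recitation preset, again assuming an initialized wrapper named engine:

    # Hypothetical call -- engine and the flag values are assumptions.
    flags = {
        "query": u"الحمد لله",
        "perpage": "5",
        "view": "recitation",  # uthmani script, adjacent ayas, sajda info
    }
    out = engine._search_aya(flags)
    for idx, item in out["ayas"].items():
        print idx, item["aya"]["text_no_highlight"]
        print item["aya"]["recitation"]  # everyayah.com mp3 URL, or None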
Example #11
    def _search(self, flags):
        """
        return the results of search as json
        """
        #flags
        query = flags["query"] if flags.has_key(
            "query") else self._defaults["flags"]["query"]
        sortedby = flags["sortedby"] if flags.has_key(
            "sortedby") else self._defaults["flags"]["sortedby"]
        range = int(flags["perpage"]) if flags.has_key(
            "perpage") else flags["range"] if flags.has_key(
                "range") else self._defaults["flags"]["range"]
        offset = ((int(flags["page"]) - 1) * range) + 1 if flags.has_key(
            "page") else int(flags["offset"]) if flags.has_key(
                "offset") else self._defaults["flags"][
                    "offset"]  ## offset = (page-1) * perpage  --  paging mode
        highlight = flags["highlight"] if flags.has_key(
            "highlight") else self._defaults["flags"]["highlight"]
        script = flags["script"] if flags.has_key(
            "script") else self._defaults["flags"]["script"]
        vocalized = flags["vocalized"] if flags.has_key(
            "vocalized") else self._defaults["flags"]["vocalized"]
        recitation = flags["recitation"] if flags.has_key(
            "recitation") else self._defaults["flags"]["recitation"]
        translation = flags["translation"] if flags.has_key(
            "translation") else self._defaults["flags"]["translation"]
        prev_aya = flags["prev_aya"] if flags.has_key(
            "prev_aya") else self._defaults["flags"]["prev_aya"]
        next_aya = flags["next_aya"] if flags.has_key(
            "next_aya") else self._defaults["flags"]["next_aya"]
        sura_info = flags["sura_info"] if flags.has_key(
            "sura_info") else self._defaults["flags"]["sura_info"]
        word_info = flags["word_info"] if flags.has_key(
            "word_info") else self._defaults["flags"]["word_info"]
        aya_position_info = flags["aya_position_info"] if flags.has_key(
            "aya_position_info"
        ) else self._defaults["flags"]["aya_position_info"]
        aya_theme_info = flags["aya_theme_info"] if flags.has_key(
            "aya_theme_info") else self._defaults["flags"]["aya_theme_info"]
        aya_stat_info = flags["aya_stat_info"] if flags.has_key(
            "aya_stat_info") else self._defaults["flags"]["aya_stat_info"]
        aya_sajda_info = flags["aya_sajda_info"] if flags.has_key(
            "aya_sajda_info") else self._defaults["flags"]["aya_sajda_info"]
        annotation_aya = flags["annotation_aya"] if flags.has_key(
            "annotation_aya") else self._defaults["flags"]["annotation_aya"]
        annotation_word = flags["annotation_word"] if flags.has_key(
            "annotation_word") else self._defaults["flags"]["annotation_word"]
        fuzzy = flags["fuzzy"] if flags.has_key(
            "fuzzy") else self._defaults["flags"]["fuzzy"]

        #Search
        SE = self.FQSE if fuzzy else self.QSE
        res, termz = SE.search_all(unicode(query.replace("\\", ""), 'utf8'),
                                   self._defaults["results_limit"],
                                   sortedby=sortedby)
        terms = [term[1] for term in list(termz)
                 ]  # TODO: I dont like this termz structure , must change it
        terms_uthmani = map(STANDARD2UTHMANI, terms)
        #pagination
        offset = 1 if offset < 1 else offset
        range = self._defaults[
            "maxrange"] if range > self._defaults["maxrange"] else range
        interval_end = offset + range
        end = interval_end if interval_end < len(res) else len(res)
        start = offset if offset <= len(res) else -1
        reslist = [] if end == 0 or start == -1 else list(res)[start - 1:end]
        output = {}

        ## strip vocalization when vocalized = false
        V = QArabicSymbolsFilter( \
              shaping = False, \
              tashkil = not vocalized, \
              spellerrors = False, \
              hamza = False \
              ).normalize_all
        # highlight function that handles None values and undefined fields
        H = lambda X: self.QSE.highlight(
            X, terms, highlight
        ) if highlight != "none" and X else X if X else u"-----"
        # Numbers are 0 if not defined
        N = lambda X: X if X else 0
        # parse keyword lists, used for Sura names
        kword = re.compile(u"[^,،]+")
        keywords = lambda phrase: kword.findall(phrase)
        # Tamdid the divine name to avoid a double Shadda on the middle Lam
        Gword_tamdid = lambda aya: aya.replace(u"لَّه", u"لَّـه").replace(
            u"لَّه", u"لَّـه")
        ##########################################
        extend_runtime = res.runtime
        # Words & Annotations
        words_output = {}
        if word_info:
            matches = 0
            docs = 0
            nb_vocalizations_globale = 0
            cpt = 1
            annotation_word_query = u"( 0 "
            for term in termz:
                if term[0] == "aya":
                    if term[2]:
                        matches += term[2]
                    docs += term[3]
                    annotation_word_query += u" OR normalized:%s " % STANDARD2UTHMANI(
                        term[1])
                    vocalizations = vocalization_dict[term[1]]
                    nb_vocalizations_globale += len(vocalizations)
                    words_output[cpt] = {
                        "word": term[1],
                        "nb_matches": term[2],
                        "nb_ayas": term[3],
                        "nb_vocalizations": len(vocalizations),
                        "vocalizations": vocalizations
                    }
                    cpt += 1
            annotation_word_query += u" ) "
            words_output["global"] = {
                "nb_words": cpt - 1,
                "nb_matches": matches,
                "nb_vocalizations": nb_vocalizations_globale
            }
        output["words"] = words_output

        # Magic loop to build the adjacents, translations and annotations queries at the same time
        if prev_aya or next_aya or translation or annotation_aya:
            adja_query = trad_query = annotation_aya_query = u"( 0"

            for r in reslist:
                if prev_aya:
                    adja_query += u" OR gid:%s " % unicode(r["gid"] - 1)
                if next_aya:
                    adja_query += u" OR gid:%s " % unicode(r["gid"] + 1)
                if translation:
                    trad_query += u" OR gid:%s " % unicode(r["gid"])
                if annotation_aya:
                    annotation_aya_query += u" OR  ( aya_id:%s AND  sura_id:%s ) " % (
                        unicode(r["aya_id"]), unicode(r["sura_id"]))

            adja_query += u" )"
            trad_query += u" )" + u" AND id:%s " % unicode(translation)
            annotation_aya_query += u" )"

        # Adjacents
        if prev_aya or next_aya:
            adja_res = self.QSE.find_extended(adja_query, "gid")
            adja_ayas = {
                0: {
                    "aya_": u"----",
                    "uth_": u"----",
                    "sura": u"---",
                    "aya_id": 0
                },
                6237: {
                    "aya_": u"----",
                    "uth_": u"----",
                    "sura": u"---",
                    "aya_id": 9999
                }
            }
            extend_runtime += adja_res.runtime
            for adja in adja_res:
                adja_ayas[adja["gid"]] = {
                    "aya_": adja["aya_"],
                    "uth_": adja["uth_"],
                    "aya_id": adja["aya_id"],
                    "sura": adja["sura"]
                }

        #translations
        if translation:
            trad_res = self.TSE.find_extended(trad_query, "gid")
            extend_runtime += trad_res.runtime
            trad_text = {}
            for tr in trad_res:
                trad_text[tr["gid"]] = tr["text"]

        #annotations for aya words
        if annotation_aya or (annotation_word and word_info):
            annotation_word_query = annotation_word_query if annotation_word and word_info else u"()"
            annotation_aya_query = annotation_aya_query if annotation_aya else u"()"
            annotation_query = annotation_aya_query + u" OR  " + annotation_word_query
            #print annotation_query.encode( "utf-8" )
            annot_res = self.WSE.find_extended(annotation_query, "gid")
            extend_runtime += annot_res.runtime
            ## prepare annotations for use
            annotations_by_word = {}
            annotations_by_position = {}
            for annot in annot_res:
                if (annotation_word and word_info):
                    if annot["normalized"] in terms_uthmani:
                        if annotations_by_word.has_key(annot["normalized"]):
                            annotations_by_word[annot["normalized"]][
                                annot["word"]] = annot
                        else:
                            annotations_by_word[annot["normalized"]] = {
                                annot["word"]: annot
                            }
                if annotation_aya:
                    if annotations_by_position.has_key(
                        (annot["sura_id"], annot["aya_id"])):
                        annotations_by_position[(
                            annot["sura_id"],
                            annot["aya_id"])][annot["word_id"]] = annot
                    else:
                        annotations_by_position[(annot["sura_id"],
                                                 annot["aya_id"])] = {
                                                     annot["word_id"]: annot
                                                 }

        ## merge word annotations to word output
        if (annotation_word and word_info):
            for cpt in xrange(1, len(termz) + 1):
                current_word = STANDARD2UTHMANI(output["words"][cpt]["word"])
                #print current_word.encode( "utf-8" ), "=>", annotations_by_word, "=>", list( annot_res )
                if annotations_by_word.has_key(current_word):
                    current_word_annotations = annotations_by_word[
                        current_word]
                    output["words"][cpt][
                        "annotations"] = current_word_annotations
                    output["words"][cpt]["nb_annotations"] = len(
                        current_word_annotations)

        output["runtime"] = extend_runtime
        output["interval"] = {"start": start, "end": end, "total": len(res)}
        output["translation_info"] = {}
        ### Ayas
        cpt = start - 1
        output["ayas"] = {}
        for r in reslist:
            cpt += 1
            output["ayas"][cpt] = {
                "identifier": {
                    "gid": r["gid"],
                    "aya_id": r["aya_id"],
                    "sura_id": r["sura_id"],
                    "sura_name": keywords(r["sura"])[0],
                },
                "aya": {
                    "id":
                    r["aya_id"],
                    "text":
                    Gword_tamdid(H(V(r["aya_"])))
                    if script == "standard" else Gword_tamdid(H(r["uth_"])),
                    "translation":
                    trad_text[r["gid"]] if
                    (translation != "None" and translation
                     and trad_text.has_key(r["gid"])) else None,
                    "recitation":
                    None
                    if not recitation else u"http://www.everyayah.com/data/" +
                    self._recitations[recitation]["subfolder"].encode("utf-8")
                    + "/%03d%03d.mp3" % (r["sura_id"], r["aya_id"]),
                    "prev_aya": {
                        "id":
                        adja_ayas[r["gid"] - 1]["aya_id"],
                        "sura":
                        adja_ayas[r["gid"] - 1]["sura"],
                        "text":
                        Gword_tamdid(V(adja_ayas[r["gid"] -
                                                 1]["aya_"])) if script
                        == "standard" else Gword_tamdid(adja_ayas[r["gid"] -
                                                                  1]["uth_"]),
                    } if prev_aya else None,
                    "next_aya": {
                        "id":
                        adja_ayas[r["gid"] + 1]["aya_id"],
                        "sura":
                        adja_ayas[r["gid"] + 1]["sura"],
                        "text":
                        Gword_tamdid(V(adja_ayas[r["gid"] +
                                                 1]["aya_"])) if script
                        == "standard" else Gword_tamdid(adja_ayas[r["gid"] +
                                                                  1]["uth_"]),
                    } if next_aya else None,
                },
                "sura": {} if not sura_info else {
                    "name": keywords(r["sura"])[0],
                    "id": r["sura_id"],
                    "type": r["sura_type"],
                    "order": r["sura_order"],
                    "stat": {
                        "ayas": r["s_a"],
                        "words": N(r["s_w"]),
                        "godnames": N(r["s_g"]),
                        "letters": N(r["s_l"])
                    }
                },
                "position": {} if not aya_position_info else {
                    "manzil": r["manzil"],
                    "hizb": r["hizb"],
                    "rub": r["rub"] % 4,
                    "page": r["page"],
                    "ruku": r["ruku"],
                },
                "theme": {} if not aya_theme_info else {
                    "chapter": r["chapter"],
                    "topic": r["topic"],
                    "subtopic": r["subtopic"]
                },
                "stat": {} if not aya_stat_info else {
                    "words": N(r["a_w"]),
                    "letters": N(r["a_l"]),
                    "godnames": N(r["a_g"])
                },
                "sajda": {} if not aya_sajda_info else {
                    "exist": (r["sajda"] == u"نعم"),
                    "type": r["sajda_type"] if
                    (r["sajda"] == u"نعم") else None,
                    "id": N(r["sajda_id"]) if (r["sajda"] == u"نعم") else None,
                },
                "annotations": {}
                if not annotation_aya or not annotations_by_position.has_key(
                    (r["sura_id"], r["aya_id"])) else
                annotations_by_position[(r["sura_id"], r["aya_id"])]
            }

        return {"search": output}
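This older variant wraps everything under a top-level "search" key, and the structure is JSON-serializable as the docstring promises. A sketch, assuming engine and flags as in the earlier examples:

    # Hypothetical: dump the result as JSON for an HTTP response.
    import json
    raw = engine._search(flags)  # engine and flags are assumptions
    print json.dumps(raw, ensure_ascii=False, indent=2)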
Example #12
# coding: utf-8
"""
This is a test module for alfanous.TextProcessing

"""

from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_

if __name__ == "__main__":
    ASF = QArabicSymbolsFilter()
    TEXT = u"عاصِمٌ"
    TEXT = ASF.normalize_all(TEXT)
    print TEXT

    WORD1 = unicode_(u"عَاصِمُ")
    WORD2 = unicode_(u"عَاصمُ")
    LIST_HARAKAT1 = WORD1.list_harakat()
    LIST_HARAKAT2 = WORD2.list_harakat()
    WORD3 = unicode_(u"فاعل")
    PHRASE = unicode_(u"كانَ")
    print WORD3.apply_harakat_list(LIST_HARAKAT1)
    print LIST_HARAKAT1, "\n", LIST_HARAKAT2
    print unicode_.compare_harakat(LIST_HARAKAT1, LIST_HARAKAT2)
    print WORD1.shakl_compare(WORD1, WORD2)
    for i in PHRASE.tokenize_shakl():
        print i,

    WORD4 = unicode_(u"عاصم")
    WORD5 = unicode_(u"عاصِم")

    print WORD4 == WORD5
Example #13
# coding: utf-8

"""
This is a test module for alfanous.TextProcessing

"""

from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_


if __name__ == "__main__":
    ASF = QArabicSymbolsFilter()
    TEXT = u"عاصِمٌ"
    TEXT = ASF.normalize_all( TEXT )
    print TEXT

    WORD1 = unicode_( u"عَاصِمُ" )
    WORD2 = unicode_( u"عَاصمُ" )
    LIST_HARAKAT1 = WORD1.list_harakat()
    LIST_HARAKAT2 = WORD2.list_harakat()
    WORD3 = unicode_( u"فاعل" )
    PHRASE = unicode_( u"كانَ" )
    print WORD3.apply_harakat_list( LIST_HARAKAT1 )
    print LIST_HARAKAT1, "\n", LIST_HARAKAT2
    print unicode_.compare_harakat( LIST_HARAKAT1, LIST_HARAKAT2 )
    print WORD1.shakl_compare( WORD1, WORD2 )
    for i in PHRASE.tokenize_shakl():
        print i,
    
    WORD4 = unicode_( u"عاصم" )
    WORD5 = unicode_( u"عاصِم" )

    print WORD4 == WORD5
Example #14
    def __init__( self, QC_PATH = "../../store/quranic-corpus-morpology.xml", DB = "main.db" ):
        """ make word table """

        import sqlite3

        print "connecting to database ...",
        maindb = sqlite3.connect( DB )
        cur = maindb.cursor()
        print "OK"

        print "creating tables:"
        cur.execute( """ drop table if exists wordQC""" )
        cur.execute( 
                        """ create table if not exists  wordQC(
                        gid int unique,
                        word_gid int,
                        word_id int,
                        aya_id int,
                        sura_id int,

                        word varchar(25),
                        normalised varchar(25),
                        spelled varchar(25),
                        'order' int,
                        token varchar(25),
                        arabictoken varchar(25),
                        prefixes varchar(25),
                        suffixes varchar(25),


                        pos varchar(25),
                        type varchar(25),
                        arabicpos varchar(25),
                        mood varchar(25),
                        arabicmood varchar(25),
                        'case' varchar(25),
                        arabiccase varchar(25),
                        root varchar(25),
                        arabicroot varchar(25),
                        lemma varchar(25),
                        arabiclemma varchar(25),
                        special varchar(25),
                        arabicspecial varchar(25),

                        derivation varchar(25),
                        form varchar(25),
                        gender varchar(25),
                        person varchar(25),
                        number varchar(25),
                        voice varchar(25),
                        state varchar(25),
                        aspect varchar(25),

                        primary key(gid)

                    )

                    """ )
        print ">wordQC table ... OK"


        print ">loading Qurany Corpus...",
        from PyCorpus.QuranyCorpus import API as QC
        A = QC( source = QC_PATH )
        print ".OK\n"
        IFEXIST = lambda d, attrib: d[attrib].encode( "utf-8" ) if attrib in d else ""
        gid, word_gid = 0, 0
        print ">inserting values of gid...",
        for iteration in A.all_words_generator():
            QASF = QArabicSymbolsFilter( shaping = True, 
                                         tashkil = True, 
                                         spellerrors = False, 
                                         hamza = False, 
                                         uthmani_symbols = True )
            QASF_spelled = QArabicSymbolsFilter( shaping = True, 
                                                 tashkil = True, 
                                                 spellerrors = True, 
                                                 hamza = True, 
                                                 uthmani_symbols = True
                                                 )

            QUERY = lambda d, glob: """insert into wordQC(gid,word_gid,word_id,aya_id,sura_id,'order',token,arabictoken,prefixes, suffixes,type,pos,arabicpos,mood,
                arabicmood, 'case', arabiccase, root ,arabicroot, lemma ,arabiclemma, special, arabicspecial,
                word,normalised,spelled, derivation, form ,gender, person, number,voice, state, aspect) values
                ("%(gid)d","%(word_gid)d","%(word_id)d","%(aya_id)d","%(sura_id)d","%(order)d","%(token)s","%(arabictoken)s", "%(prefixes)s", "%(suffixes)s",  "%(type)s","%(pos)s","%(arabicpos)s","%(mood)s","%(arabicmood)s",
                "%(case)s","%(arabiccase)s","%(root)s","%(arabicroot)s","%(lemma)s","%(arabiclemma)s","%(special)s","%(arabicspecial)s","%(word)s","%(normalised)s","%(spelled)s",
                "%(derivation)s","%(form)s","%(gender)s","%(person)s","%(number)s","%(voice)s","%(state)s","%(aspect)s")""" % {
										    "gid":gid,
										    "word_gid":word_gid,
										    "word_id":iteration["word_id"],
										    "aya_id":iteration["aya_id"],
										    "sura_id":iteration["sura_id"],
										    "order":order,
										    "token":IFEXIST( d, "token" ),
										    "arabictoken":IFEXIST( d, "arabictoken" ),
										    "prefixes":";".join([prefix["arabictoken"] for prefix in glob["prefixes"] ]).encode( "utf-8" ),
										    "suffixes":";".join([suffix["arabictoken"] for suffix in glob["suffixes"] ]).encode( "utf-8" ),
										    "type":IFEXIST( d, "type" ),
										    "pos":IFEXIST( d, "pos" ),
										    "arabicpos":IFEXIST( d, "arabicpos" ),
										    "mood":IFEXIST( d, "mood" ),
										    "arabicmood":IFEXIST( d, "arabicmood" ),
										    "case":IFEXIST( d, "case" ),
										    "arabiccase":IFEXIST( d, "arabiccase" ),
										    "root":IFEXIST( d, "root" ),
										    "arabicroot":IFEXIST( d, "arabicroot" ),
										    "lemma":IFEXIST( d, "lemma" ),
										    "arabiclemma":IFEXIST( d, "arabiclemma" ),
										    "special":IFEXIST( d, "special" ),
										    "arabicspecial":IFEXIST( d, "arabicspecial" ),
										    "word":iteration["word"].encode( "utf-8" ),
										    "normalised":  QASF.normalize_all( iteration["word"] ).encode( "utf-8" ),
										    "spelled": QASF_spelled.normalize_all( iteration["word"] ).encode( "utf-8" ),
										    "derivation":IFEXIST( d, "derivation" ),
										    "form":IFEXIST( d, "form" ),
										    "gender":IFEXIST( d, "gender" ),
										    "person":IFEXIST( d, "person" ),
										    "number":IFEXIST( d, "number" ),
										    "voice":IFEXIST( d, "voice" ),
										    "state":IFEXIST( d, "state" ),
										    "aspect":IFEXIST( d, "aspect" )
										    }
            word_gid += 1
            if word_gid % 1000 == 0:
                print word_gid,
                print("\n")

            order = 0
            for d in iteration["morphology"]["base"]:
                gid += 1
                order += 1
                cur.execute( QUERY( d, iteration["morphology"] ) )

        print("OK")
        maindb.commit()
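Once the table is committed, it can be read back directly; the column names come from the create statement above. A hypothetical follow-up query:

    # Hypothetical follow-up: read back a few rows from wordQC.
    import sqlite3
    conn = sqlite3.connect("main.db")  # DB name taken from the default argument
    cur = conn.cursor()
    cur.execute("select word, pos, arabicpos from wordQC "
                "where sura_id=1 and aya_id=1 order by word_id")
    for word, pos, arabicpos in cur.fetchall():
        print word, pos, arabicpos
    conn.close()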