def transfer_vocalizations(self):
    """ load indexed vocalized words from the main index and save them
    as a list in a dynamic py.

    Builds a mapping {unvocalized form: [vocalized forms...]} from the
    most frequent words of the "aya_" field and writes it out as a
    Python module (vocalizations_dyn.py) under the dynamic-py path.

    @return: the raw Python source string that was written
    """
    QSE = QuranicSearchEngine(self.__ixpath)
    # fall back to an empty word list when the index cannot be opened
    mfw = QSE.most_frequent_words(9999999, "aya_") if QSE.OK else []
    # normalizer that strips only the tashkil (diacritics); shaping,
    # spell-errors and hamza variants are left untouched
    V = QArabicSymbolsFilter(
        shaping=False,
        tashkil=True,
        spellerrors=False,
        hamza=False
    ).normalize_all
    vocalization_dict = {}
    for w in mfw:
        word = w[1]
        # group each vocalized word under its unvocalized form
        vocalization_dict.setdefault(V(word), []).append(word)
    raw_str = self.dheader + u"\nvocalization_dict=" + str(
        vocalization_dict).replace(",", ",\n")
    # close the handle explicitly (the original leaked it, so the data
    # could stay unflushed until interpreter exit)
    fich = open(self.__dypypath + "vocalizations_dyn.py", "w+")
    try:
        fich.write(raw_str)
    finally:
        fich.close()
    return raw_str
def make_spellerrors_dict(self):
    """ make the spell errors dictionary.

    Groups every indexed "aya" term under its aggressively-normalized
    form (shaping + tashkil + spell-errors + hamza all normalized) and
    writes the mapping out as a Python module (spellerrors_dyn.py).

    @deprecated: forget this!
    """
    D = QseDocIndex()
    R = QReader(D)
    # normalize everything: shaping, tashkil, spell-errors and hamza
    nor = QArabicSymbolsFilter(True, True, True, True).normalize_all
    spell_err = {}
    for term in R.reader.all_terms():
        if term[0] in ["aya"]:
            normalized = nor(term[1])
            # group original spellings under their normalized form
            spell_err.setdefault(normalized, []).append(term[1])
    # debug dump of the mapping (kept from the original)
    print("\n".join([
        unicode(key) + u":" + ",".join(value)
        for key, value in spell_err.items()
    ]))
    raw_str = self.dheader + u"\nspell_err=" + str(spell_err)
    # close the handle explicitly (the original leaked it)
    fich = open(self.__dypypath + "spellerrors_dyn.py", "w+")
    try:
        fich.write(raw_str)
    finally:
        fich.close()
def __init__(self, fieldname, text, boost=1.0):
    """Store the term data and precompute the normalized word list.

    Each element of *text* is normalized by stripping tashkil only;
    shaping, spelling errors and hamza are left as-is.
    """
    self.fieldname = fieldname
    self.text = text
    self.boost = boost
    # tashkil-only normalizer, built once and applied per element
    normalizer = QArabicSymbolsFilter(
        shaping=False,
        tashkil=True,
        spellerrors=False,
        hamza=False,
    )
    normalized_words = []
    for token in text:
        normalized_words.append(normalizer.normalize_all(token))
    self.words = normalized_words
def __init__(self, fieldname, text, boost=1.0):
    """Keep the raw query text and a loose-matching symbols filter.

    The filter tolerates shaping differences, spelling errors and
    hamza variants while leaving tashkil unchanged.
    """
    # loose filter used later for fuzzy-style matching
    self.ASF = QArabicSymbolsFilter(
        shaping=True,
        tashkil=False,
        spellerrors=True,
        hamza=True,
    )
    self.fieldname = fieldname
    self.boost = boost
    self.text = text
    # the word list holds the text as a single entry, unnormalized
    self.words = [text]
def _search_word( self, flags ):
    """ return the results of word search as a dictionary data structure

    @param flags: search options; every key is optional and falls back to
        self._defaults["flags"] (query, sortedby, perpage/range,
        page/offset, romanization, highlight, script, vocalized, view).
    @return: dict with "keywords", "runtime", "interval" and "words".
    """
    #flags -- each flag falls back to the configured default
    query = flags["query"] if flags.has_key( "query" ) \
            else self._defaults["flags"]["query"]
    sortedby = flags["sortedby"] if flags.has_key( "sortedby" ) \
            else self._defaults["flags"]["sortedby"]
    # "perpage" wins over "range"; both mean page size
    range = int( flags["perpage"] ) if flags.has_key( "perpage" ) \
            else flags["range"] if flags.has_key( "range" ) \
            else self._defaults["flags"]["range"]
    ## offset = (page-1) * perpage -- mode paging
    offset = ( ( int( flags["page"] ) - 1 ) * range ) + 1 if flags.has_key( "page" ) \
            else int( flags["offset"] ) if flags.has_key( "offset" ) \
            else self._defaults["flags"]["offset"]
    romanization = flags["romanization"] if flags.has_key( "romanization" ) \
            else self._defaults["flags"]["romanization"]
    highlight = flags["highlight"] if flags.has_key( "highlight" ) \
            else self._defaults["flags"]["highlight"]
    script = flags["script"] if flags.has_key( "script" ) \
            else self._defaults["flags"]["script"]
    vocalized = TRUE_FALSE( flags["vocalized"] ) if flags.has_key( "vocalized" ) \
            else self._defaults["flags"]["vocalized"]
    view = flags["view"] if flags.has_key( "view" ) \
            else self._defaults["flags"]["view"]
    # pre-defined views: each view presets some of the flags above
    if view == "minimal":
        vocalized = False
    elif view == "normal":
        pass
    elif view == "full":
        romanization = "iso"
    elif view == "statistic":
        pass
    elif view == "linguistic":
        romanization = "buckwalter"
    elif view == "recitation":
        script = "uthmani"
    else: # if view == custom or undefined
        pass
    #preprocess query: drop backslashes, decode, transliterate plain text
    query = query.replace( "\\", "" )
    if not isinstance( query, unicode ):
        query = unicode( query , 'utf8' )
    # queries without a field prefix (no ":") are transliterated from
    # buckwalter; query-syntax characters are left untouched
    if ":" not in query:
        query = unicode( transliterate( "buckwalter", query, ignore = "'_\"%*?#~[]{}:>+-|" ) )
    #Search on the word search engine
    SE = self.WSE
    res, termz = SE.search_all( query ,
                                self._defaults["results_limit"]["word"],
                                sortedby = sortedby )
    # keep only the first maxkeywords matched terms
    terms = [term[1] for term in list( termz )[:self._defaults["maxkeywords"]]]
    #pagination -- clamp offset and range, then slice the result list
    offset = 1 if offset < 1 else offset;
    range = self._defaults["minrange"] if range < self._defaults["minrange"] else range;
    range = self._defaults["maxrange"] if range > self._defaults["maxrange"] else range;
    interval_end = offset + range - 1
    end = interval_end if interval_end < len( res ) else len( res )
    start = offset if offset <= len( res ) else -1
    reslist = [] if end == 0 or start == -1 else list( res )[start - 1:end]
    output = {}
    #if True: ## strip vocalization when vocalized = true
    V = QArabicSymbolsFilter( \
        shaping = False, \
        tashkil = not vocalized, \
        spellerrors = False, \
        hamza = False \
        ).normalize_all
    # highlight function that considers None values and non-definition
    H = lambda X: SE.highlight( X, terms, highlight ) if highlight != "none" and X else X if X else u"-----"
    # Numbers are 0 if not defined
    N = lambda X:X if X else 0
    # parse keywords lists , used for Sura names
    kword = re.compile( u"[^,،]+" )
    keywords = lambda phrase: kword.findall( phrase )
    #####################################################
    extend_runtime = res.runtime
    # Words & Annotations: one entry per matched term
    words_output = {"individual":{}}
    if True:
        matches = 0
        docs = 0
        cpt = 1;
        for term in termz :
            # NOTE(review): the two `if True:` guards look like disabled
            # filters kept from an older version of this loop
            if True: #term[0] == "normalized" or term[0] == "word":
                if term[2]:
                    matches += term[2]
                    docs += term[3]
                words_output[ "individual" ][ cpt ] = {
                    "field": term[0],
                    "word":term[1],
                    "romanization": transliterate( romanization, term[1], ignore = "" , reverse = True )
                                    if romanization in self.DOMAINS["romanization"] else None,
                    "nb_matches":term[2],
                    "nb_docs":term[3],
                }
                cpt += 1
    words_output["global"] = {"nb_words":cpt - 1, "nb_matches":matches}
    output["keywords"] = words_output;
    output["runtime"] = round( extend_runtime, 5 )
    # interval block: page numbers rely on Python 2 integer division
    output["interval"] = {
        "start":start,
        "end":end,
        "total": len( res ),
        "page": ( ( start - 1 ) / range ) + 1,
        "nb_pages": ( ( len( res ) - 1 ) / range ) + 1
    }
    ### Words: one morphology record per result row
    cpt = start - 1
    output["words"] = {}
    for r in reslist :
        cpt += 1
        output["words"][ cpt ] = {
            "identifier": {
                "gid":r["gid"],
                "word_gid": r["word_gid"],
                "aya_id":r["aya_id"],
                "sura_id":r["sura_id"],
                "word_id":r["word_id"],
            },
            "word":{
                "text": H( V( r["word"] ) ),
                "part": r["part"],
                "part_order": r["order"],
                "token": r["arabictoken"],
                "POS": {
                    "english": r["pos"],
                    "arabic": r["arabicpos"],
                },
                "mood": {
                    "english": r["mood"],
                    "arabic": r["arabicmood"],
                },
                "case": {
                    "english": r["case"],
                    "arabic": r["arabiccase"],
                },
                "root": {
                    #"english": r["root"],
                    "arabic": r["arabicroot"],
                },
                "lemma": {
                    #"english": r["lemma"],
                    "arabic": r["arabiclemma"],
                },
                "special": {
                    #"english": r["special"],
                    "arabic": r["arabicspecial"],
                },
                "derivation": r["derivation"],
                "form": r["form"],
                "gender": r["gender"],
                "person": r["person"],
                "number": r["number"],
                "voice": r["voice"],
                "state": r["state"],
                "aspect": r["aspect"],
            },
        }
    return output
def _search_aya( self, flags ):
    """ return the results of aya search as a dictionary data structure

    @param flags: search options; every key is optional and falls back to
        self._defaults["flags"] (query, sortedby, perpage/range,
        page/offset, recitation, translation, romanization, highlight,
        script, vocalized, fuzzy, view, plus the per-feature switches
        used by the "custom" view).
    @return: dict with "words", "runtime", "interval",
        "translation_info" and "ayas".
    """
    #flags -- each flag falls back to the configured default
    query = flags["query"] if flags.has_key( "query" ) \
        else self._defaults["flags"]["query"]
    sortedby = flags["sortedby"] if flags.has_key( "sortedby" ) \
        else self._defaults["flags"]["sortedby"]
    # "perpage" wins over "range"; both mean page size
    range = int( flags["perpage"] ) if flags.has_key( "perpage" ) \
        else flags["range"] if flags.has_key( "range" ) \
        else self._defaults["flags"]["range"]
    ## offset = (page-1) * perpage -- mode paging
    offset = ( ( int( flags["page"] ) - 1 ) * range ) + 1 if flags.has_key( "page" ) \
        else int( flags["offset"] ) if flags.has_key( "offset" ) \
        else self._defaults["flags"]["offset"]
    recitation = flags["recitation"] if flags.has_key( "recitation" ) \
        else self._defaults["flags"]["recitation"]
    translation = flags["translation"] if flags.has_key( "translation" ) \
        else self._defaults["flags"]["translation"]
    romanization = flags["romanization"] if flags.has_key( "romanization" ) \
        else self._defaults["flags"]["romanization"]
    highlight = flags["highlight"] if flags.has_key( "highlight" ) \
        else self._defaults["flags"]["highlight"]
    script = flags["script"] if flags.has_key( "script" ) \
        else self._defaults["flags"]["script"]
    vocalized = TRUE_FALSE( flags["vocalized"] ) if flags.has_key( "vocalized" ) \
        else self._defaults["flags"]["vocalized"]
    fuzzy = TRUE_FALSE( flags["fuzzy"] ) if flags.has_key( "fuzzy" ) \
        else self._defaults["flags"]["fuzzy"]
    view = flags["view"] if flags.has_key( "view" ) \
        else self._defaults["flags"]["view"]
    # pre-defined views: each view presets the feature switches below;
    # the final `else` branch ("custom") reads each switch from flags
    if view == "minimal":
        #fuzzy = True
        #page = 25
        vocalized = False
        recitation = None
        translation = None
        prev_aya = next_aya = False
        sura_info = False
        word_info = False
        word_synonyms = False
        word_derivations = False
        word_vocalizations = False
        aya_position_info = aya_theme_info = aya_sajda_info = False
        aya_stat_info = False
        sura_stat_info = False
        annotation_aya = annotation_word = False
    elif view == "normal":
        prev_aya = next_aya = False
        sura_info = True
        word_info = True
        word_synonyms = False
        word_derivations = True
        word_vocalizations = True
        aya_position_info = aya_theme_info = aya_sajda_info = True
        aya_stat_info = True
        sura_stat_info = False
        annotation_aya = annotation_word = False
    elif view == "full":
        prev_aya = next_aya = True
        sura_info = True
        word_info = True
        word_synonyms = True
        word_derivations = True
        word_vocalizations = True
        aya_position_info = aya_theme_info = aya_sajda_info = True
        aya_stat_info = sura_stat_info = True
        annotation_aya = annotation_word = False
        romanization = "iso"
    elif view == "statistic":
        prev_aya = next_aya = False
        sura_info = True
        word_info = True
        word_synonyms = False
        word_derivations = True
        word_vocalizations = True
        aya_position_info = True
        aya_theme_info = aya_sajda_info = False
        aya_stat_info = True
        sura_stat_info = True
        annotation_aya = False
        annotation_word = False
    elif view == "linguistic":
        prev_aya = next_aya = False
        sura_info = False
        word_info = True
        word_synonyms = True
        word_derivations = True
        word_vocalizations = True
        aya_position_info = False
        aya_theme_info = aya_sajda_info = True
        aya_stat_info = False
        sura_stat_info = False
        annotation_aya = False
        annotation_word = False
        romanization = "buckwalter"
    elif view == "recitation":
        script = "uthmani"
        prev_aya = next_aya = True
        sura_info = True
        word_info = False
        word_synonyms = False
        word_derivations = False
        word_vocalizations = False
        aya_position_info = True
        aya_theme_info = False
        aya_sajda_info = True
        aya_stat_info = False
        sura_stat_info = False
        annotation_aya = False
        annotation_word = False
    else: # if view == custom or undefined
        prev_aya = TRUE_FALSE( flags["prev_aya"] ) if flags.has_key( "prev_aya" ) \
            else self._defaults["flags"]["prev_aya"]
        next_aya = TRUE_FALSE( flags["next_aya"] ) if flags.has_key( "next_aya" ) \
            else self._defaults["flags"]["next_aya"]
        sura_info = TRUE_FALSE( flags["sura_info"] ) if flags.has_key( "sura_info" ) \
            else self._defaults["flags"]["sura_info"]
        sura_stat_info = TRUE_FALSE( flags["sura_stat_info"] ) if flags.has_key( "sura_stat_info" ) \
            else self._defaults["flags"]["sura_stat_info"]
        word_info = TRUE_FALSE( flags["word_info"] ) if flags.has_key( "word_info" ) \
            else self._defaults["flags"]["word_info"]
        word_synonyms = TRUE_FALSE( flags["word_synonyms"] ) if flags.has_key( "word_synonyms" ) \
            else self._defaults["flags"]["word_synonyms"]
        word_derivations = TRUE_FALSE( flags["word_derivations"] ) if flags.has_key( "word_derivations" ) \
            else self._defaults["flags"]["word_derivations"]
        word_vocalizations = TRUE_FALSE( flags["word_vocalizations"] ) if flags.has_key( "word_vocalizations" ) \
            else self._defaults["flags"]["word_vocalizations"]
        aya_position_info = TRUE_FALSE( flags["aya_position_info"] ) if flags.has_key( "aya_position_info" ) \
            else self._defaults["flags"]["aya_position_info"]
        aya_theme_info = TRUE_FALSE( flags["aya_theme_info"] ) if flags.has_key( "aya_theme_info" ) \
            else self._defaults["flags"]["aya_theme_info"]
        aya_stat_info = TRUE_FALSE( flags["aya_stat_info"] ) if flags.has_key( "aya_stat_info" ) \
            else self._defaults["flags"]["aya_stat_info"]
        aya_sajda_info = TRUE_FALSE( flags["aya_sajda_info"] ) if flags.has_key( "aya_sajda_info" ) \
            else self._defaults["flags"]["aya_sajda_info"]
        annotation_aya = TRUE_FALSE( flags["annotation_aya"] ) if flags.has_key( "annotation_aya" ) \
            else self._defaults["flags"]["annotation_aya"]
        annotation_word = TRUE_FALSE( flags["annotation_word"] ) if flags.has_key( "annotation_word" ) \
            else self._defaults["flags"]["annotation_word"]
    #print query
    #preprocess query: drop backslashes, decode, transliterate plain text
    query = query.replace( "\\", "" )
    if not isinstance( query, unicode ):
        query = unicode( query , 'utf8' )
    # queries without a field prefix (no ":") are transliterated from
    # buckwalter; query-syntax characters are left untouched
    if ":" not in query:
        query = unicode( transliterate( "buckwalter", query, ignore = "'_\"%*?#~[]{}:>+-|" ) )
    #Search -- fuzzy flag selects the fuzzy engine
    SE = self.FQSE if fuzzy else self.QSE
    res, termz = SE.search_all( query ,
                                self._defaults["results_limit"]["aya"],
                                sortedby = sortedby )
    # keep only the first maxkeywords matched terms
    terms = [term[1] for term in list( termz )[:self._defaults["maxkeywords"]]]
    terms_uthmani = map( STANDARD2UTHMANI, terms )
    #pagination -- clamp offset and range, then slice the result list
    offset = 1 if offset < 1 else offset;
    range = self._defaults["minrange"] if range < self._defaults["minrange"] else range;
    range = self._defaults["maxrange"] if range > self._defaults["maxrange"] else range;
    interval_end = offset + range - 1
    end = interval_end if interval_end < len( res ) else len( res )
    start = offset if offset <= len( res ) else -1
    reslist = [] if end == 0 or start == -1 else list( res )[start - 1:end]
    output = {}
    ## disable annotations for aya words if there is more then one result
    if annotation_aya and len ( res ) > 1:
        annotation_aya = False
    #if True: ## strip vocalization when vocalized = true
    V = QArabicSymbolsFilter( \
        shaping = False, \
        tashkil = not vocalized, \
        spellerrors = False, \
        hamza = False \
        ).normalize_all
    # normalizer that always strips tashkil, used as vocalization_dict key
    strip_vocalization = QArabicSymbolsFilter( \
        shaping = False, \
        tashkil = True, \
        spellerrors = False, \
        hamza = False \
        ).normalize_all
    # highlight function that considers None values and non-definition
    H = lambda X: self.QSE.highlight( X, terms, highlight ) if highlight != "none" and X else X if X else u"-----"
    # Numbers are 0 if not defined
    N = lambda X:X if X else 0
    # parse keywords lists , used for Sura names
    kword = re.compile( u"[^,،]+" )
    keywords = lambda phrase: kword.findall( phrase )
    ##########################################
    extend_runtime = res.runtime
    # Words & Annotations: one entry per matched term, plus a query that
    # collects the words for the annotation search below
    words_output = {"individual":{}}
    if word_info:
        matches = 0
        docs = 0
        nb_vocalizations_globale = 0
        cpt = 1;
        annotation_word_query = u"( 0 "
        for term in termz :
            if term[0] == "aya" or term[0] == "aya_":
                if term[2]:
                    matches += term[2]
                    docs += term[3]
                if term[0] == "aya_":
                    annotation_word_query += u" OR word:%s " % term[1]
                else: #if aya
                    annotation_word_query += u" OR normalized:%s " % STANDARD2UTHMANI( term[1] )
                if word_vocalizations:
                    # look up alternative vocalized spellings; guarded
                    # because the dynamic dict may lack the term
                    vocalizations = vocalization_dict[ strip_vocalization( term[1] ) ] if vocalization_dict.has_key( strip_vocalization( term[1] ) ) \
                        else []
                    nb_vocalizations_globale += len( vocalizations )
                if word_synonyms:
                    synonyms = syndict[term[1]] if syndict.has_key( term[1] ) \
                        else []
                if word_derivations:
                    lemma = LOCATE( derivedict["word_"], derivedict["lemma"], term[1] )
                    root = LOCATE( derivedict["word_"], derivedict["root"], term[1] )
                    if lemma: # if different of none
                        derivations = FILTER_DOUBLES( FIND( derivedict["lemma"], derivedict["word_"], lemma ) )
                    else:
                        derivations = []
                words_output[ "individual" ][ cpt ] = {
                    "word":term[1],
                    "romanization": transliterate( romanization, term[1], ignore = "" , reverse = True )
                                    if romanization in self.DOMAINS["romanization"] else None,
                    "nb_matches":term[2],
                    "nb_ayas":term[3],
                    "nb_vocalizations": len( vocalizations ) if word_vocalizations else 0, #unneeded
                    "vocalizations": vocalizations if word_vocalizations else [],
                    "nb_synonyms": len( synonyms ) if word_synonyms else 0, #unneeded
                    "synonyms": synonyms if word_synonyms else [],
                    "lemma": lemma if word_derivations else "",
                    "root": root if word_derivations else "",
                    "nb_derivations": len( derivations ) if word_derivations else 0, #unneeded
                    "derivations": derivations if word_derivations else []
                }
                cpt += 1
        annotation_word_query += u" ) "
        words_output["global"] = {"nb_words":cpt - 1,
                                  "nb_matches":matches,
                                  "nb_vocalizations": nb_vocalizations_globale}
        output["words"] = words_output;
    #Magic_loop to built queries of Adjacents,translations and annotations in the same time
    if prev_aya or next_aya or translation or annotation_aya:
        adja_query = trad_query = annotation_aya_query = u"( 0"
        for r in reslist :
            if prev_aya:
                adja_query += u" OR gid:%s " % unicode( r["gid"] - 1 )
            if next_aya:
                adja_query += u" OR gid:%s " % unicode( r["gid"] + 1 )
            if translation:
                trad_query += u" OR gid:%s " % unicode( r["gid"] )
            if annotation_aya:
                annotation_aya_query += u" OR ( aya_id:%s AND sura_id:%s ) " % ( unicode( r["aya_id"] ) , unicode( r["sura_id"] ) )
        adja_query += u" )"
        trad_query += u" )" + u" AND id:%s " % unicode( translation )
        annotation_aya_query += u" )"
    # Adjacents: sentinel entries cover gid 0 and gid 6237 (out of range)
    if prev_aya or next_aya:
        adja_res = self.QSE.find_extended( adja_query, "gid" )
        adja_ayas = {0:{"aya_":u"----", "uth_":u"----", "sura":u"---", "aya_id":0},
                     6237:{"aya_":u"----", "uth_":u"----", "sura":u"---", "aya_id":9999}}
        for adja in adja_res:
            adja_ayas[adja["gid"]] = {"aya_":adja["aya_"],
                                      "uth_":adja["uth_"],
                                      "aya_id":adja["aya_id"],
                                      "sura":adja["sura"]}
        extend_runtime += adja_res.runtime
    #translations
    if translation:
        trad_res = self.TSE.find_extended( trad_query, "gid" )
        extend_runtime += trad_res.runtime
        trad_text = {}
        for tr in trad_res:
            trad_text[tr["gid"]] = tr["text"]
    #annotations for aya words
    if annotation_aya or ( annotation_word and word_info ) :
        annotation_word_query = annotation_word_query if annotation_word and word_info else u"()"
        annotation_aya_query = annotation_aya_query if annotation_aya else u"()"
        annotation_query = annotation_aya_query + u" OR " + annotation_word_query
        #print annotation_query.encode( "utf-8" )
        annot_res = self.WSE.find_extended( annotation_query, "gid" )
        extend_runtime += annot_res.runtime
        ## prepare annotations for use:
        ## by normalized word -> word -> order, and by (sura_id, aya_id)
        annotations_by_word = {}
        annotations_by_position = {}
        for annot in annot_res:
            if ( annotation_word and word_info ) :
                if annot["normalized"] in terms_uthmani:
                    if annotations_by_word.has_key( annot["normalized"] ):
                        if annotations_by_word[annot["normalized"]].has_key( annot["word"] ):
                            annotations_by_word[annot["normalized"]][annot["word"]][annot["order"]] = annot;
                        else:
                            annotations_by_word[annot["normalized"]][annot["word"]] = { annot["order"]: annot} ;
                    else:
                        annotations_by_word[annot["normalized"]] = { annot["word"]: { annot["order"]: annot}}
            if annotation_aya:
                if annotations_by_position.has_key( ( annot["sura_id"], annot["aya_id"] ) ):
                    annotations_by_position[( annot["sura_id"], annot["aya_id"] )][annot["word_id"]] = annot
                else:
                    annotations_by_position[( annot["sura_id"], annot["aya_id"] )] = { annot["word_id"]: annot }
    ## merge word annotations to word output
    if ( annotation_word and word_info ):
        for cpt in xrange( 1, len( output["words"]["individual"] ) + 1 ):
            current_word = STANDARD2UTHMANI( output["words"]["individual"][cpt]["word"] )
            #print current_word.encode( "utf-8" ), "=>", annotations_by_word, "=>", list( annot_res )
            if annotations_by_word.has_key( current_word ):
                current_word_annotations = annotations_by_word[ current_word ]
                output["words"]["individual"][cpt]["annotations"] = current_word_annotations
                output["words"]["individual"][cpt]["nb_annotations"] = len ( current_word_annotations )
    output["runtime"] = round( extend_runtime, 5 )
    # interval block: page numbers rely on Python 2 integer division
    output["interval"] = {
        "start":start,
        "end":end,
        "total": len( res ),
        "page": ( ( start - 1 ) / range ) + 1,
        "nb_pages": ( ( len( res ) - 1 ) / range ) + 1
    }
    output["translation_info"] = {}
    ### Ayas: one record per result row
    cpt = start - 1
    output["ayas"] = {}
    for r in reslist :
        cpt += 1
        output["ayas"][ cpt ] = {
            "identifier": {"gid":r["gid"],
                           "aya_id":r["aya_id"],
                           "sura_id":r["sura_id"],
                           "sura_name":keywords( r["sura"] )[0],
                           },
            "aya":{
                "id":r["aya_id"],
                "text": H( V( r["aya_"] ) ) if script == "standard" else H( r["uth_"] ) ,
                "text_no_highlight": V( r["aya_"] ) if script == "standard" else r["uth_"],
                "translation": trad_text[r["gid"]] if ( translation != "None" and translation and trad_text.has_key( r["gid"] ) ) else None,
                "recitation": None if not recitation or not self._recitations.has_key( recitation ) \
                    else u"http://www.everyayah.com/data/" + self._recitations[recitation]["subfolder"].encode( "utf-8" ) + "/%03d%03d.mp3" % ( r["sura_id"], r["aya_id"] ),
                "prev_aya":{
                    "id":adja_ayas[r["gid"] - 1]["aya_id"],
                    "sura":adja_ayas[r["gid"] - 1]["sura"],
                    "text": V( adja_ayas[r["gid"] - 1]["aya_"] ) if script == "standard" else adja_ayas[r["gid"] - 1]["uth_"] ,
                } if prev_aya else None ,
                "next_aya":{
                    "id":adja_ayas[r["gid"] + 1]["aya_id"],
                    "sura":adja_ayas[r["gid"] + 1]["sura"],
                    "text": V( adja_ayas[r["gid"] + 1]["aya_"] ) if script == "standard" else adja_ayas[r["gid"] + 1]["uth_"] ,
                } if next_aya else None ,
            },
            "sura": {} if not sura_info else {
                "name":keywords( r["sura"] )[0] ,
                "id":r["sura_id"],
                "type": r["sura_type"] ,
                "order":r["sura_order"],
                "ayas":r["s_a"],
                "stat":{} if not sura_stat_info else {
                    "words":N( r["s_w"] ),
                    "godnames":N( r["s_g"] ),
                    "letters":N( r["s_l"] )
                }
            },
            "position": {} if not aya_position_info else {
                "manzil":r["manzil"],
                "juz":r["juz"],
                "hizb":r["hizb"],
                "rub":r["rub"] % 4,
                "page":r["page"],
                "page_IN":r["page_IN"],
                "ruku":r["ruku"],
            },
            "theme":{} if not aya_theme_info else {
                "chapter": r["chapter"],
                "topic": r["topic"] ,
                "subtopic": r["subtopic"]
            },
            "stat": {} if not aya_stat_info else {
                "words":N( r["a_w"] ),
                "letters":N( r["a_l"] ),
                "godnames":N( r["a_g"] )
            } ,
            "sajda":{} if not aya_sajda_info else {
                "exist":( r["sajda"] == u"نعم" ),
                "type": r["sajda_type"] if ( r["sajda"] == u"نعم" ) else None,
                "id":N( r["sajda_id"] ) if ( r["sajda"] == u"نعم" ) else None,
            },
            "annotations": {} if not annotation_aya or not annotations_by_position.has_key( ( r["sura_id"], r["aya_id"] ) )
                else annotations_by_position[( r["sura_id"], r["aya_id"] )]
        }
    return output
def _search(self, flags):
    """ return the results of search as json

    @param flags: search options; every key is optional and falls back
        to self._defaults["flags"] (query, sortedby, perpage/range,
        page/offset, highlight, script, vocalized, recitation,
        translation, per-feature info switches, annotations, fuzzy).
    @return: {"search": output} where output holds "words", "runtime",
        "interval", "translation_info" and "ayas".
    """
    #flags -- each flag falls back to the configured default
    query = flags["query"] if flags.has_key(
        "query") else self._defaults["flags"]["query"]
    sortedby = flags["sortedby"] if flags.has_key(
        "sortedby") else self._defaults["flags"]["sortedby"]
    # "perpage" wins over "range"; both mean page size
    range = int(flags["perpage"]) if flags.has_key(
        "perpage") else flags["range"] if flags.has_key(
            "range") else self._defaults["flags"]["range"]
    ## offset = (page-1) * perpage -- mode paging
    offset = ((int(flags["page"]) - 1) * range) + 1 if flags.has_key(
        "page") else int(flags["offset"]) if flags.has_key(
            "offset") else self._defaults["flags"]["offset"]
    highlight = flags["highlight"] if flags.has_key(
        "highlight") else self._defaults["flags"]["highlight"]
    script = flags["script"] if flags.has_key(
        "script") else self._defaults["flags"]["script"]
    vocalized = flags["vocalized"] if flags.has_key(
        "vocalized") else self._defaults["flags"]["vocalized"]
    recitation = flags["recitation"] if flags.has_key(
        "recitation") else self._defaults["flags"]["recitation"]
    translation = flags["translation"] if flags.has_key(
        "translation") else self._defaults["flags"]["translation"]
    prev_aya = flags["prev_aya"] if flags.has_key(
        "prev_aya") else self._defaults["flags"]["prev_aya"]
    next_aya = flags["next_aya"] if flags.has_key(
        "next_aya") else self._defaults["flags"]["next_aya"]
    sura_info = flags["sura_info"] if flags.has_key(
        "sura_info") else self._defaults["flags"]["sura_info"]
    word_info = flags["word_info"] if flags.has_key(
        "word_info") else self._defaults["flags"]["word_info"]
    aya_position_info = flags["aya_position_info"] if flags.has_key(
        "aya_position_info") else self._defaults["flags"]["aya_position_info"]
    aya_theme_info = flags["aya_theme_info"] if flags.has_key(
        "aya_theme_info") else self._defaults["flags"]["aya_theme_info"]
    aya_stat_info = flags["aya_stat_info"] if flags.has_key(
        "aya_stat_info") else self._defaults["flags"]["aya_stat_info"]
    aya_sajda_info = flags["aya_sajda_info"] if flags.has_key(
        "aya_sajda_info") else self._defaults["flags"]["aya_sajda_info"]
    annotation_aya = flags["annotation_aya"] if flags.has_key(
        "annotation_aya") else self._defaults["flags"]["annotation_aya"]
    annotation_word = flags["annotation_word"] if flags.has_key(
        "annotation_word") else self._defaults["flags"]["annotation_word"]
    fuzzy = flags["fuzzy"] if flags.has_key(
        "fuzzy") else self._defaults["flags"]["fuzzy"]

    #Search -- fuzzy flag selects the fuzzy engine
    SE = self.FQSE if fuzzy else self.QSE
    res, termz = SE.search_all(unicode(query.replace("\\", ""), 'utf8'),
                               self._defaults["results_limit"],
                               sortedby=sortedby)
    # TODO: I dont like this termz structure , must change it
    terms = [term[1] for term in list(termz)]
    terms_uthmani = map(STANDARD2UTHMANI, terms)

    #pagination -- clamp offset and range, then slice the result list
    offset = 1 if offset < 1 else offset
    range = self._defaults[
        "maxrange"] if range > self._defaults["maxrange"] else range
    # inclusive end of the interval: `offset + range - 1`, so the slice
    # below yields at most `range` results (consistent with _search_word
    # and _search_aya; the original `offset + range` returned one extra)
    interval_end = offset + range - 1
    end = interval_end if interval_end < len(res) else len(res)
    start = offset if offset <= len(res) else -1
    reslist = [] if end == 0 or start == -1 else list(res)[start - 1:end]
    output = {}

    ## strip vocalization when vocalized = true
    V = QArabicSymbolsFilter( \
        shaping = False, \
        tashkil = not vocalized, \
        spellerrors = False, \
        hamza = False \
        ).normalize_all
    # highlight function that considers None values and non-definition
    H = lambda X: self.QSE.highlight( X, terms, highlight ) if highlight != "none" and X else X if X else u"-----"
    # Numbers are 0 if not defined
    N = lambda X: X if X else 0
    # parse keywords lists , used for Sura names
    kword = re.compile(u"[^,،]+")
    keywords = lambda phrase: kword.findall(phrase)
    # Tamdid devine name to avoid double Shedda on the middle Lam
    Gword_tamdid = lambda aya: aya.replace(u"لَّه", u"لَّـه").replace(
        u"لَّه", u"لَّـه")
    ##########################################
    extend_runtime = res.runtime

    # Words & Annotations: one entry per matched "aya" term, plus a
    # query that collects the words for the annotation search below
    words_output = {}
    if word_info:
        matches = 0
        docs = 0
        nb_vocalizations_globale = 0
        cpt = 1
        annotation_word_query = u"( 0 "
        for term in termz:
            if term[0] == "aya":
                if term[2]:
                    matches += term[2]
                    docs += term[3]
                annotation_word_query += u" OR normalized:%s " % STANDARD2UTHMANI(
                    term[1])
                # guarded lookup: a term may be missing from the dynamic
                # vocalization dict (the unguarded access raised KeyError)
                vocalizations = vocalization_dict[term[1]] \
                    if vocalization_dict.has_key(term[1]) else []
                nb_vocalizations_globale += len(vocalizations)
                words_output[cpt] = {
                    "word": term[1],
                    "nb_matches": term[2],
                    "nb_ayas": term[3],
                    "nb_vocalizations": len(vocalizations),
                    "vocalizations": vocalizations
                }
                cpt += 1
        annotation_word_query += u" ) "
        words_output["global"] = {
            "nb_words": cpt - 1,
            "nb_matches": matches,
            "nb_vocalizations": nb_vocalizations_globale
        }
        output["words"] = words_output

    #Magic_loop to built queries of Adjacents,translations and annotations in the same time
    if prev_aya or next_aya or translation or annotation_aya:
        adja_query = trad_query = annotation_aya_query = u"( 0"
        for r in reslist:
            if prev_aya:
                adja_query += u" OR gid:%s " % unicode(r["gid"] - 1)
            if next_aya:
                adja_query += u" OR gid:%s " % unicode(r["gid"] + 1)
            if translation:
                trad_query += u" OR gid:%s " % unicode(r["gid"])
            if annotation_aya:
                annotation_aya_query += u" OR ( aya_id:%s AND sura_id:%s ) " % (
                    unicode(r["aya_id"]), unicode(r["sura_id"]))
        adja_query += u" )"
        trad_query += u" )" + u" AND id:%s " % unicode(translation)
        annotation_aya_query += u" )"

    # Adjacents: sentinel entries cover gid 0 and gid 6237 (out of range)
    if prev_aya or next_aya:
        adja_res = self.QSE.find_extended(adja_query, "gid")
        adja_ayas = {
            0: {"aya_": u"----", "uth_": u"----", "sura": u"---", "aya_id": 0},
            6237: {"aya_": u"----", "uth_": u"----", "sura": u"---", "aya_id": 9999}
        }
        for adja in adja_res:
            adja_ayas[adja["gid"]] = {
                "aya_": adja["aya_"],
                "uth_": adja["uth_"],
                "aya_id": adja["aya_id"],
                "sura": adja["sura"]
            }
        extend_runtime += adja_res.runtime

    #translations
    if translation:
        trad_res = self.TSE.find_extended(trad_query, "gid")
        extend_runtime += trad_res.runtime
        trad_text = {}
        for tr in trad_res:
            trad_text[tr["gid"]] = tr["text"]

    #annotations for aya words
    if annotation_aya or (annotation_word and word_info):
        annotation_word_query = annotation_word_query if annotation_word and word_info else u"()"
        annotation_aya_query = annotation_aya_query if annotation_aya else u"()"
        annotation_query = annotation_aya_query + u" OR " + annotation_word_query
        annot_res = self.WSE.find_extended(annotation_query, "gid")
        extend_runtime += annot_res.runtime
        ## prepare annotations for use:
        ## by normalized word -> word, and by (sura_id, aya_id) position
        annotations_by_word = {}
        annotations_by_position = {}
        for annot in annot_res:
            if (annotation_word and word_info):
                if annot["normalized"] in terms_uthmani:
                    if annotations_by_word.has_key(annot["normalized"]):
                        annotations_by_word[annot["normalized"]][
                            annot["word"]] = annot
                    else:
                        annotations_by_word[annot["normalized"]] = {
                            annot["word"]: annot
                        }
            if annotation_aya:
                if annotations_by_position.has_key(
                        (annot["sura_id"], annot["aya_id"])):
                    annotations_by_position[(
                        annot["sura_id"],
                        annot["aya_id"])][annot["word_id"]] = annot
                else:
                    annotations_by_position[(annot["sura_id"],
                                             annot["aya_id"])] = {
                        annot["word_id"]: annot
                    }

    ## merge word annotations to word output
    if (annotation_word and word_info):
        for cpt in xrange(1, len(termz) + 1):
            # only "aya" terms received an entry above; skip the rest
            # (the unguarded access raised KeyError on mixed termz)
            if not output["words"].has_key(cpt):
                continue
            current_word = STANDARD2UTHMANI(output["words"][cpt]["word"])
            if annotations_by_word.has_key(current_word):
                current_word_annotations = annotations_by_word[
                    current_word]
                output["words"][cpt][
                    "annotations"] = current_word_annotations
                output["words"][cpt]["nb_annotations"] = len(
                    current_word_annotations)

    output["runtime"] = extend_runtime
    output["interval"] = {"start": start, "end": end, "total": len(res)}
    output["translation_info"] = {}

    ### Ayas: one record per result row
    cpt = start - 1
    output["ayas"] = {}
    for r in reslist:
        cpt += 1
        output["ayas"][cpt] = {
            "identifier": {
                "gid": r["gid"],
                "aya_id": r["aya_id"],
                "sura_id": r["sura_id"],
                "sura_name": keywords(r["sura"])[0],
            },
            "aya": {
                "id": r["aya_id"],
                "text": Gword_tamdid(H(V(r["aya_"])))
                        if script == "standard" else Gword_tamdid(H(r["uth_"])),
                "translation": trad_text[r["gid"]] if (
                    translation != "None" and translation
                    and trad_text.has_key(r["gid"])) else None,
                # guarded against unknown recitation ids (the unguarded
                # lookup raised KeyError, as _search_aya already avoids)
                "recitation": None
                    if not recitation or not self._recitations.has_key(recitation)
                    else u"http://www.everyayah.com/data/" +
                        self._recitations[recitation]["subfolder"].encode("utf-8") +
                        "/%03d%03d.mp3" % (r["sura_id"], r["aya_id"]),
                "prev_aya": {
                    "id": adja_ayas[r["gid"] - 1]["aya_id"],
                    "sura": adja_ayas[r["gid"] - 1]["sura"],
                    "text": Gword_tamdid(V(adja_ayas[r["gid"] - 1]["aya_"]))
                            if script == "standard"
                            else Gword_tamdid(adja_ayas[r["gid"] - 1]["uth_"]),
                } if prev_aya else None,
                "next_aya": {
                    "id": adja_ayas[r["gid"] + 1]["aya_id"],
                    "sura": adja_ayas[r["gid"] + 1]["sura"],
                    "text": Gword_tamdid(V(adja_ayas[r["gid"] + 1]["aya_"]))
                            if script == "standard"
                            else Gword_tamdid(adja_ayas[r["gid"] + 1]["uth_"]),
                } if next_aya else None,
            },
            "sura": {} if not sura_info else {
                "name": keywords(r["sura"])[0],
                "id": r["sura_id"],
                "type": r["sura_type"],
                "order": r["sura_order"],
                "stat": {
                    "ayas": r["s_a"],
                    "words": N(r["s_w"]),
                    "godnames": N(r["s_g"]),
                    "letters": N(r["s_l"])
                }
            },
            "position": {} if not aya_position_info else {
                "manzil": r["manzil"],
                "hizb": r["hizb"],
                "rub": r["rub"] % 4,
                "page": r["page"],
                "ruku": r["ruku"],
            },
            "theme": {} if not aya_theme_info else {
                "chapter": r["chapter"],
                "topic": r["topic"],
                "subtopic": r["subtopic"]
            },
            "stat": {} if not aya_stat_info else {
                "words": N(r["a_w"]),
                "letters": N(r["a_l"]),
                "godnames": N(r["a_g"])
            },
            "sajda": {} if not aya_sajda_info else {
                "exist": (r["sajda"] == u"نعم"),
                "type": r["sajda_type"] if (r["sajda"] == u"نعم") else None,
                "id": N(r["sajda_id"]) if (r["sajda"] == u"نعم") else None,
            },
            "annotations": {} if not annotation_aya
                or not annotations_by_position.has_key(
                    (r["sura_id"], r["aya_id"]))
                else annotations_by_position[(r["sura_id"], r["aya_id"])]
        }
    return {"search": output}
# coding: utf-8 """ This is a test module for alfanous.TextProcessing """ from alfanous.TextProcessing import QArabicSymbolsFilter, unicode_ if __name__ == "__main__": ASF = QArabicSymbolsFilter() TEXT = u"عاصِمٌ" TEXT = ASF.normalize_all(TEXT) print TEXT WORD1 = unicode_(u"عَاصِمُ") WORD2 = unicode_(u"عَاصمُ") LIST_HARAKAT1 = WORD1.list_harakat() LIST_HARAKAT2 = WORD2.list_harakat() WORD3 = unicode_(u"فاعل") PHRASE = unicode_(u"كانَ") print WORD3.apply_harakat_list(LIST_HARAKAT1) print LIST_HARAKAT1, "\n", LIST_HARAKAT2 print unicode_.compare_harakat(LIST_HARAKAT1, LIST_HARAKAT2) print WORD1.shakl_compare(WORD1, WORD2) for i in PHRASE.tokenize_shakl(): print i, WORD4 = unicode_(u"عاصم") WORD5 = unicode_(u"عاصِم") print WORD4 == WORD5
def __init__( self, QC_PATH = "../../store/quranic-corpus-morpology.xml", DB = "main.db" ):
    """ make word table

    Rebuild the ``wordQC`` table in the SQLite database ``DB`` from the
    Qurany Corpus morphology XML file at ``QC_PATH``: one row is inserted
    per morphological segment (``base`` entry) of every word in the corpus.

    @param QC_PATH: path of the Qurany Corpus morphology XML source file.
    @param DB: path of the SQLite database file to (re)populate.
    """
    import sqlite3
    print "connecting to database ...",
    maindb = sqlite3.connect( DB )
    cur = maindb.cursor()
    print "OK"
    print "creating tables:"
    # Drop and recreate the table so repeated runs start from a clean slate.
    # 'order' and 'case' are SQL keywords, hence the quoted column names.
    cur.execute( """ drop table if exists wordQC""" )
    cur.execute( """ create table if not exists wordQC( gid int unique, word_gid int, word_id int, aya_id int, sura_id int, word varchar(25), normalised varchar(25), spelled varchar(25), 'order' int, token varchar(25), arabictoken varchar(25), prefixes varchar(25), suffixes varchar(25), pos varchar(25), type varchar(25), arabicpos varchar(25), mood varchar(25), arabicmood varchar(25), 'case' varchar(25), arabiccase varchar(25), root varchar(25), arabicroot varchar(25), lemma varchar(25), arabiclemma varchar(25), special varchar(25), arabicspecial varchar(25), derivation varchar(25), form varchar(25), gender varchar(25), person varchar(25), number varchar(25), voice varchar(25), state varchar(25), aspect varchar(25), primary key(gid) ) """ )
    print ">wordQC table ... OK"
    print ">loading Qurany Corpus...",
    from PyCorpus.QuranyCorpus import API as QC
    A = QC( source = QC_PATH )
    print ".OK\n"
    # Helper: fetch an optional attribute from a morphology dict as a
    # UTF-8 byte string, or "" when the attribute is absent.
    IFEXIST = lambda d, attrib: d[attrib].encode( "utf-8" ) if attrib in d else ""
    # gid: global row counter (primary key); word_gid: per-word counter.
    gid, word_gid = 0, 0
    print ">inserting values of gid...",
    for iteration in A.all_words_generator():
        # Normalization filters: QASF strips shaping + tashkil only;
        # QASF_spelled additionally folds spelling variants and hamza forms.
        # NOTE(review): both filters are loop-invariant and could be built
        # once before the loop — confirm before changing.
        QASF = QArabicSymbolsFilter( shaping = True, tashkil = True, spellerrors = False, hamza = False, uthmani_symbols = True )
        QASF_spelled = QArabicSymbolsFilter( shaping = True, tashkil = True, spellerrors = True, hamza = True, uthmani_symbols = True )
        # QUERY builds the INSERT statement by %-interpolation at call time.
        # It closes over gid, word_gid, order, iteration, QASF, QASF_spelled,
        # so it must only be called inside the inner loop below, after those
        # are set for the current segment.
        # NOTE(review): string-built SQL — safe only while the corpus data is
        # trusted; parameterized queries would be the robust fix.
        QUERY = lambda d, glob: """insert into wordQC(gid,word_gid,word_id,aya_id,sura_id,'order',token,arabictoken,prefixes, suffixes,type,pos,arabicpos,mood, arabicmood, 'case', arabiccase, root ,arabicroot, lemma ,arabiclemma, special, arabicspecial, word,normalised,spelled, derivation, form ,gender, person, number,voice, state, aspect) values ("%(gid)d","%(word_gid)d","%(word_id)d","%(aya_id)d","%(sura_id)d","%(order)d","%(token)s","%(arabictoken)s", "%(prefixes)s", "%(suffixes)s", "%(type)s","%(pos)s","%(arabicpos)s","%(mood)s","%(arabicmood)s", "%(case)s","%(arabiccase)s","%(root)s","%(arabicroot)s","%(lemma)s","%(arabiclemma)s","%(special)s","%(arabicspecial)s","%(word)s","%(normalised)s","%(spelled)s", "%(derivation)s","%(form)s","%(gender)s","%(person)s","%(number)s","%(voice)s","%(state)s","%(aspect)s")""" % {
            "gid":gid,
            "word_gid":word_gid,
            "word_id":iteration["word_id"],
            "aya_id":iteration["aya_id"],
            "sura_id":iteration["sura_id"],
            "order":order,
            "token":IFEXIST( d, "token" ),
            "arabictoken":IFEXIST( d, "arabictoken" ),
            # Prefixes/suffixes come from the word-level morphology dict
            # (glob), joined into a single ";"-separated UTF-8 string.
            "prefixes":";".join([prefix["arabictoken"] for prefix in glob["prefixes"] ]).encode( "utf-8" ),
            "suffixes":";".join([suffix["arabictoken"] for suffix in glob["suffixes"] ]).encode( "utf-8" ),
            "type":IFEXIST( d, "type" ),
            "pos":IFEXIST( d, "pos" ),
            "arabicpos":IFEXIST( d, "arabicpos" ),
            "mood":IFEXIST( d, "mood" ),
            "arabicmood":IFEXIST( d, "arabicmood" ),
            "case":IFEXIST( d, "case" ),
            "arabiccase":IFEXIST( d, "arabiccase" ),
            "root":IFEXIST( d, "root" ),
            "arabicroot":IFEXIST( d, "arabicroot" ),
            "lemma":IFEXIST( d, "lemma" ),
            "arabiclemma":IFEXIST( d, "arabiclemma" ),
            "special":IFEXIST( d, "special" ),
            "arabicspecial":IFEXIST( d, "arabicspecial" ),
            "word":iteration["word"].encode( "utf-8" ),
            "normalised": QASF.normalize_all( iteration["word"] ).encode( "utf-8" ),
            "spelled": QASF_spelled.normalize_all( iteration["word"] ).encode( "utf-8" ),
            "derivation":IFEXIST( d, "derivation" ),
            "form":IFEXIST( d, "form" ),
            "gender":IFEXIST( d, "gender" ),
            "person":IFEXIST( d, "person" ),
            "number":IFEXIST( d, "number" ),
            "voice":IFEXIST( d, "voice" ),
            "state":IFEXIST( d, "state" ),
            "aspect":IFEXIST( d, "aspect" )
        }
        word_gid += 1
        # Progress indicator: print the word counter every 1000 words.
        if word_gid % 1000 == 0:
            print word_gid,
            print("\n")
        # order: 1-based position of the segment within the current word.
        order = 0
        for d in iteration["morphology"]["base"]:
            gid += 1
            order += 1
            cur.execute( QUERY( d, iteration["morphology"] ) )
    print("OK")
    # Commit once at the end; a single transaction covers all inserts.
    maindb.commit()