class SpellErrors(QMultiTerm):
    """ query that ignores the spell errors of arabic letters such as:

        - ta' marbuta and ha'
        - alef maqsura and ya'
        - hamza forms
    """

    def __init__(self, fieldname, text, boost=1.0):
        self.fieldname = fieldname
        self.text = text
        self.boost = boost
        self.words = [text]
        self.ASF = QArabicSymbolsFilter(shaping=True, tashkil=False,
                                        spellerrors=True, hamza=True)

    def _words(self, ixreader):
        for field, indexed_text in ixreader.all_terms():
            if field == self.fieldname:
                if self._compare(self.text, indexed_text):
                    yield indexed_text

    def _compare(self, first, second):
        """ normalize both words, then compare """
        equiv = (self.ASF.normalize_all(first)
                 == self.ASF.normalize_all(second))
        if equiv:
            self.words.append(second)
        return equiv
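# A minimal usage sketch (hypothetical): assumes `ix` is an already-open
# Whoosh index whose reader exposes all_terms(), which is all this class
# relies on.
query = SpellErrors(u"aya", u"رحمه")
reader = ix.reader()
variants = list(query._words(reader))   # e.g. may include u"رحمة" as well
reader.close()
print u", ".join(variants)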
def __init__(self, fieldname, text, boost=1.0):
    self.fieldname = fieldname
    self.text = text
    self.boost = boost
    ASF = QArabicSymbolsFilter(shaping=False, tashkil=True,
                               spellerrors=False, hamza=False)
    self.words = [ASF.normalize_all(word) for word in text]
def __init__(self, fieldname, text, boost=1.0):
    self.fieldname = fieldname
    self.text = text
    self.boost = boost
    self.words = [text]
    self.ASF = QArabicSymbolsFilter(shaping=True, tashkil=False,
                                    spellerrors=True, hamza=True)
def transfer_vocalizations(self):
    """ load indexed vocalized words from the main index and save them
        as a list in a dynamic py """
    QSE = QuranicSearchEngine(self.__ixpath)
    if QSE.OK:
        mfw = QSE.most_frequent_words(9999999, "aya_")
    else:
        mfw = []
    V = QArabicSymbolsFilter(shaping=False, tashkil=True,
                             spellerrors=False, hamza=False).normalize_all
    vocalization_dict = {}
    for w in mfw:
        word = w[1]
        if V(word) in vocalization_dict:
            vocalization_dict[V(word)].append(word)
        else:
            vocalization_dict[V(word)] = [word]
    raw_str = self.dheader + u"\nvocalization_dict=" \
        + str(vocalization_dict).replace(",", ",\n")
    fich = open(self.__dypypath + "vocalizations_dyn.py", "w+")
    fich.write(raw_str)
    return raw_str
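# For illustration, the generated vocalizations_dyn.py maps each
# tashkil-stripped form to every vocalized spelling seen in the index.
# A hand-written sample with hypothetical entries:
vocalization_dict = {
    u"عاصم": [u"عَاصِمٌ", u"عَاصِمُ"],
    u"كتب": [u"كَتَبَ", u"كُتُبٌ"],
}

# consumers look up the stripped form to recover its vocalized variants
print vocalization_dict.get(u"عاصم", [])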
def make_spellerrors_dict(self):
    """ make the spell errors dictionary

        @deprecated: forget this!
    """
    D = QseDocIndex()
    R = QReader(D)
    nor = QArabicSymbolsFilter(True, True, True, True).normalize_all
    spell_err = {}
    for term in R.reader.all_terms():
        if term[0] in ["aya"]:
            normalized = nor(term[1])
            if normalized in spell_err:
                spell_err[normalized].append(term[1])
            else:
                spell_err[normalized] = [term[1]]
    raw_str = self.dheader + u"\nspell_err=" + str(spell_err)
    fich = open(self.__dypypath + "spellerrors_dyn.py", "w+")
    fich.write(raw_str)
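# For reference (the method above is marked deprecated), the emitted
# spellerrors_dyn.py holds a dict keyed by the fully normalized form;
# the entry below is hypothetical:
spell_err = {
    u"رحمه": [u"رحمة", u"رحمه"],  # ta' marbuta and ha' collapse to one key
}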
def test_arabic_symbol_filter():
    ASF = QArabicSymbolsFilter()
    assert ASF.normalize_all(u"عاصِمٌ") == u"عاصم"
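# A further case could pin down the spell-error flags documented on
# SpellErrors above; the expected equality is an assumption about the
# filter's behaviour, not a recorded fixture.
def test_arabic_symbol_filter_spellerrors():
    ASF = QArabicSymbolsFilter(shaping=True, tashkil=True,
                               spellerrors=True, hamza=True)
    assert ASF.normalize_all(u"رحمة") == ASF.normalize_all(u"رحمه")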
def __init__(self, QC_PATH="../../../store/quranic-corpus-morpology.xml",
             DB="main.db"):
    import sqlite3

    print "connecting to database ...",
    maindb = sqlite3.connect(DB)
    cur = maindb.cursor()
    print "OK"

    print "creating tables:"
    cur.execute("""drop table if exists wordQC""")
    cur.execute("""
        create table if not exists wordQC(
            gid int unique,
            word_gid int,
            word_id int,
            aya_id int,
            sura_id int,
            word varchar(25),
            normalised varchar(25),
            spelled varchar(25),
            'order' int,
            token varchar(25),
            arabictoken varchar(25),
            prefixes varchar(25),
            suffixes varchar(25),
            pos varchar(25),
            type varchar(25),
            arabicpos varchar(25),
            mood varchar(25),
            arabicmood varchar(25),
            'case' varchar(25),
            arabiccase varchar(25),
            root varchar(25),
            arabicroot varchar(25),
            lemma varchar(25),
            arabiclemma varchar(25),
            special varchar(25),
            arabicspecial varchar(25),
            derivation varchar(25),
            form varchar(25),
            gender varchar(25),
            person varchar(25),
            number varchar(25),
            voice varchar(25),
            state varchar(25),
            aspect varchar(25),
            primary key(gid)
        )
    """)
    print ">wordQC table ... OK"

    print ">loading Qurany Corpus...",
    from quran_corpus_reader.main import API as QC
    A = QC(source=QC_PATH)
    print ".OK\n"

    # fetch an attribute as utf-8, or an empty string when it is missing
    IFEXIST = lambda d, attrib: d[attrib].encode("utf-8") if attrib in d else ""

    # both filters are loop-invariant, so build them once
    QASF = QArabicSymbolsFilter(shaping=True, tashkil=True,
                                spellerrors=False, hamza=False,
                                uthmani_symbols=True)
    QASF_spelled = QArabicSymbolsFilter(shaping=True, tashkil=True,
                                        spellerrors=True, hamza=True,
                                        uthmani_symbols=True)

    gid, word_gid = 0, 0
    print ">inserting values of gid...",
    for iteration in A.all_words_generator():
        # values are interpolated directly into the SQL text; acceptable
        # here only because the corpus is a local, trusted source
        QUERY = lambda d, glob: """insert into wordQC(
            gid, word_gid, word_id, aya_id, sura_id, 'order', token,
            arabictoken, prefixes, suffixes, type, pos, arabicpos,
            mood, arabicmood, 'case', arabiccase, root, arabicroot,
            lemma, arabiclemma, special, arabicspecial, word, normalised,
            spelled, derivation, form, gender, person, number, voice,
            state, aspect)
            values ("%(gid)d", "%(word_gid)d", "%(word_id)d", "%(aya_id)d",
            "%(sura_id)d", "%(order)d", "%(token)s", "%(arabictoken)s",
            "%(prefixes)s", "%(suffixes)s", "%(type)s", "%(pos)s",
            "%(arabicpos)s", "%(mood)s", "%(arabicmood)s", "%(case)s",
            "%(arabiccase)s", "%(root)s", "%(arabicroot)s", "%(lemma)s",
            "%(arabiclemma)s", "%(special)s", "%(arabicspecial)s",
            "%(word)s", "%(normalised)s", "%(spelled)s", "%(derivation)s",
            "%(form)s", "%(gender)s", "%(person)s", "%(number)s",
            "%(voice)s", "%(state)s", "%(aspect)s")""" % {
                "gid": gid,
                "word_gid": word_gid,
                "word_id": iteration["word_id"],
                "aya_id": iteration["aya_id"],
                "sura_id": iteration["sura_id"],
                "order": order,
                "token": IFEXIST(d, "token"),
                "arabictoken": IFEXIST(d, "arabictoken"),
                "prefixes": ";".join(
                    [prefix["arabictoken"]
                     for prefix in glob["prefixes"]]).encode("utf-8"),
                "suffixes": ";".join(
                    [suffix["arabictoken"]
                     for suffix in glob["suffixes"]]).encode("utf-8"),
                "type": IFEXIST(d, "type"),
                "pos": IFEXIST(d, "pos"),
                "arabicpos": IFEXIST(d, "arabicpos"),
                "mood": IFEXIST(d, "mood"),
                "arabicmood": IFEXIST(d, "arabicmood"),
                "case": IFEXIST(d, "case"),
                "arabiccase": IFEXIST(d, "arabiccase"),
                "root": IFEXIST(d, "root"),
                "arabicroot": IFEXIST(d, "arabicroot"),
                "lemma": IFEXIST(d, "lemma"),
                "arabiclemma": IFEXIST(d, "arabiclemma"),
                "special": IFEXIST(d, "special"),
                "arabicspecial": IFEXIST(d, "arabicspecial"),
                "word": iteration["word"].encode("utf-8"),
                "normalised":
                    QASF.normalize_all(iteration["word"]).encode("utf-8"),
                "spelled":
                    QASF_spelled.normalize_all(
                        iteration["word"]).encode("utf-8"),
                "derivation": IFEXIST(d, "derivation"),
                "form": IFEXIST(d, "form"),
                "gender": IFEXIST(d, "gender"),
                "person": IFEXIST(d, "person"),
                "number": IFEXIST(d, "number"),
                "voice": IFEXIST(d, "voice"),
                "state": IFEXIST(d, "state"),
                "aspect": IFEXIST(d, "aspect")
            }

        word_gid += 1
        if word_gid % 1000 == 0:
            print word_gid,
            print("\n")

        order = 0
        for d in iteration["morphology"]["base"]:
            gid += 1
            order += 1
            cur.execute(QUERY(d, iteration["morphology"]))

    print("OK")
    maindb.commit()
# coding: utf-8
""" This is a test module for alfanous.TextProcessing """
from __future__ import print_function

from alfanous.text_processing import QArabicSymbolsFilter, unicode_

if __name__ == "__main__":
    ASF = QArabicSymbolsFilter()
    TEXT = "عاصِمٌ"
    TEXT = ASF.normalize_all(TEXT)
    print(TEXT)

    WORD1 = unicode_("عَاصِمُ")
    WORD2 = unicode_("عَاصمُ")
    LIST_HARAKAT1 = WORD1.list_harakat()
    LIST_HARAKAT2 = WORD2.list_harakat()
    WORD3 = unicode_("فاعل")
    PHRASE = unicode_("كانَ")

    print(WORD3.apply_harakat_list(LIST_HARAKAT1))
    print(LIST_HARAKAT1, "\n", LIST_HARAKAT2)
    print(unicode_.compare_harakat(LIST_HARAKAT1, LIST_HARAKAT2))
    print(WORD1.shakl_compare(WORD1, WORD2))

    for i in PHRASE.tokenize_shakl():
        print(i, end='')

    WORD4 = unicode_("عاصم")
    WORD5 = unicode_("عاصِم")
    print(WORD4 == WORD5)
def _search_aya(self, flags):
    """ return the results of aya search as a dictionary data structure """
    # flags
    query = flags["query"] if "query" in flags \
        else self._defaults["flags"]["query"]
    sortedby = flags["sortedby"] if "sortedby" in flags \
        else self._defaults["flags"]["sortedby"]
    range = int(flags["perpage"]) if "perpage" in flags \
        else flags["range"] if "range" in flags \
        else self._defaults["flags"]["range"]
    ## paging mode: offset = (page - 1) * perpage + 1
    offset = ((int(flags["page"]) - 1) * range) + 1 if "page" in flags \
        else int(flags["offset"]) if "offset" in flags \
        else self._defaults["flags"]["offset"]
    recitation = flags["recitation"] if "recitation" in flags \
        else self._defaults["flags"]["recitation"]
    translation = flags["translation"] if "translation" in flags \
        else self._defaults["flags"]["translation"]
    romanization = flags["romanization"] if "romanization" in flags \
        else self._defaults["flags"]["romanization"]
    highlight = flags["highlight"] if "highlight" in flags \
        else self._defaults["flags"]["highlight"]
    script = flags["script"] if "script" in flags \
        else self._defaults["flags"]["script"]
    vocalized = IS_FLAG(flags, 'vocalized')
    fuzzy = IS_FLAG(flags, 'fuzzy')
    view = flags["view"] if "view" in flags \
        else self._defaults["flags"]["view"]

    # pre-defined views
    if view == "minimal":
        vocalized = False
        recitation = None
        translation = None
        prev_aya = next_aya = False
        sura_info = False
        word_info = False
        word_synonyms = False
        word_derivations = False
        word_vocalizations = False
        aya_position_info = aya_theme_info = aya_sajda_info = False
        aya_stat_info = False
        sura_stat_info = False
        annotation_aya = annotation_word = False
    elif view == "normal":
        prev_aya = next_aya = True
        sura_info = True
        word_info = True
        word_synonyms = False
        word_derivations = True
        word_vocalizations = True
        aya_position_info = aya_theme_info = aya_sajda_info = True
        aya_stat_info = True
        sura_stat_info = False
        annotation_aya = annotation_word = False
    elif view == "full":
        prev_aya = next_aya = True
        sura_info = True
        word_info = True
        word_synonyms = True
        word_derivations = True
        word_vocalizations = True
        aya_position_info = aya_theme_info = aya_sajda_info = True
        aya_stat_info = sura_stat_info = True
        annotation_aya = annotation_word = True
        romanization = "iso"
    elif view == "statistic":
        prev_aya = next_aya = False
        sura_info = True
        word_info = True
        word_synonyms = False
        word_derivations = True
        word_vocalizations = True
        aya_position_info = True
        aya_theme_info = aya_sajda_info = False
        aya_stat_info = True
        sura_stat_info = True
        annotation_aya = False
        annotation_word = False
    elif view == "linguistic":
        prev_aya = next_aya = False
        sura_info = False
        word_info = True
        word_synonyms = True
        word_derivations = True
        word_vocalizations = True
        aya_position_info = False
        aya_theme_info = aya_sajda_info = True
        aya_stat_info = False
        sura_stat_info = False
        annotation_aya = True
        annotation_word = True
        romanization = "buckwalter"
    elif view == "recitation":
        script = "uthmani"
        prev_aya = next_aya = True
        sura_info = True
        word_info = False
        word_synonyms = False
        word_derivations = False
        word_vocalizations = False
        aya_position_info = True
        aya_theme_info = False
        aya_sajda_info = True
        aya_stat_info = False
        sura_stat_info = False
        annotation_aya = False
        annotation_word = False
    else:  # view == custom or undefined
        prev_aya = IS_FLAG(flags, 'prev_aya')
        next_aya = IS_FLAG(flags, 'next_aya')
        sura_info = IS_FLAG(flags, 'sura_info')
        sura_stat_info = IS_FLAG(flags, 'sura_stat_info')
        word_info = IS_FLAG(flags, 'word_info')
        word_synonyms = IS_FLAG(flags, 'word_synonyms')
        word_derivations = IS_FLAG(flags, 'word_derivations')
        word_vocalizations = IS_FLAG(flags, 'word_vocalizations')
        aya_position_info = IS_FLAG(flags, 'aya_position_info')
        aya_theme_info = IS_FLAG(flags, 'aya_theme_info')
        aya_stat_info = IS_FLAG(flags, 'aya_stat_info')
        aya_sajda_info = IS_FLAG(flags, 'aya_sajda_info')
        annotation_aya = IS_FLAG(flags, 'annotation_aya')
        annotation_word = IS_FLAG(flags, 'annotation_word')

    # preprocess query
    query = query.replace("\\", "")
    if not isinstance(query, unicode):
        query = unicode(query, 'utf8')
    if ":" not in query:
        query = unicode(
            transliterate("buckwalter", query, ignore="'_\"%*?#~[]{}:>+-|"))

    # search
    SE = self.FQSE if fuzzy else self.QSE
    res, termz, searcher = SE.search_all(
        query, self._defaults["results_limit"]["aya"], sortedby=sortedby)
    terms = [
        term[1] for term in list(termz)[:self._defaults["maxkeywords"]]
    ]
    terms_uthmani = map(STANDARD2UTHMANI, terms)

    # pagination
    offset = 1 if offset < 1 else offset
    range = self._defaults["minrange"] \
        if range < self._defaults["minrange"] else range
    range = self._defaults["maxrange"] \
        if range > self._defaults["maxrange"] else range
    interval_end = offset + range - 1
    end = interval_end if interval_end < len(res) else len(res)
    start = offset if offset <= len(res) else -1
    reslist = [] if end == 0 or start == -1 else list(res)[start - 1:end]
    # TODO: pagination should be done inside the search operation
    # for better performance

    # closing the searcher
    searcher.close()

    output = {}

    ## disable annotations for aya words if there is more than one result
    if annotation_aya and len(res) > 1:
        annotation_aya = False

    ## strip vocalization unless vocalized == True
    V = QArabicSymbolsFilter(shaping=False, tashkil=not vocalized,
                             spellerrors=False, hamza=False).normalize_all
    strip_vocalization = QArabicSymbolsFilter(
        shaping=False, tashkil=True, spellerrors=False,
        hamza=False).normalize_all
    # highlight function that handles None values and undefined fields
    H = lambda X: self.QSE.highlight(X, terms, highlight) \
        if highlight != "none" and X else X if X else u"-----"
    # numbers default to 0 when undefined
    N = lambda X: X if X else 0
    # parse keyword lists, used for sura names
    kword = re.compile(u"[^,،]+")
    keywords = lambda phrase: kword.findall(phrase)

    extend_runtime = res.runtime

    # words & annotations
    words_output = {"individual": {}}
    if word_info:
        matches = 0
        docs = 0
        nb_vocalizations_globale = 0
        cpt = 1
        annotation_word_query = u"( 0 "
        for term in termz:
            if term[0] == "aya" or term[0] == "aya_":
                if term[2]:
                    matches += term[2]
                    docs += term[3]
                if term[0] == "aya_":
                    annotation_word_query += u" OR word:%s " % term[1]
                else:  # if aya
                    annotation_word_query += u" OR normalized:%s " \
                        % STANDARD2UTHMANI(term[1])
                if word_vocalizations:
                    vocalizations = vocalization_dict[
                        strip_vocalization(term[1])] \
                        if strip_vocalization(term[1]) in vocalization_dict \
                        else []
                    nb_vocalizations_globale += len(vocalizations)
                if word_synonyms:
                    synonyms = syndict[term[1]] if term[1] in syndict \
                        else []
                derivations_extra = []
                if word_derivations:
                    lemma = LOCATE(derivedict["word_"],
                                   derivedict["lemma"], term[1])
                    if lemma:  # if different from None
                        derivations = FILTER_DOUBLES(
                            FIND(derivedict["lemma"],
                                 derivedict["word_"], lemma))
                    else:
                        derivations = []
                    # go deeper: all words sharing the same root, beyond
                    # the lemma's own derivations
                    root = LOCATE(derivedict["word_"],
                                  derivedict["root"], term[1])
                    if root:  # if different from None
                        derivations_extra = list(
                            set(FILTER_DOUBLES(
                                FIND(derivedict["root"],
                                     derivedict["word_"], root)))
                            - set(derivations))
                words_output["individual"][cpt] = {
                    "word": term[1],
                    "romanization": transliterate(
                        romanization, term[1], ignore="", reverse=True)
                    if romanization in self.DOMAINS["romanization"]
                    else None,
                    "nb_matches": term[2],
                    "nb_ayas": term[3],
                    "nb_vocalizations": len(vocalizations)
                    if word_vocalizations else 0,  # unneeded
                    "vocalizations": vocalizations
                    if word_vocalizations else [],
                    "nb_synonyms": len(synonyms)
                    if word_synonyms else 0,  # unneeded
                    "synonyms": synonyms if word_synonyms else [],
                    "lemma": lemma if word_derivations else "",
                    "root": root if word_derivations else "",
                    "nb_derivations": len(derivations)
                    if word_derivations else 0,  # unneeded
                    "derivations": derivations
                    if word_derivations else [],
                    "nb_derivations_extra": len(derivations_extra),
                    "derivations_extra": derivations_extra,
                }
                cpt += 1
        annotation_word_query += u" ) "
        words_output["global"] = {
            "nb_words": cpt - 1,
            "nb_matches": matches,
            "nb_vocalizations": nb_vocalizations_globale
        }
    output["words"] = words_output

    # magic loop: build the adjacents, translations and annotations
    # queries in a single pass
    if prev_aya or next_aya or translation or annotation_aya:
        adja_query = trad_query = annotation_aya_query = u"( 0"
        for r in reslist:
            if prev_aya:
                adja_query += u" OR gid:%s " % unicode(r["gid"] - 1)
            if next_aya:
                adja_query += u" OR gid:%s " % unicode(r["gid"] + 1)
            if translation:
                trad_query += u" OR gid:%s " % unicode(r["gid"])
            if annotation_aya:
                annotation_aya_query += \
                    u" OR ( aya_id:%s AND sura_id:%s ) " % (
                        unicode(r["aya_id"]), unicode(r["sura_id"]))
        adja_query += u" )"
        trad_query += u" )" + u" AND id:%s " % unicode(translation)
        annotation_aya_query += u" )"

    # adjacents
    if prev_aya or next_aya:
        adja_res, searcher = self.QSE.find_extended(adja_query, "gid")
        adja_ayas = {
            0: {"aya_": u"----", "uth_": u"----", "sura": u"---",
                "aya_id": 0, "sura_arabic": u"---"},
            6237: {"aya_": u"----", "uth_": u"----", "sura": u"---",
                   "aya_id": 9999, "sura_arabic": u"---"}
        }
        for adja in adja_res:
            adja_ayas[adja["gid"]] = {
                "aya_": adja["aya_"],
                "uth_": adja["uth_"],
                "aya_id": adja["aya_id"],
                "sura": adja["sura"],
                "sura_arabic": adja["sura_arabic"]
            }
        extend_runtime += adja_res.runtime
        searcher.close()

    # translations
    if translation:
        trad_res, searcher = self.TSE.find_extended(trad_query, "gid")
        extend_runtime += trad_res.runtime
        trad_text = {}
        for tr in trad_res:
            trad_text[tr["gid"]] = tr["text"]
        searcher.close()

    # annotations for aya words
    if annotation_aya or (annotation_word and word_info):
        annotation_word_query = annotation_word_query \
            if annotation_word and word_info else u"()"
        annotation_aya_query = annotation_aya_query \
            if annotation_aya else u"()"
        annotation_query = annotation_aya_query + u" OR " \
            + annotation_word_query
        annot_res, searcher = self.WSE.find_extended(
            annotation_query, "gid")
        extend_runtime += annot_res.runtime
        ## prepare annotations for use
        annotations_by_word = {}
        annotations_by_position = {}
        for annot in annot_res:
            if annotation_word and word_info:
                if annot["normalized"] in terms_uthmani:
                    if annot["normalized"] in annotations_by_word:
                        if annot["word"] in \
                                annotations_by_word[annot["normalized"]]:
                            annotations_by_word[annot["normalized"]][
                                annot["word"]].append(annot)
                        else:
                            annotations_by_word[annot["normalized"]][
                                annot["word"]] = [annot]
                    else:
                        annotations_by_word[annot["normalized"]] = {
                            annot["word"]: [annot]
                        }
            if annotation_aya:
                if (annot["sura_id"], annot["aya_id"]) \
                        in annotations_by_position:
                    annotations_by_position[
                        (annot["sura_id"], annot["aya_id"])][
                            annot["word_id"]] = annot
                else:
                    annotations_by_position[
                        (annot["sura_id"], annot["aya_id"])] = {
                            annot["word_id"]: annot
                        }
        searcher.close()

    ## merge word annotations into the word output
    if annotation_word and word_info:
        for cpt in xrange(1, len(output["words"]["individual"]) + 1):
            current_word = STANDARD2UTHMANI(
                output["words"]["individual"][cpt]["word"])
            if current_word in annotations_by_word:
                current_word_annotations = \
                    annotations_by_word[current_word]
                output["words"]["individual"][cpt]["annotations"] = \
                    current_word_annotations
                output["words"]["individual"][cpt]["nb_annotations"] = \
                    len(current_word_annotations)

    output["runtime"] = round(extend_runtime, 5)
    output["interval"] = {
        "start": start,
        "end": end,
        "total": len(res),
        "page": ((start - 1) / range) + 1,
        "nb_pages": ((len(res) - 1) / range) + 1
    }
    output["translation_info"] = {}

    ### ayas
    cpt = start - 1
    output["ayas"] = {}
    for r in reslist:
        cpt += 1
        output["ayas"][cpt] = {
            "identifier": {
                "gid": r["gid"],
                "aya_id": r["aya_id"],
                "sura_id": r["sura_id"],
                "sura_name": keywords(r["sura"])[0],
                "sura_arabic_name": keywords(r["sura_arabic"])[0],
            },
            "aya": {
                "id": r["aya_id"],
                "text": H(V(r["aya_"])) if script == "standard"
                else H(r["uth_"]),
                "text_no_highlight": r["aya"] if script == "standard"
                else r["uth_"],
                "translation": trad_text[r["gid"]]
                if (translation != "None" and translation
                    and r["gid"] in trad_text) else None,
                "recitation": None if not recitation
                or recitation not in self._recitations
                else u"https://www.everyayah.com/data/"
                + self._recitations[recitation]["subfolder"].encode("utf-8")
                + "/%03d%03d.mp3" % (r["sura_id"], r["aya_id"]),
                "prev_aya": {
                    "id": adja_ayas[r["gid"] - 1]["aya_id"],
                    "sura": adja_ayas[r["gid"] - 1]["sura"],
                    "sura_arabic": adja_ayas[r["gid"] - 1]["sura_arabic"],
                    "text": V(adja_ayas[r["gid"] - 1]["aya_"])
                    if script == "standard"
                    else adja_ayas[r["gid"] - 1]["uth_"],
                } if prev_aya else None,
                "next_aya": {
                    "id": adja_ayas[r["gid"] + 1]["aya_id"],
                    "sura": adja_ayas[r["gid"] + 1]["sura"],
                    "sura_arabic": adja_ayas[r["gid"] + 1]["sura_arabic"],
                    "text": V(adja_ayas[r["gid"] + 1]["aya_"])
                    if script == "standard"
                    else adja_ayas[r["gid"] + 1]["uth_"],
                } if next_aya else None,
            },
            "sura": {} if not sura_info else {
                "name": keywords(r["sura"])[0],
                "arabic_name": keywords(r["sura_arabic"])[0],
                "english_name": keywords(r["sura_english"])[0],
                "id": r["sura_id"],
                "type": r["sura_type"],
                "arabic_type": r["sura_type_arabic"],
                "order": r["sura_order"],
                "ayas": r["s_a"],
                "stat": {} if not sura_stat_info else {
                    "words": N(r["s_w"]),
                    "godnames": N(r["s_g"]),
                    "letters": N(r["s_l"])
                }
            },
            "position": {} if not aya_position_info else {
                "manzil": r["manzil"],
                "juz": r["juz"],
                "hizb": r["hizb"],
                "rub": r["rub"] % 4,
                "page": r["page"],
                "page_IN": r["page_IN"],
                "ruku": r["ruku"],
            },
            "theme": {} if not aya_theme_info else {
                "chapter": r["chapter"],
                "topic": r["topic"],
                "subtopic": r["subtopic"]
            },
            "stat": {} if not aya_stat_info else {
                "words": N(r["a_w"]),
                "letters": N(r["a_l"]),
                "godnames": N(r["a_g"])
            },
            "sajda": {} if not aya_sajda_info else {
                "exist": (r["sajda"] == u"نعم"),
                "type": r["sajda_type"]
                if (r["sajda"] == u"نعم") else None,
                "id": N(r["sajda_id"])
                if (r["sajda"] == u"نعم") else None,
            },
            "annotations": {} if not annotation_aya
            or (r["sura_id"], r["aya_id"]) not in annotations_by_position
            else annotations_by_position[(r["sura_id"], r["aya_id"])]
        }
    return output
def _search_word(self, flags):
    """ return the results of word search as a dictionary data structure """
    # flags
    query = flags["query"] if "query" in flags \
        else self._defaults["flags"]["query"]
    sortedby = flags["sortedby"] if "sortedby" in flags \
        else self._defaults["flags"]["sortedby"]
    range = int(flags["perpage"]) if "perpage" in flags \
        else flags["range"] if "range" in flags \
        else self._defaults["flags"]["range"]
    offset = ((int(flags["page"]) - 1) * range) + 1 if "page" in flags \
        else int(flags["offset"]) if "offset" in flags \
        else self._defaults["flags"]["offset"]
    romanization = flags["romanization"] if "romanization" in flags \
        else self._defaults["flags"]["romanization"]
    highlight = flags["highlight"] if "highlight" in flags \
        else self._defaults["flags"]["highlight"]
    script = flags["script"] if "script" in flags \
        else self._defaults["flags"]["script"]
    vocalized = IS_FLAG(flags, 'vocalized')
    view = flags["view"] if "view" in flags \
        else self._defaults["flags"]["view"]

    # default, so views that do not set it explicitly still have a value
    aya = IS_FLAG(flags, 'aya')

    # pre-defined views
    if view == "minimal":
        vocalized = False
        aya = False
    elif view == "normal":
        pass
    elif view == "full":
        romanization = "iso"
        aya = True
    elif view == "statistic":
        pass
    elif view == "linguistic":
        romanization = "buckwalter"
    elif view == "recitation":
        script = "uthmani"
    else:  # view == custom or undefined
        aya = IS_FLAG(flags, 'aya')

    # preprocess query
    query = query.replace("\\", "")
    if not isinstance(query, unicode):
        query = unicode(query, 'utf8')
    if ":" not in query:
        query = unicode(
            transliterate("buckwalter", query, ignore="'_\"%*?#~[]{}:>+-|"))

    # search
    SE = self.WSE
    res, termz, searcher = SE.search_all(
        query, self._defaults["results_limit"]["word"], sortedby=sortedby)
    terms = [
        term[1] for term in list(termz)[:self._defaults["maxkeywords"]]
    ]

    # pagination
    offset = 1 if offset < 1 else offset
    range = self._defaults["minrange"] \
        if range < self._defaults["minrange"] else range
    range = self._defaults["maxrange"] \
        if range > self._defaults["maxrange"] else range
    interval_end = offset + range - 1
    end = interval_end if interval_end < len(res) else len(res)
    start = offset if offset <= len(res) else -1
    reslist = [] if end == 0 or start == -1 else list(res)[start - 1:end]

    # closing the searcher
    searcher.close()

    output = {}

    ## strip vocalization unless vocalized == True
    V = QArabicSymbolsFilter(**{
        'shaping': True,
        'tashkil': not vocalized,
        'spellerrors': False,
        'hamza': False,
        'uthmani_symbols': True,
    }).normalize_all
    V_shadda = QArabicSymbolsFilter(**{
        'shaping': False,
        'tashkil': False,
        'spellerrors': False,
        'hamza': False,
        'shadda': True,
        'uthmani_symbols': True
    }).normalize_all
    # highlight function that handles None values and undefined fields
    H = lambda X: SE.highlight(X, terms, highlight) \
        if highlight != "none" and X else X if X else u"-----"

    extend_runtime = res.runtime

    # words & annotations
    words_output = {"individual": {}}
    matches = 0
    docs = 0
    cpt = 1
    for term in termz:
        # term[0] == "normalized" or term[0] == "word":
        if term[2]:
            matches += term[2]
            docs += term[3]
        words_output["individual"][cpt] = {
            "field": term[0],
            "word": term[1],
            "romanization": transliterate(
                romanization, term[1], ignore="", reverse=True)
            if romanization in self.DOMAINS["romanization"] else None,
            "nb_matches": term[2],
            "nb_docs": term[3],
        }
        cpt += 1
    words_output["global"] = {"nb_words": cpt - 1, "nb_matches": matches}
    output["keywords"] = words_output

    # magic loop: build the aya queries in a single pass
    if aya:
        aya_query = u"( 0"
        for r in reslist:
            aya_query += u" OR ( sura_id:%s AND aya_id:%s ) " % (
                unicode(r["sura_id"]), unicode(r["aya_id"]))
        aya_query += u" )"

    # original ayas
    if aya:
        aya_res, searcher = self.QSE.find_extended(aya_query, "gid")
        extend_runtime += aya_res.runtime
        aya_info = {}
        for ay in aya_res:
            if ay["sura_id"] in aya_info:
                aya_info[ay["sura_id"]][ay["aya_id"]] = ay
            else:
                aya_info[ay["sura_id"]] = {ay["aya_id"]: ay}
        searcher.close()

    output["runtime"] = round(extend_runtime, 5)
    output["interval"] = {
        "start": start,
        "end": end,
        "total": len(res),
        "page": ((start - 1) / range) + 1,
        "nb_pages": ((len(res) - 1) / range) + 1
    }

    ### words
    cpt = start - 1
    output["words"] = {}
    for r in reslist:
        cpt += 1
        output["words"][cpt] = {
            "identifier": {
                "gid": r["gid"],
                "word_gid": r["word_gid"],
                "aya_id": r["aya_id"],
                "sura_id": r["sura_id"],
                "word_id": r["word_id"],
            },
            "word": {
                "text": r["word"],
                "part": u"جذع",
                "part_order": r["order"],
                "token": r["arabictoken"],
                "prefixes": r["prefix"],
                "suffixes": r["suffix"],
                "POS": {
                    "english": r["pos"],
                    "arabic": r["arabicpos"],
                },
                "mood": {
                    "english": r["mood"],
                    "arabic": r["arabicmood"],
                },
                "case": {
                    "english": r["case"],
                    "arabic": r["arabiccase"],
                },
                "root": {
                    # "english": r["root"],
                    "arabic": r["arabicroot"],
                },
                "lemma": {
                    # "english": r["lemma"],
                    "arabic": r["arabiclemma"],
                },
                "special": {
                    # "english": r["special"],
                    "arabic": r["arabicspecial"],
                },
                "derivation": r["derivation"],
                "form": r["form"],
                "gender": r["gender"],
                "person": r["person"],
                "number": r["number"],
                "voice": r["voice"],
                "state": r["state"],
                "aspect": r["aspect"],
            },
            "aya": None if not aya else {
                "text": SE.highlight(
                    aya_info[r["sura_id"]][r["aya_id"]]["uth_"],
                    [r["word"]], highlight, False),
                "aya_id": aya_info[r["sura_id"]][r["aya_id"]]["aya_id"],
                "sura_name": aya_info[r["sura_id"]][r["aya_id"]]["sura"],
                "sura_arabic_name":
                    aya_info[r["sura_id"]][r["aya_id"]]["sura_arabic"],
            },
        }
    return output