def derivation(word, leveldist):
    """ search for the derivations of a word in the defined field """
    # define the source level index
    if word in derivedict["word_"]:
        indexsrc = "word_"
    elif word in derivedict["lemma"]:
        indexsrc = "lemma"
    elif word in derivedict["root"]:
        indexsrc = "root"
    else:
        indexsrc = None  # warning: the word is not indexed at any level
    # define the destination level index
    if leveldist == 0:
        indexdist = "word_"
    elif leveldist == 1:
        indexdist = "lemma"
    elif leveldist == 2:
        indexdist = "root"
    else:
        indexdist = "root"  # default for new levels
    lst = []
    if indexsrc:  # if the source level index is defined
        itm = LOCATE(derivedict[indexsrc], derivedict[indexdist], word)
        if itm:  # if different from None
            lst = FILTER_DOUBLES(
                FIND(derivedict[indexdist], derivedict["word_"], itm))
        else:
            lst = [word]
    return lst
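

# A minimal usage sketch of derivation(); illustrative only: the helper name
# derivation_levels is not part of the original API, and it assumes that
# derivedict is already loaded and that `word` occurs in one of its levels.
def derivation_levels(word):
    """ sketch: collect the derivations of a word at the three levels """
    return {
        "word": derivation(word, 0),   # same surface form
        "lemma": derivation(word, 1),  # words sharing the lemma
        "root": derivation(word, 2),   # words sharing the root
    }
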
def tuple(props):
    """ search the words that have the specified properties """
    wset = set()
    firsttime = True
    for propkey in props.keys():
        if propkey in worddict:
            partial_wset = set(
                FIND(worddict[propkey], worddict["word_"], props[propkey]))
            if firsttime:
                wset = partial_wset
                firsttime = False
            else:
                wset &= partial_wset
        else:
            # the property has no index
            pass
    return list(wset)
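

# A minimal usage sketch of tuple(); illustrative only: the property names
# below are assumptions about the columns indexed in worddict.
#
#     tuple({u"root": u"كتب"})                    # words under one root
#     tuple({u"root": u"كتب", u"type": u"noun"})  # intersection of both sets
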
def _search_aya(self, flags):
    """ return the results of aya search as a dictionary data structure """
    # flags
    query = flags["query"] if "query" in flags \
        else self._defaults["flags"]["query"]
    sortedby = flags["sortedby"] if "sortedby" in flags \
        else self._defaults["flags"]["sortedby"]
    range = int(flags["perpage"]) if "perpage" in flags \
        else flags["range"] if "range" in flags \
        else self._defaults["flags"]["range"]
    ## offset = (page - 1) * perpage -- paging mode
    offset = ((int(flags["page"]) - 1) * range) + 1 if "page" in flags \
        else int(flags["offset"]) if "offset" in flags \
        else self._defaults["flags"]["offset"]
    recitation = flags["recitation"] if "recitation" in flags \
        else self._defaults["flags"]["recitation"]
    translation = flags["translation"] if "translation" in flags \
        else self._defaults["flags"]["translation"]
    romanization = flags["romanization"] if "romanization" in flags \
        else self._defaults["flags"]["romanization"]
    highlight = flags["highlight"] if "highlight" in flags \
        else self._defaults["flags"]["highlight"]
    script = flags["script"] if "script" in flags \
        else self._defaults["flags"]["script"]
    vocalized = IS_FLAG(flags, 'vocalized')
    fuzzy = IS_FLAG(flags, 'fuzzy')
    view = flags["view"] if "view" in flags \
        else self._defaults["flags"]["view"]

    # pre-defined views
    if view == "minimal":
        vocalized = False
        recitation = None
        translation = None
        prev_aya = next_aya = False
        sura_info = False
        word_info = False
        word_synonyms = False
        word_derivations = False
        word_vocalizations = False
        aya_position_info = aya_theme_info = aya_sajda_info = False
        aya_stat_info = False
        sura_stat_info = False
        annotation_aya = annotation_word = False
    elif view == "normal":
        prev_aya = next_aya = True
        sura_info = True
        word_info = True
        word_synonyms = False
        word_derivations = True
        word_vocalizations = True
        aya_position_info = aya_theme_info = aya_sajda_info = True
        aya_stat_info = True
        sura_stat_info = False
        annotation_aya = annotation_word = False
    elif view == "full":
        prev_aya = next_aya = True
        sura_info = True
        word_info = True
        word_synonyms = True
        word_derivations = True
        word_vocalizations = True
        aya_position_info = aya_theme_info = aya_sajda_info = True
        aya_stat_info = sura_stat_info = True
        annotation_aya = annotation_word = True
        romanization = "iso"
    elif view == "statistic":
        prev_aya = next_aya = False
        sura_info = True
        word_info = True
        word_synonyms = False
        word_derivations = True
        word_vocalizations = True
        aya_position_info = True
        aya_theme_info = aya_sajda_info = False
        aya_stat_info = True
        sura_stat_info = True
        annotation_aya = False
        annotation_word = False
    elif view == "linguistic":
        prev_aya = next_aya = False
        sura_info = False
        word_info = True
        word_synonyms = True
        word_derivations = True
        word_vocalizations = True
        aya_position_info = False
        aya_theme_info = aya_sajda_info = True
        aya_stat_info = False
        sura_stat_info = False
        annotation_aya = True
        annotation_word = True
        romanization = "buckwalter"
    elif view == "recitation":
        script = "uthmani"
        prev_aya = next_aya = True
        sura_info = True
        word_info = False
        word_synonyms = False
        word_derivations = False
        word_vocalizations = False
        aya_position_info = True
        aya_theme_info = False
        aya_sajda_info = True
        aya_stat_info = False
        sura_stat_info = False
        annotation_aya = False
        annotation_word = False
    else:  # view is custom or undefined
        prev_aya = IS_FLAG(flags, 'prev_aya')
        next_aya = IS_FLAG(flags, 'next_aya')
        sura_info = IS_FLAG(flags, 'sura_info')
        sura_stat_info = IS_FLAG(flags, 'sura_stat_info')
        word_info = IS_FLAG(flags, 'word_info')
        word_synonyms = IS_FLAG(flags, 'word_synonyms')
        word_derivations = IS_FLAG(flags, 'word_derivations')
        word_vocalizations = IS_FLAG(flags, 'word_vocalizations')
        aya_position_info = IS_FLAG(flags, 'aya_position_info')
        aya_theme_info = IS_FLAG(flags, 'aya_theme_info')
        aya_stat_info = IS_FLAG(flags, 'aya_stat_info')
        aya_sajda_info = IS_FLAG(flags, 'aya_sajda_info')
        annotation_aya = IS_FLAG(flags, 'annotation_aya')
        annotation_word = IS_FLAG(flags, 'annotation_word')
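
    # The raw query is preprocessed before searching: backslashes are
    # stripped, the string is coerced to unicode, and a query without a
    # field prefix (no ":") is treated as possible Buckwalter
    # transliteration and converted to Arabic script, e.g. "slAm" becomes
    # u"سلام" (illustrative input).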
    query = query.replace("\\", "")
    if not isinstance(query, unicode):
        query = unicode(query, 'utf8')
    if ":" not in query:
        query = unicode(
            transliterate("buckwalter", query, ignore="'_\"%*?#~[]{}:>+-|"))

    # search
    SE = self.FQSE if fuzzy else self.QSE
    res, termz, searcher = SE.search_all(
        query, self._defaults["results_limit"]["aya"], sortedby=sortedby)
    terms = [
        term[1] for term in list(termz)[:self._defaults["maxkeywords"]]
    ]
    terms_uthmani = map(STANDARD2UTHMANI, terms)

    # pagination
    offset = 1 if offset < 1 else offset
    range = self._defaults["minrange"] \
        if range < self._defaults["minrange"] else range
    range = self._defaults["maxrange"] \
        if range > self._defaults["maxrange"] else range
    interval_end = offset + range - 1
    end = interval_end if interval_end < len(res) else len(res)
    start = offset if offset <= len(res) else -1
    reslist = [] if end == 0 or start == -1 else list(res)[start - 1:end]
    # TODO: pagination should be done inside the search operation
    # for better performance

    # close the searcher
    searcher.close()

    output = {}

    ## disable annotations for aya words if there is more than one result
    if annotation_aya and len(res) > 1:
        annotation_aya = False

    ## strip vocalization unless vocalized is True
    V = QArabicSymbolsFilter(
        shaping=False,
        tashkil=not vocalized,
        spellerrors=False,
        hamza=False).normalize_all
    strip_vocalization = QArabicSymbolsFilter(
        shaping=False,
        tashkil=True,
        spellerrors=False,
        hamza=False).normalize_all

    # highlight helper that tolerates None and undefined values
    H = lambda X: self.QSE.highlight(X, terms, highlight) \
        if highlight != "none" and X else X if X else u"-----"
    # numbers default to 0 when not defined
    N = lambda X: X if X else 0
    # parse keyword lists, used for sura names
    kword = re.compile(u"[^,،]+")
    keywords = lambda phrase: kword.findall(phrase)

    extend_runtime = res.runtime

    # words & annotations
    words_output = {"individual": {}}
    if word_info:
        matches = 0
        docs = 0
        nb_vocalizations_globale = 0
        cpt = 1
        annotation_word_query = u"( 0 "
        for term in termz:
            if term[0] == "aya" or term[0] == "aya_":
                if term[2]:
                    matches += term[2]
                    docs += term[3]
                if term[0] == "aya_":
                    annotation_word_query += u" OR word:%s " % term[1]
                else:  # if aya
                    annotation_word_query += u" OR normalized:%s " \
                        % STANDARD2UTHMANI(term[1])
                if word_vocalizations:
                    vocalizations = \
                        vocalization_dict[strip_vocalization(term[1])] \
                        if strip_vocalization(term[1]) in vocalization_dict \
                        else []
                    nb_vocalizations_globale += len(vocalizations)
                if word_synonyms:
                    synonyms = syndict[term[1]] if term[1] in syndict \
                        else []
                derivations_extra = []
                if word_derivations:
                    lemma = LOCATE(derivedict["word_"], derivedict["lemma"],
                                   term[1])
                    if lemma:  # if different from None
                        derivations = FILTER_DOUBLES(
                            FIND(derivedict["lemma"], derivedict["word_"],
                                 lemma))
                    else:
                        derivations = []
                    # go deeper with derivations
                    root = LOCATE(derivedict["word_"], derivedict["root"],
                                  term[1])
                    if root:  # if different from None
                        derivations_extra = list(
                            set(
                                FILTER_DOUBLES(
                                    FIND(derivedict["root"],
                                         derivedict["word_"], root)))
                            - set(derivations))
                words_output["individual"][cpt] = {
                    "word": term[1],
                    "romanization": transliterate(
                        romanization, term[1], ignore="", reverse=True)
                    if romanization in self.DOMAINS["romanization"]
                    else None,
                    "nb_matches": term[2],
                    "nb_ayas": term[3],
                    "nb_vocalizations": len(vocalizations)
                    if word_vocalizations else 0,  # unneeded
                    "vocalizations": vocalizations
                    if word_vocalizations else [],
                    "nb_synonyms": len(synonyms)
                    if word_synonyms else 0,  # unneeded
                    "synonyms": synonyms if word_synonyms else [],
                    "lemma": lemma if word_derivations else "",
                    "root": root if word_derivations else "",
                    "nb_derivations": len(derivations)
                    if word_derivations else 0,  # unneeded
                    "derivations": derivations
                    if word_derivations else [],
                    "nb_derivations_extra": len(derivations_extra),
                    "derivations_extra": derivations_extra,
                }
                cpt += 1
        annotation_word_query += u" ) "
        words_output["global"] = {
            "nb_words": cpt - 1,
            "nb_matches": matches,
            "nb_vocalizations": nb_vocalizations_globale
        }
        output["words"] = words_output
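
    # A single pass over the current result page now builds up to three
    # extra queries at once: adjacent ayas (gid - 1 / gid + 1), the
    # requested translation (matched by gid) and aya-level word annotations
    # (aya_id AND sura_id). Each query is an OR-chain seeded with the
    # never-matching term "0", so terms can be appended uniformly; for a
    # single hit with gid 43 the adjacents query would look like
    # u"( 0 OR gid:42 OR gid:44 )" (illustrative).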
    if prev_aya or next_aya or translation or annotation_aya:
        adja_query = trad_query = annotation_aya_query = u"( 0"
        for r in reslist:
            if prev_aya:
                adja_query += u" OR gid:%s " % unicode(r["gid"] - 1)
            if next_aya:
                adja_query += u" OR gid:%s " % unicode(r["gid"] + 1)
            if translation:
                trad_query += u" OR gid:%s " % unicode(r["gid"])
            if annotation_aya:
                annotation_aya_query += \
                    u" OR ( aya_id:%s AND sura_id:%s ) " \
                    % (unicode(r["aya_id"]), unicode(r["sura_id"]))
        adja_query += u" )"
        trad_query += u" )" + u" AND id:%s " % unicode(translation)
        annotation_aya_query += u" )"

    # adjacents
    if prev_aya or next_aya:
        adja_res, searcher = self.QSE.find_extended(adja_query, "gid")
        adja_ayas = {
            0: {
                "aya_": u"----",
                "uth_": u"----",
                "sura": u"---",
                "aya_id": 0,
                "sura_arabic": u"---"
            },
            6237: {
                "aya_": u"----",
                "uth_": u"----",
                "sura": u"---",
                "aya_id": 9999,
                "sura_arabic": u"---"
            }
        }
        for adja in adja_res:
            adja_ayas[adja["gid"]] = {
                "aya_": adja["aya_"],
                "uth_": adja["uth_"],
                "aya_id": adja["aya_id"],
                "sura": adja["sura"],
                "sura_arabic": adja["sura_arabic"]
            }
        extend_runtime += adja_res.runtime
        searcher.close()

    # translations
    if translation:
        trad_res, searcher = self.TSE.find_extended(trad_query, "gid")
        extend_runtime += trad_res.runtime
        trad_text = {}
        for tr in trad_res:
            trad_text[tr["gid"]] = tr["text"]
        searcher.close()

    # annotations for aya words
    if annotation_aya or (annotation_word and word_info):
        annotation_word_query = annotation_word_query \
            if annotation_word and word_info else u"()"
        annotation_aya_query = annotation_aya_query \
            if annotation_aya else u"()"
        annotation_query = annotation_aya_query + u" OR " \
            + annotation_word_query
        annot_res, searcher = self.WSE.find_extended(
            annotation_query, "gid")
        extend_runtime += annot_res.runtime
        ## prepare the annotations for use
        annotations_by_word = {}
        annotations_by_position = {}
        for annot in annot_res:
            if annotation_word and word_info:
                if annot["normalized"] in terms_uthmani:
                    if annot["normalized"] in annotations_by_word:
                        if annot["word"] in \
                                annotations_by_word[annot["normalized"]]:
                            annotations_by_word[annot["normalized"]][
                                annot["word"]].append(annot)
                        else:
                            annotations_by_word[annot["normalized"]][
                                annot["word"]] = [annot]
                    else:
                        annotations_by_word[annot["normalized"]] = {
                            annot["word"]: [annot]
                        }
            if annotation_aya:
                if (annot["sura_id"], annot["aya_id"]) \
                        in annotations_by_position:
                    annotations_by_position[
                        (annot["sura_id"],
                         annot["aya_id"])][annot["word_id"]] = annot
                else:
                    annotations_by_position[(annot["sura_id"],
                                             annot["aya_id"])] = {
                        annot["word_id"]: annot
                    }
        searcher.close()
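
    # annotations_by_word is folded into the per-word entries just below,
    # while annotations_by_position is consulted aya by aya when the final
    # "ayas" section is assembled.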
    ## merge the word annotations into the word output
    if annotation_word and word_info:
        for cpt in xrange(1, len(output["words"]["individual"]) + 1):
            current_word = STANDARD2UTHMANI(
                output["words"]["individual"][cpt]["word"])
            if current_word in annotations_by_word:
                current_word_annotations = annotations_by_word[current_word]
                output["words"]["individual"][cpt][
                    "annotations"] = current_word_annotations
                output["words"]["individual"][cpt]["nb_annotations"] = \
                    len(current_word_annotations)

    output["runtime"] = round(extend_runtime, 5)
    output["interval"] = {
        "start": start,
        "end": end,
        "total": len(res),
        "page": ((start - 1) / range) + 1,
        "nb_pages": ((len(res) - 1) / range) + 1
    }
    output["translation_info"] = {}

    ### ayas
    cpt = start - 1
    output["ayas"] = {}
    for r in reslist:
        cpt += 1
        output["ayas"][cpt] = {
            "identifier": {
                "gid": r["gid"],
                "aya_id": r["aya_id"],
                "sura_id": r["sura_id"],
                "sura_name": keywords(r["sura"])[0],
                "sura_arabic_name": keywords(r["sura_arabic"])[0],
            },
            "aya": {
                "id": r["aya_id"],
                "text": H(V(r["aya_"]))
                if script == "standard" else H(r["uth_"]),
                "text_no_highlight": r["aya"]
                if script == "standard" else r["uth_"],
                "translation": trad_text[r["gid"]]
                if (translation != "None" and translation
                    and r["gid"] in trad_text) else None,
                "recitation": None
                if not recitation or recitation not in self._recitations
                else u"https://www.everyayah.com/data/"
                + self._recitations[recitation]["subfolder"].encode("utf-8")
                + "/%03d%03d.mp3" % (r["sura_id"], r["aya_id"]),
                "prev_aya": {
                    "id": adja_ayas[r["gid"] - 1]["aya_id"],
                    "sura": adja_ayas[r["gid"] - 1]["sura"],
                    "sura_arabic": adja_ayas[r["gid"] - 1]["sura_arabic"],
                    "text": V(adja_ayas[r["gid"] - 1]["aya_"])
                    if script == "standard"
                    else adja_ayas[r["gid"] - 1]["uth_"],
                } if prev_aya else None,
                "next_aya": {
                    "id": adja_ayas[r["gid"] + 1]["aya_id"],
                    "sura": adja_ayas[r["gid"] + 1]["sura"],
                    "sura_arabic": adja_ayas[r["gid"] + 1]["sura_arabic"],
                    "text": V(adja_ayas[r["gid"] + 1]["aya_"])
                    if script == "standard"
                    else adja_ayas[r["gid"] + 1]["uth_"],
                } if next_aya else None,
            },
            "sura": {} if not sura_info else {
                "name": keywords(r["sura"])[0],
                "arabic_name": keywords(r["sura_arabic"])[0],
                "english_name": keywords(r["sura_english"])[0],
                "id": r["sura_id"],
                "type": r["sura_type"],
                "arabic_type": r["sura_type_arabic"],
                "order": r["sura_order"],
                "ayas": r["s_a"],
                "stat": {} if not sura_stat_info else {
                    "words": N(r["s_w"]),
                    "godnames": N(r["s_g"]),
                    "letters": N(r["s_l"])
                }
            },
            "position": {} if not aya_position_info else {
                "manzil": r["manzil"],
                "juz": r["juz"],
                "hizb": r["hizb"],
                "rub": r["rub"] % 4,
                "page": r["page"],
                "page_IN": r["page_IN"],
                "ruku": r["ruku"],
            },
            "theme": {} if not aya_theme_info else {
                "chapter": r["chapter"],
                "topic": r["topic"],
                "subtopic": r["subtopic"]
            },
            "stat": {} if not aya_stat_info else {
                "words": N(r["a_w"]),
                "letters": N(r["a_l"]),
                "godnames": N(r["a_g"])
            },
            "sajda": {} if not aya_sajda_info else {
                "exist": (r["sajda"] == u"نعم"),
                "type": r["sajda_type"]
                if (r["sajda"] == u"نعم") else None,
                "id": N(r["sajda_id"])
                if (r["sajda"] == u"نعم") else None,
            },
            "annotations": {}
            if not annotation_aya
            or (r["sura_id"], r["aya_id"]) not in annotations_by_position
            else annotations_by_position[(r["sura_id"], r["aya_id"])]
        }
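
    # At this point output carries "runtime", "interval" (paging metadata),
    # "words" (only when word_info is set), "translation_info" and "ayas",
    # keyed by the absolute rank of each result in the full hit list.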
r["aya_id"])) else annotations_by_position[(r["sura_id"], r["aya_id"])] } return output