def search_related(phrases):
    """Search Sphinx for files related to the given phrases.

    One query is issued per phrase (keeping only words longer than one
    character).  When every phrase is long (more than 4 usable words),
    two extra shorter queries built from the first phrase are added to
    widen the result set.  Returns the list of Sphinx query results.
    """
    client = sphinxapi.SphinxClient()
    client.SetServer(current_app.config["SERVICE_SPHINX"],
                     current_app.config["SERVICE_SPHINX_PORT"])
    client.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
    client.SetRankingMode(sphinxapi.SPH_RANK_SPH04)
    client.SetFieldWeights({"fn1": 100})
    client.SetSelect("*, idiv(@weight,10000) as sw")
    client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "w DESC, sw DESC, ls DESC")
    client.SetMaxQueryTime(current_app.config["SERVICE_SPHINX_MAX_QUERY_TIME"])
    client.SetLimits(0, 6, 6, 10000)
    client.SetFilter('bl', [0])
    client.SetFilter('t', [int(i["_id"]) for i in filesdb.get_sources()])

    shortest = float("inf")
    for phrase in phrases:
        terms = [term for term in phrase if len(term) > 1]
        if len(terms) < shortest:
            shortest = len(terms)
        client.AddQuery(" ".join(terms), "idx_files")

    # every phrase was long: add shorter queries from the first phrase
    if shortest > 4 and phrases:
        terms = [term for term in phrases[0] if len(term) > 1]
        client.AddQuery(" ".join(terms[:3]), "idx_files")
        client.AddQuery(" ".join(terms[-3:]), "idx_files")

    results = client.RunQueries() or []
    client.Close()
    return results
def search_files(query, filters, page=1):
    """Simple file search with filters.

    Runs one Sphinx query over ``idx_files`` applying the optional
    ``type``/``src``/``size``/``brate``/``year`` filters, paged ten
    results at a time, and returns the raw Sphinx result.  Warnings are
    logged only in debug mode; errors are always logged.
    """
    client = sphinxapi.SphinxClient()
    client.SetServer(current_app.config["SERVICE_SPHINX"],
                     current_app.config["SERVICE_SPHINX_PORT"])
    client.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED2)
    client.SetRankingMode(sphinxapi.SPH_RANK_SPH04)
    client.SetFieldWeights({"fn1": 100})
    client.SetSelect("*, idiv(@weight,10000) as sw")
    client.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, "w DESC, sw DESC, ls DESC")
    client.SetMaxQueryTime(current_app.config["SERVICE_SPHINX_MAX_QUERY_TIME"])
    client.SetLimits((page - 1) * 10, 10, 1000, 2000000)
    client.ResetFilters()
    client.SetFilter('bl', [0])

    # -- every supported search filter --
    categories = current_app.config["CONTENTS_CATEGORY"]
    file_type = filters.get('type')
    if file_type and file_type in categories:
        client.SetFilter('ct', categories[file_type])

    source_groups = filters.get('src')
    if source_groups:
        chosen = filesdb.get_sources(group=tuple(source_groups))
    else:
        chosen = filesdb.get_sources()
    client.SetFilter('t', [int(s["_id"]) for s in chosen])

    size = filters.get('size')
    if size:
        if int(size) < 4:
            client.SetFilterRange('z', 1, 1048576 * (10 ** (int(size) - 1)), False)
        else:
            client.SetFilterRange('z', 0, 104857600, True)

    brate = filters.get('brate')
    if brate:
        client.SetFilterRange('mab', 0, [127, 191, 255, 319][int(brate) - 1], True)

    year = filters.get('year')
    if year:
        idx = int(year) - 1
        this_year = datetime.utcnow().year
        client.SetFilterRange('may',
                              [0, 60, 70, 80, 90, 100, this_year - 1][idx],
                              [59, 69, 79, 89, 99, 109, this_year][idx])

    result = client.Query(query, "idx_files")
    client.Close()
    if result:
        if current_app.debug and result["warning"]:
            logging.warn(result["warning"])
        if result["error"]:
            logging.error(result["error"])
    return result
def search_related(phrases):
    """Search Sphinx for files related to the given phrases.

    Issues up to five queries (longest phrases first, a trailing file
    extension dropped) against ``idx_files`` and returns the list of
    Sphinx results.  Sphinx warnings/errors are collected and logged.

    Fixes over the previous revision:
    - ``warn = error = []`` bound BOTH names to the SAME list, so every
      warning was also logged as an error and vice versa; they are now
      separate lists.
    - the caller's ``phrases`` list was mutated in place (``pop`` and
      ``sort``); we now work on a copy.
    """
    if not phrases:
        return []
    sph = sphinxapi2.SphinxClient()
    sph.SetServer(current_app.config["SERVICE_SPHINX"], current_app.config["SERVICE_SPHINX_PORT"])
    sph.SetConnectTimeout(current_app.config["SERVICE_SPHINX_CONNECT_TIMEOUT"])
    sph.SetMatchMode(sphinxapi2.SPH_MATCH_EXTENDED2)
    sph.SetFieldWeights({"fn": 100, "md": 1})
    sph.SetRankingMode(sphinxapi2.SPH_RANK_EXPR, "sum((2.0*lcs/min_best_span_pos)*user_weight)")
    sph.SetSortMode(sphinxapi2.SPH_SORT_EXTENDED, "r DESC, r2 DESC, uri1 DESC")
    sph.SetMaxQueryTime(current_app.config["SERVICE_SPHINX_MAX_QUERY_TIME"])
    sph.SetLimits(0, 6, 6, 10000)
    sph.SetFilter("bl", [0])
    sph.SetFilter("s", [int(i["_id"]) for i in filesdb.get_sources()])

    # work on a copy so the caller's list is not modified
    phrases = list(phrases)
    if phrases[-1] in EXTENSIONS:  # drop a trailing file extension
        phrases.pop()
    phrases.sort(key=len, reverse=True)  # longest phrases are most specific

    for phrase in phrases[:5]:
        sph.AddQuery(sph.EscapeString(phrase), "idx_files")
    querys = sph.RunQueries() or []

    # separate accumulators (previously aliased to one list -- see docstring)
    warn = []
    error = []
    if querys:
        for query_res in querys:
            if query_res["warning"]:
                warn.append(query_res["warning"])
            if query_res["error"]:
                error.append(query_res["error"])
    else:
        warn = sph.GetLastWarning()
        error = sph.GetLastError()
    if warn:
        logging.warn(
            "Warning on a Sphinx response",
            extra={"method": "search_related", "q": phrases[:5], "orig_msg": warn}
        )
    if error:
        logging.error(
            "Error on a Sphinx response",
            extra={"method": "search_related", "q": phrases[:5], "orig_msg": error}
        )
    sph.Close()
    return querys
def search_files(query, filters, page=1):
    """Simple file search with filters.

    Runs one Sphinx query over ``idx_files`` applying the content-type,
    source and size filters found in ``filters`` (bitrate/year filters
    are currently disabled), paged ten results at a time, and returns
    the raw Sphinx result.  Sphinx warnings/errors are logged and
    disable caching of the response.

    Fixes over the previous revision:
    - removed leftover debug ``print`` statements (Python 2 syntax,
      spamming stdout in production);
    - bare ``except:`` clauses narrowed to ``except Exception`` so
      ``KeyboardInterrupt``/``SystemExit`` are not swallowed.
    """
    sph = sphinxapi2.SphinxClient()
    sph.SetServer(current_app.config["SERVICE_SPHINX"], current_app.config["SERVICE_SPHINX_PORT"])
    sph.SetConnectTimeout(current_app.config["SERVICE_SPHINX_CONNECT_TIMEOUT"])
    sph.SetMatchMode(sphinxapi2.SPH_MATCH_EXTENDED2)
    sph.SetFieldWeights({"fn": 100, "md": 1})
    # ranking expression built at startup by init_search_stats()
    sph.SetSelect("*, %s" % normalize_weights)
    sph.SetRankingMode(sphinxapi2.SPH_RANK_EXPR, "sum((10.0*lcs+1.0/min_best_span_pos)*user_weight)")
    sph.SetSortMode(sphinxapi2.SPH_SORT_EXTENDED, "wr DESC, r2 DESC, uri1 DESC")
    sph.SetMaxQueryTime(current_app.config["SERVICE_SPHINX_MAX_QUERY_TIME"])
    sph.SetLimits((page - 1) * 10, 10, 1000, 2000000)
    sph.ResetFilters()
    sph.SetFilter("bl", [0])

    # -- every supported search filter --

    # content type: "a|b|c" of CONTENTS_CATEGORY keys
    try:
        if "type" in filters and filters["type"]:
            sph.SetFilter(
                "ct",
                [tl for t in filters["type"].split("|") if t in CONTENTS_CATEGORY for tl in CONTENTS_CATEGORY[t]]
            )
    except Exception:
        logging.warn("Wrong data for type filter.", extra={"q": query, "filters": filters})

    # source groups (falls back to every known source)
    if "src" in filters and filters["src"]:
        try:
            sph.SetFilter("s", [int(i["_id"]) for i in filesdb.get_sources(group=tuple(filters["src"]))])
        except Exception:
            logging.warn("Wrong data for source filter.", extra={"q": query, "filters": filters})
    else:
        sph.SetFilter("s", [int(i["_id"]) for i in filesdb.get_sources()])

    # size: either a preset 1-4 (compared against log2 of a byte threshold)
    # or an explicit "min,max" pair of floats
    if "size" in filters:
        try:
            if filters["size"].isdigit() and int(filters["size"]) > 0 and int(filters["size"]) <= 4:
                if int(filters["size"]) < 4:
                    sph.SetFilterFloatRange("z", 1.0, log(1048576 * (10 ** (int(filters["size"]) - 1)), 2), False)
                else:
                    sph.SetFilterFloatRange("z", 0.0, log(104857600, 2), True)
            else:
                sizes = filters["size"].split(",")
                if len(sizes) == 2:
                    sph.SetFilterFloatRange("z", float(sizes[0]), float(sizes[1]), False)
        except Exception:
            logging.warn("Wrong data for size filter.", extra={"q": query, "filters": filters})

    # bitrate/year filters are disabled for now:
    # if 'brate' in filters and filters["brate"].isdigit() and 0 < int(filters['brate']) <= 4:
    #     sph.SetFilterRange('mab', 0, [127,191,255,319][int(filters['brate'])-1], True)
    # if 'year' in filters and filters["year"].isdigit() and 0 < int(filters['year']) <= 7:
    #     sph.SetFilterRange('may', [0,60,70,80,90,100,datetime.utcnow().year-1][int(filters['year'])-1],
    #                        [59,69,79,89,99,109,datetime.utcnow().year][int(filters['year'])-1])

    query_res = sph.Query(sph.EscapeString(query), "idx_files")

    warn = error = None
    if query_res:
        if query_res["warning"]:
            warn = query_res["warning"]
        if query_res["error"]:
            error = query_res["error"]
    else:
        warn = sph.GetLastWarning()
        error = sph.GetLastError()
    if warn:
        logging.warn("Warning on a Sphinx response", extra={"method": "search_files", "q": query, "orig_msg": warn})
    if error:
        logging.error("Error on a Sphinx response", extra={"method": "search_files", "q": query, "orig_msg": error})
    if warn or error:
        cache.cacheme = False  # don't cache faulty responses
    sph.Close()
    return query_res
def init_search_stats():
    """(Re)build the global ``normalize_weights`` Sphinx select expression.

    Combines per-source-group base weights with the rating statistics held
    by ``searchd.proxy`` (the ``sources_rating_rc``/``_ra``/``_rd`` maps)
    into ``IN(s, id1,id2,...)*coef`` sums that Sphinx evaluates per match,
    producing a ``wr`` ranking attribute plus ``riclog``/``rdev``/``ravg``
    diagnostic attributes.

    NOTE(review): Python 2 only (uses ``iteritems``/``iterkeys``).
    """
    global normalize_weights
    # base weight per source group code (w/s/t/e/g)
    source_weights = {"w": 1, "s": 1, "t": 0.5, "e": 0.08, "g": 0.08}
    # expand group weights into per-source-id weights
    sources_weights = {int(s["_id"]): v for k, v in source_weights.iteritems() for s in filesdb.get_sources(group=k)}
    # extra downweight for source 18 -- magic constant, reason not visible here; TODO confirm
    sources_weights[18] /= 1.8
    # weight damped by log of rating count, only for sources that have a
    # rating average.  NOTE(review): indexes sources_rating_rc[s] while only
    # checking membership in sources_rating_ra -- presumably rc is always
    # populated alongside ra; verify, otherwise this can raise KeyError.
    iclogs = {
        str(s): sources_weights.get(s, 1.0) / (1.0 + log(searchd.proxy.sources_rating_rc[s] + 1))
        for s in searchd.proxy.sources.iterkeys()
        if s in searchd.proxy.sources_rating_ra
    }
    # per-source rating average
    avgs = {
        str(s): searchd.proxy.sources_rating_ra[s]
        for s in searchd.proxy.sources.iterkeys()
        if s in searchd.proxy.sources_rating_ra
    }
    # per-source rating deviation, shifted by -1.  The inline fallback to 1.0
    # is redundant: we iterate sources_rating_rd's own keys.
    devs = {
        str(s): ((searchd.proxy.sources_rating_rd[s] if s in searchd.proxy.sources_rating_rd else 1.0) - 1.0)
        for s in searchd.proxy.sources_rating_rd.iterkeys()
    }
    # Build "IN(s,ids)*coef" sums, grouping source ids that share the same
    # coefficient (groupby requires the sort by the same key).  Terms whose
    # coefficient is ~0 are dropped -- they would add nothing to the sum.
    avgs_vals = "+".join(
        "IN(s,%s)*%f" % (",".join(v), k)
        for k, v in groupby(sorted(avgs, key=avgs.get), key=avgs.get)
        if not -1e-8 < k < 1e-8
    )
    devs_vals = "+".join(
        "IN(s,%s)*%f" % (",".join(v), k)
        for k, v in groupby(sorted(devs, key=devs.get), key=devs.get)
        if not -1e-8 < k < 1e-8
    )
    # NOTE(review): here terms with coefficient ~1 are dropped, but iclog is
    # used as a MULTIPLIER of @weight below -- a source excluded here gets
    # factor 0, not 1.  Looks suspicious; confirm this is intended.
    iclog_vals = "+".join(
        "IN(s,%s)*%f" % (",".join(v), k)
        for k, v in groupby(sorted(iclogs, key=iclogs.get), key=iclogs.get)
        if not -1e-8 < k - 1 < 1e-8
    )
    # final select expression consumed by search_files() via SetSelect
    normalize_weights = (
        "@weight*(%(iclog)s)*(0.4+(if(r>-1,r-%(avg)s,0))/(1.0+%(dev)s)) as wr, %(iclog)s as riclog, %(dev)s as rdev, %(avg)s as ravg"
        % {"iclog": iclog_vals, "avg": avgs_vals, "dev": devs_vals}
    )