Example #1
0
def search():
    '''
    Performs a file search
    '''
    # TODO: security checks on the query params
    # if nothing was searched, redirect to the home page
    query = request.args.get("q", None)
    if not query:
        flash("write_something")
        return redirect(url_for("index.home"))

    page = int(request.args.get("page", 1))
    g.title = query+" - "+g.title
    results = {"total_found":0,"total":0,"time":0}

    didyoumean = None
    tags = None
    if 0 < page < 101:
        # get the tags and the did-you-mean suggestion
        taming = taming_search(current_app.config, query, request.args.get("type", None), contextg=g._get_current_object())

        # get the results and build the pagination
        profiler.checkpoint(opening=["sphinx"])
        results = search_files(query,request.args,page) or results
        ids = get_ids(results)
        profiler.checkpoint(opening=["mongo"], closing=["sphinx"])
        files_dict = {mid2hex(file_data["_id"]):fill_data(file_data, False, query) for file_data in get_files(ids)}
        profiler.checkpoint(opening=["visited"], closing=["mongo"])
        save_visited(files_dict.values())
        profiler.checkpoint(closing=["visited"])
        files=(files_dict[bin2hex(file_id[0])] for file_id in ids if bin2hex(file_id[0]) in files_dict)

        # retrieve the taming results
        try:
            tags = next(taming)
            didyoumean = next(taming)
        except Exception:
            pass
    else:
        files = ()

    return render_template('files/search.html',
        results=results,
        search=request.args["q"].split(" "),
        files=files,
        pagination=Pagination(page, 10, min(results["total_found"], 1000)),
        didyoumean=didyoumean,
        tags=tags)
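
A hedged sketch of the contract implied above: the two next() calls suggest taming_search returns a lazy generator that yields the tag list first and the did-you-mean suggestion second, so both can be computed while the Sphinx and Mongo phases run. The stand-in values below are made up.

def taming_search_stub(query, file_type=None):
    # hypothetical stand-in for taming_search: yields tags first,
    # then the did-you-mean, matching the consumption order above
    yield ["tag1", "tag2"]
    yield query + " 2"

taming = taming_search_stub("ubuntu")
try:
    tags = next(taming)
    didyoumean = next(taming)
except Exception:
    tags = didyoumean = None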
Example #2
0
def search():
    '''
    Performs a file search
    '''

    # TODO: security checks on the query params
    # if nothing was searched, redirect to the home page
    query = request.args.get("q", None)
    if not query:
        flash("write_something")
        return redirect(url_for("index.home"))

    # avoid errors when page does not contain a number
    page = request.args.get("page", "1")
    if page.isdigit():
        page = int(page)
    else:
        abort(404)

    g.title = "%s - %s" % (query, g.title)
    results = {"total_found":0,"total":0,"time":0}

    didyoumean = None
    tags = None
    if 0 < page < 101:
        # get the tags and the did-you-mean suggestion
        tags, dym = taming_search(query, request.args.get("type", None))

        # get the results and build the pagination
        profiler.checkpoint(opening=["sphinx"])
        results = search_files(query,request.args,page) or results
        ids = get_ids(results)
        profiler.checkpoint(opening=["mongo"], closing=["sphinx"])
        files_dict = {mid2hex(file_data["_id"]):fill_data(file_data, False, query) for file_data in get_files(ids)}
        profiler.checkpoint(opening=["visited"], closing=["mongo"])
        save_visited(files_dict.values())
        profiler.checkpoint(closing=["visited"])
        files=({"file":files_dict[bin2hex(file_id[0])], "search":file_id} for file_id in ids if bin2hex(file_id[0]) in files_dict)

        # retrieve the taming results
        try:
            tags = next(tags)
            didyoumean = next(dym)
        except Exception:
            pass
    else:
        files = ()

    return render_template('files/search.html',
        results=results,
        search=request.args["q"].split(" "),
        files=files,
        pagination=Pagination(page, 10, min(results["total_found"], 1000)),
        didyoumean=didyoumean,
        tags=tags)
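
The 0 < page < 101 guard pairs with Pagination(page, 10, min(results["total_found"], 1000)): at 10 results per page and at most 1000 counted results, page 101 and beyond can never hold content. A minimal sketch of such a helper follows; this is a hypothetical class, not the project's actual Pagination.

class Pagination(object):
    # hypothetical minimal pagination helper
    def __init__(self, page, per_page, total_count):
        self.page = page
        self.per_page = per_page
        self.total_count = total_count

    @property
    def pages(self):
        # ceiling division: a final partially-filled page still counts
        return max(1, -(-self.total_count // self.per_page))

    @property
    def has_next(self):
        return self.page < self.pages

print(Pagination(3, 10, 1000).pages)   # 100, hence the page < 101 guard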
Example #3
0
File: files.py Project: Weej1/www
def process_search_results(s=None, query=None, category=None, not_category=None, title=None, zone="", last_items=[], skip=None, limit=70, max_limit=50, ignore_ids=[], show_order=True, results_template="results.html", details=False):
    files = []
    files_text = []
    files_dict = None
    results = None
    must_cache = True
    if not title:
        title = (None, 2, False)

    if s:
        ids = [result for result in
               ((bin2hex(fileid), server, sphinxid, weight, sg)
                for (fileid, server, sphinxid, weight, sg)
                in s.get_results((1.0, 0.1), last_items=last_items,
                                 skip=skip*max_limit if skip else None,
                                 min_results=limit, max_results=limit,
                                 extra_browse=limit,
                                 weight_processor=weight_processor,
                                 tree_visitor=tree_visitor,
                                 restart_if_skip=True))
               if result[0] not in ignore_ids]

        # don't use all ids
        del ids[int(max_limit*1.1):]

        results_entities = list(set(int(aid[4])>>32 for aid in ids if int(aid[4])>>32))
        ntts = {int(ntt["_id"]):ntt for ntt in entitiesdb.get_entities(results_entities)} if results_entities else {}
        stats = s.get_stats()
        canonical_query = stats["ct"]

        if canonical_query:
            # remove category and not-category from the canonical query
            canonical_query_parts = [part for part in canonical_query.split("_") if not ((not_category and part==u"-("+not_category+")")
                                                                                        or (category and part==u"("+category+")"))]

            canonical_query = "_".join(canonical_query_parts) if any(len(part)>=WORD_SEARCH_MIN_LEN or part in NGRAM_CHARS for part in canonical_query_parts) else ""

        sure = stats["s"]
        if (not sure) or ("total_sure" in stats and not stats["total_sure"]):
            g.must_cache = 0
            cache.cacheme = False
    else:
        sure = True
        canonical_query = ""
        # defaults so the code below cannot hit undefined names when no searcher was given
        ids = []
        stats = {"t": {}, "cs": 0}

    # do not run blocked searches
    if canonical_query:
        # if the exact query is in the underage list, nothing is shown
        safe_phrase = canonical_query.replace("_"," ").strip()
        # if it merely includes it, it must also match misconduct to be blocked
        prepared_phrase = blacklists.prepare_phrase(safe_phrase)

        if blacklists["underage"].exact(safe_phrase) or prepared_phrase in blacklists["forbidden"] or prepared_phrase in blacklists["searchblocked"] or (prepared_phrase in blacklists["misconduct"] and prepared_phrase in blacklists["underage"]):
            g.blacklisted_content = "Search"
            if not g.show_blacklisted_content and g.page_type in {SEARCH_PAGE_TYPE, CATEGORY_PAGE_TYPE}:
                abort(404)

    # if the canonical query is empty, results are only of interest for searches with a null query (rankings)
    if (g.show_blacklisted_content or not g.blacklisted_content) and (canonical_query or not query):
        if ids:
            files_dict={str(f["_id"]):prepare_data(f,text=query,ntts=ntts,details=details, current_category=category) for f in get_files(ids,s)}

            if not g.search_bot:
                save_visited(files_dict.values())

            # sort results and attach search information
            position = 0
            for search_result in ids:
                fid = search_result[0]
                if fid in files_dict and files_dict[fid]:
                    afile = files_dict[fid]
                    afile["search"] = search_result
                    files.append(afile)
                    files_text.append(afile["view"]["nfn"])

                    featured_weight = (afile['view']["rating5"]
                                        + (10 if 'images_server' in afile['view'] or 'thumbnail' in afile['view'] else 0))

                    g.featured.append((-featured_weight, position, afile))
                    position+=1

            results = render_template(results_template, files=files[:max_limit or limit], list_title=title[0] or query or category, title_level=title[1], title_class=title[2], zone=zone, show_order=show_order)

        count = min(len(files), max_limit or limit)
        search_info = {"time": max(stats["t"].itervalues()) if stats["t"] else 0,
                       "total_found": stats["cs"], "count": count,
                       "next": False if ("end" in stats and stats["end"]) or skip >= 10 else (skip or 0) + 1,
                       "files_text": files_text, "canonical_query": canonical_query,
                       "sure": sure}
    else:
        search_info = {"time": 0, "total_found": 0, "count": 0, "next": False, "files_text":[], "canonical_query":"-", "sure":sure}

    # try to avoid memory problems
    del files

    return results, search_info
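
The g.featured.append((-featured_weight, position, afile)) tuples above are shaped for heap ordering: negating the weight makes the highest-rated file pop first, and position breaks ties in original result order. A self-contained illustration with made-up sample data:

import heapq

featured = []
sample = [("a.avi", 5), ("b.avi", 9), ("c.avi", 9)]
for position, (name, weight) in enumerate(sample):
    # min-heap on (-weight, position): highest weight pops first,
    # ties resolved by original result order
    heapq.heappush(featured, (-weight, position, name))

print(heapq.heappop(featured))   # (-9, 1, 'b.avi')
print(heapq.heappop(featured))   # (-9, 2, 'c.avi')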
Example #4
0
    def store_files(self, asearch, timeout, timeout_fallback):
        groups = self.filters_state["g"]
        subgroups = self.text_state["sg"]

        for server, query, sphinx_results, messages in self.proxy.browse_results(asearch, timeout, timeout_fallback):

            # handle results from simple queries and multiqueries the same way
            if not sphinx_results:
                logging.error("Error in search thread:'%s'"%messages[1])
                continue
            elif "matches" in sphinx_results:
                sphinx_results = [sphinx_results]

            # check whether this is a summary query
            if query["sf"]:
                main = [(True, False), (False, True)]
            elif query["st"]:
                main = [(True, True)]
            else:
                main = False

            # values are valid by default
            valid = True

            # add results to the corresponding subgroup
            for result in sphinx_results:
                if not result:
                    logging.error("No response received from the search server.")
                    continue

                elif result["error"]:
                    logging.error("Search error (server %d): %s" % (server, result["error"]))
                    continue

                elif result["warning"]:
                    valid = False # results are used, but marked invalid for future queries
                    logging.error("Search warnings (server %d): %s" % (server, result["warning"]))

                total = 0
                for r in result["matches"]:
                    # compute the subgroup and the file id
                    sg = str(r["attrs"]["g"])
                    fid = bin2hex(struct.pack('III',r["attrs"]["uri1"],r["attrs"]["uri2"],r["attrs"]["uri3"]))
                    g = get_group(sg)
                    g2 = get_group2(sg)
                    weight = r["attrs"]["vrw"]
                    count = r["attrs"]["@count"]
                    if not main: first = query["g"][g]
                    total += count
                    
                    # store the file in its groups and subgroups
                    if not fid in subgroups[sg]["f"]:
                        filtrable_info = {"z":r["attrs"]["z"], "e":r["attrs"]["e"]}
                        subgroups[sg]["f"][fid] = (weight, server, filtrable_info, r["id"])
                        # if it satisfies the filters
                        if self.satisfies_filters(sg, filtrable_info):
                            heapq.heappush(groups[g]["g2"][g2]["sg"][sg]["h"], (-weight, fid))
                    
                    # update group and subgroup totals
                    if main and main[0][0]: # store in text_state
                        subgroups[sg]["c"][server] = count
                        subgroups[sg]["z"][0] = max(subgroups[sg]["z"][0], r["attrs"]["zm"])
                        subgroups[sg]["z"][1] = min(subgroups[sg]["z"][1], r["attrs"]["zx"])

                    if main and main[0][1]: # store in filters_state
                        groups[g]["g2"][g2]["sg"][sg]["c"][server] = count

                    # update the last valid record obtained for the group on this server
                    if valid:
                        if main:
                            groups[g]["g2"][g2]["sg"][sg]["lv"][server] = max(1,groups[g]["g2"][g2]["sg"][sg]["lv"][server])
                        else:
                            groups[g]["g2"][g2]["sg"][sg]["lv"][server] = max(first+count,groups[g]["g2"][g2]["sg"][sg]["lv"][server])

                # absolute totals
                if main:
                    if main[0][0]: # store in text_state
                        self.text_state["c"][server] = total
                        if valid and server in self.text_state["i"]: self.text_state["i"].remove(server)
                        elif not valid and server not in self.text_state["i"]: self.text_state["i"].append(server)
                        self.text_state["t"][server] = result["time"]
                    if main[0][1]: # store in filters_state
                        self.filters_state["c"][server] = total
                        if valid and server in self.filters_state["i"]: self.filters_state["i"].remove(server)
                        elif not valid and server not in self.filters_state["i"]: self.filters_state["i"].append(server)
                        self.filters_state["t"][server] = result["time"]
                    main.pop()
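
The file id above packs three unsigned 32-bit integers into 12 raw bytes and hex-encodes them. A self-contained sketch of that encoding, assuming bin2hex behaves like binascii.hexlify:

import struct
import binascii

def make_fid(uri1, uri2, uri3):
    # three unsigned 32-bit ints -> 12 raw bytes -> 24 hex characters
    return binascii.hexlify(struct.pack('III', uri1, uri2, uri3))

print(make_fid(1, 2, 3))   # 010000000200000003000000 on a little-endian machine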
Example #5
0
    search_results["files"] = render_template('files/file.html',files=search_results["files"])
    return jsonify(search_results)

def search_files(query,filters,min_results=0,max_results=30,download=None,last_items=[],query_time=None,extra_wait_time=500, async=False, max_extra_searches=4, non_group=False, order=None, weight_processor=None, tree_visitor=None):
    '''
    Performs a search for files
    '''
    if not last_items and min_results==0:
        min_results=5

    # get the results
    profiler_data={}
    profiler.checkpoint(profiler_data,opening=["sphinx"])

    s = searchd.search(query, filters=filters, start=not bool(last_items), group=True, no_group=non_group, order=order)
    ids = [(bin2hex(fileid), server, sphinxid, weight, sg) for (fileid, server, sphinxid, weight, sg) in s.get_results((1.4, 0.1), last_items=last_items, min_results=min_results, max_results=max_results, extra_browse=0 if max_results>30 else None, weight_processor=weight_processor, tree_visitor=tree_visitor)]

    stats = s.get_stats()

    profiler.checkpoint(profiler_data,opening=["entities"], closing=["sphinx"])

    results_entities = list(set(int(aid[4])>>32 for aid in ids if int(aid[4])>>32))
    ntts = {int(ntt["_id"]):ntt for ntt in entitiesdb.get_entities(results_entities)} if results_entities else {}
    profiler.checkpoint(profiler_data, closing=["entities"])
    '''# fetch related entities
    if ntts:
        rel_ids = list(set(eid for ntt in ntts.itervalues() for eids in ntt["r"].itervalues() if "r" in ntt for eid in eids))
        ntts.update({int(ntt["_id"]):ntt for ntt in entitiesdb.get_entities(rel_ids, None, (False, [u"episode"]))})
    '''

    result = {"time": max(stats["t"].itervalues()) if stats["t"] else 0, "total_found": stats["cs"]}
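
The checkpoint calls above open and close named timers in pairs, so each phase (sphinx, then entities) gets its own elapsed time recorded in profiler_data. A hypothetical minimal profiler with that interface:

import time

class Profiler(object):
    # hypothetical sketch of the checkpoint interface used above
    def __init__(self):
        self.open_at = {}

    def checkpoint(self, data=None, opening=(), closing=()):
        now = time.time()
        target = data if data is not None else {}
        for name in closing:
            # close a timer opened by an earlier checkpoint
            target[name] = now - self.open_at.pop(name)
        for name in opening:
            self.open_at[name] = now

profiler = Profiler()
profiler_data = {}
profiler.checkpoint(profiler_data, opening=["sphinx"])
profiler.checkpoint(profiler_data, opening=["entities"], closing=["sphinx"])
print(profiler_data)   # {'sphinx': <elapsed seconds>}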
Example #6
0
def process_search_results(
    s=None,
    query=None,
    category=None,
    not_category=None,
    title=None,
    zone="",
    last_items=[],
    skip=None,
    limit=70,
    max_limit=50,
    ignore_ids=[],
    show_order=True,
):
    files = []
    files_text = []
    files_dict = None
    results = None
    must_cache = True
    if not title:
        title = (None, 2, False)

    if s:
        ids = [
            result
            for result in (
                (bin2hex(fileid), server, sphinxid, weight, sg)
                for (fileid, server, sphinxid, weight, sg) in s.get_results(
                    (3.0, 0.1),
                    last_items=last_items,
                    skip=skip * 100 if skip else None,
                    min_results=limit,
                    max_results=limit,
                    extra_browse=limit,
                    weight_processor=weight_processor,
                    tree_visitor=tree_visitor,
                )
            )
            if result[0] not in ignore_ids
        ]

        results_entities = list(set(int(aid[4]) >> 32 for aid in ids if int(aid[4]) >> 32))
        ntts = {int(ntt["_id"]): ntt for ntt in entitiesdb.get_entities(results_entities)} if results_entities else {}
        stats = s.get_stats()
        canonical_query = stats["ct"]

        if canonical_query:
            # remove category and not-category from the canonical query
            canonical_query_parts = [
                part
                for part in canonical_query.split("_")
                if not (
                    (not_category and part == u"-(" + not_category + ")")
                    or (category and part == u"(" + category + ")")
                )
            ]

            canonical_query = (
                "_".join(canonical_query_parts)
                if any(len(part) >= WORD_SEARCH_MIN_LEN or part in NGRAM_CHARS for part in canonical_query_parts)
                else ""
            )

        sure = stats["s"]
        if (not sure) or ("total_sure" in stats and not stats["total_sure"]):
            g.must_cache = 0
            cache.cacheme = False
    else:
        sure = True
        canonical_query = ""

    # ~ ids= [("2a6a52f7ad943af97f57ee79","1",0,0,0),("fd83615ca1e57647491b3744","1",0,0,0)]
    # ~ ids= ["2a6a52f7ad943af97f57ee79", "fd83615ca1e57647491b3744"]
    data_filtered = get_data_filtered()
    ids = [item[0] for item in data_filtered]

    filtered = {item[0]: {"query": item[1], "blocked": item[2]} for item in data_filtered}
    stats = {"cs": 133}
    ntts = {}

    # do not run blocked searches
    if canonical_query:
        prepared_phrase = blacklists.prepare_phrase(canonical_query.replace("_", " "))
        if (
            prepared_phrase in blacklists["forbidden"]
            or prepared_phrase in blacklists["searchblocked"]
            or (prepared_phrase in blacklists["misconduct"] and prepared_phrase in blacklists["underage"])
        ):
            g.blacklisted_content = True

    # if the canonical query is empty, results are only of interest for searches with a null query (rankings)
    if (g.show_blacklisted_content or not g.blacklisted_content) and (canonical_query or not query):
        if ids:
            files_dict = {str(f["_id"]): prepare_data(f, text=query, ntts=ntts) for f in filesdb.get_files(ids, s, 1)}

            # sort results and attach search information
            position = 0
            for search_result in ids:
                fid = search_result
                if fid in files_dict and files_dict[fid]:
                    afile = files_dict[fid]
                    afile["search"] = search_result
                    files.append(afile)
                    files_text.append(afile["view"]["nfn"])

                    afile["view"]["blocked"] = filtered[str(afile["file"]["_id"])]["blocked"]
                    afile["view"]["query"] = filtered[str(afile["file"]["_id"])]["query"]

                    featured_weight = afile["view"]["rating"] + (
                        10 if "images_server" in afile["view"] or "thumbnail" in afile["view"]["md"] else 0
                    )

                    g.featured.append((-featured_weight, position, afile))

                    position += 1

            results = render_template(
                "filters_results.html",
                files=files,
                list_title=title[0] or query or category,
                title_level=title[1],
                title_class=title[2],
                zone=zone,
                show_order=show_order,
            )

        count = min(len(files), max_limit or limit)
        search_info = {
            "time": 0,
            "total_found": stats["cs"],
            "count": count,
            "next": False,
            "files_text": files_text,
            "canonical_query": canonical_query,
            "sure": sure,
        }
    else:
        search_info = {
            "time": 0,
            "total_found": 0,
            "count": 0,
            "next": False,
            "files_text": [],
            "canonical_query": "-",
            "sure": sure,
        }

    # try to avoid memory problems
    del files

    return results, search_info
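
Both this example and Example #3 recover entity ids with int(aid[4]) >> 32: the subgroup value appears to pack an entity id into its high 32 bits, with zero meaning no entity. A tiny illustration with made-up numbers:

# hypothetical packed subgroup value: entity id 42 in the high 32 bits,
# group information 7 in the low 32 bits
sg = (42 << 32) | 7

entity_id = sg >> 32          # 42; a result of 0 would mean "no entity"
low_bits = sg & 0xFFFFFFFF    # 7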