def run_search(dict_file, postings_file, queries_file, results_file):
    """Run every query in the queries file and write the results to a file.

    The pickled dictionary is loaded from ``dict_file`` and each line of
    ``queries_file`` is treated as one query. Every non-empty query is
    passed to ``process_query`` together with the dictionary and the open
    postings file, and the string it returns becomes the corresponding
    line of ``results_file``. An empty query line yields an empty result
    line. Result lines are separated by a single newline and no newline
    is written after the last one.

    Args:
        dict_file: The dictionary filename
        postings_file: The postings file filename
        queries_file: The query filename
        results_file: The filename to write our output
    """
    print('running search on the queries...')

    # NOTE(review): pickle.load can execute arbitrary code while
    # unpickling; only point this at dictionary files you trust.
    with open(dict_file, 'rb') as infile:
        new_dict = pickle.load(infile)

    with open(queries_file, 'r', encoding="utf8") as in_file:
        query_list = in_file.read().splitlines()

    # Context managers guarantee that both handles (including the postings
    # file) are closed even when process_query raises part-way through.
    with open(results_file, 'w', encoding="utf8") as out_file, \
            open(postings_file, 'rb') as posting_file:
        for index, query in enumerate(query_list):
            # Separator goes *between* results so the file has no
            # trailing newline.
            if index:
                out_file.write('\n')
            if query:
                out_file.write(
                    process_query(query, new_dict, posting_file,
                                  use_prf=USE_PRF))
Example #2
0
def run(args):
    """Load (or build) the retrieval model and write a ranked list per query.

    The model is loaded from ``args.load_model``; if loading fails for any
    ordinary error it is built from scratch with ``build(args)``. Each
    query obtained from ``process_query(args.query_file)`` is ranked with
    ``model.get_ranking`` and the first 100 document ids are written to
    ``args.ranked_list`` as ``query_id,retrieved_docs`` CSV rows, preceded
    by a header line.

    Args:
        args: Parsed options. The attributes read here are ``build``,
            ``best``, ``rocchio``, ``load_model``, ``query_file`` and
            ``ranked_list``. ``args.rocchio`` is forced to ``True`` when
            ``args.best`` is set.
    """
    if args.build:
        model = build(args)

    if args.best:
        args.rocchio = True

    # NOTE(review): when loading succeeds, a model built above because of
    # args.build is replaced by the loaded one -- confirm this is intended.
    try:
        print('loading model from %s' % args.load_model)
        model = VSM(model_path=args.load_model)
    except Exception:
        # Catch ordinary failures only: a bare ``except`` would also turn
        # Ctrl-C (KeyboardInterrupt) or SystemExit into a full rebuild.
        print('failed to load model, build from raw.')
        model = build(args)

    query_list, query_id = process_query(args.query_file)
    # The context manager flushes and closes the ranked list even if
    # ranking one of the queries raises.
    with open(args.ranked_list, 'w+') as output_file:
        print('query_id,retrieved_docs', file=output_file)
        for i, query in enumerate(query_list):
            # Only the document ids are needed; the scores are discarded.
            doc_id, _ = model.get_ranking(query, args.rocchio)
            print('%s,%s' % (query_id[i], ' '.join(doc_id[:100])),
                  file=output_file)
Example #3
0
def run():
    """Grid-search the model parameters and print the score of each setting.

    Loads the model from ``./model.pl``, the training queries from
    ``../queries/query-train.xml`` and the expected documents from the
    ``retrieved_docs`` column of ``../queries/ans_train.csv``. For every
    parameter combination produced by ``product_dict(params)`` the model
    attributes are updated, the idf is recomputed, every query is ranked,
    and the top 100 ids are scored with ``MAP`` against the expected
    documents. The average of those scores is printed together with the
    parameter values.
    """
    import pandas as pd

    model = VSM(model_path='./model.pl')
    query_list, _ = process_query('../queries/query-train.xml')
    answer_rows = pd.read_csv(
        '../queries/ans_train.csv')[['retrieved_docs']].values
    # Each cell is a space-separated list of document ids.
    expected_docs = [row[0].strip().split(' ') for row in answer_rows]

    report_format = ('k1=%.2f,b=%.2f,k3=%d,idf_type=%s,rocchio=%s,'
                     'n=%d,k=%d,score=%.5f')

    for setting in product_dict(params):
        # Apply this combination to the model before ranking.
        model.k1 = setting['k1']
        model.b = setting['b']
        model.k3 = setting['k3']
        model.tf_type = setting['tf']
        model.idf_type = setting['idf']
        model.doc_len_norm = setting['norm']
        model._compute_idf()

        per_query_scores = []
        for index, query in enumerate(query_list):
            ranked_ids, _ = model.get_ranking(
                query, setting['rocchio'], setting['n'], setting['k'])
            per_query_scores.append(
                MAP(ranked_ids[:100], expected_docs[index]))
        mean_score = sum(per_query_scores) / len(per_query_scores)

        print(report_format % (
            setting['k1'], setting['b'], setting['k3'], setting['idf'],
            setting['rocchio'], setting['n'], setting['k'], mean_score))
Example #4
0
def query_api(version=1):
    """Answer a query request and return the result as JSON.

    Request values read: ``q`` (one or more ``|``-separated query
    variants), ``voice``, ``voice_id``, ``test``, ``latitude``,
    ``longitude``, ``client_id``, ``client_type``, ``client_version``,
    ``private`` and ``autouppercase``.

    Args:
        version: API version; only version 1 is accepted.

    Returns:
        The value of ``better_jsonify``: either an error payload for an
        unsupported version, or the (possibly trimmed) result of
        ``process_query``.
    """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    # Split the q parameter into its |-separated variants, cap their
    # number and length, and drop the ones that end up empty.
    raw_query = request.values.get("q", "")
    variants = raw_query.split("|")[0:_MAX_QUERY_VARIANTS]
    trimmed = (variant.strip()[0:_MAX_QUERY_LENGTH] for variant in variants)
    q = [text for text in trimmed if text]

    # Voice-friendly output, optionally with a specific voice
    voice = bool_from_request(request, "voice")
    voice_id = request.values.get("voice_id")

    # Test mode is only honoured in a debug setting. It (1) supplies a
    # synthetic location when none is given and (2) bypasses the cache.
    test = Settings.DEBUG and bool_from_request(request, "test")

    # Client location, if supplied
    lat = request.values.get("latitude")
    lon = request.values.get("longitude")

    # Additional information about the client
    client_id = request.values.get("client_id")
    client_type = request.values.get("client_type")
    client_version = request.values.get("client_version")
    # When running behind an nginx reverse proxy, the client's remote
    # address is passed to the web application via the "X-Real-IP" header
    client_ip = request.remote_addr or request.headers.get("X-Real-IP")

    # Private queries shouldn't be logged
    private = bool_from_request(request, "private")

    # Both coordinates must be non-empty to count as a location
    location_present = bool(lat) and bool(lon)

    if test and not location_present:
        # For testing, fall back to the synthetic location
        lat, lon = _MIDEIND_LOCATION
        location_present = True

    # Convert the latitude to a float and range-check it
    if location_present:
        try:
            lat = float(lat)
        except ValueError:
            location_present = False
        else:
            location_present = -90.0 <= lat <= 90.0

    # The longitude is only examined when the latitude was acceptable
    if location_present:
        try:
            lon = float(lon)
        except ValueError:
            location_present = False
        else:
            location_present = -180.0 <= lon <= 180.0

    # Auto-uppercasing is on unless the request sends autouppercase: false
    auto_uppercase = bool_from_request(request, "autouppercase", True)

    location = (lat, lon) if location_present else None

    # Hand the query over to the query processor
    result = process_query(
        q,
        voice,
        auto_uppercase=auto_uppercase,
        location=location,
        remote_addr=client_ip,
        client_type=client_type,
        client_id=client_id,
        client_version=client_version,
        bypass_cache=test,
        private=private,
    )

    if not voice:
        # The voice text is not needed, so don't send it to the client
        result.pop("voice", None)
        return better_jsonify(**result)

    # Voice request: attach a URL for the synthesized speech audio, if any
    speech_text = result.get("voice")
    url = None
    if speech_text:
        url = get_synthesized_text_url(speech_text, voice_id=voice_id)
    if url:
        result["audio"] = url

    response = result.get("response")
    if response:
        # A list of sources is not needed for voice results
        if "sources" in response:
            del response["sources"]
        if "answers" in response:
            answer_items = response["answers"]
            # Simplify a multi-item answer list by keeping only its
            # first item
            if isinstance(answer_items, list):
                del answer_items[1:]

    return better_jsonify(**result)
Example #5
0
             try:
                 parser.parse_args(['--help'])
             except SystemExit:
                 continue
         elif response == 'flask':
             app.run(debug=False)
         else:
             try:
                 args = parser.parse_args(response.split(' '))
             except:
                 print("invalid input detected please retry")
                 continue
             if parser.error_message:
                 print(parser.error_message)
                 parser.error_message = ''
                 continue
             if args.linechart:
                 draw_line_chart(args, data_base_path)
             else:
                 result, error = process_query(args, data_base_path)
                 if error:
                     print(error)
                     continue
                 if args.bar:
                     draw_bar_chart(result, args)
                 else:
                     if len(result) > 0:
                         print_results(result, args)
                     else:
                         print("No records found")
 print("Bye!")