def run_search(dict_file, postings_file, queries_file, results_file):
    """Run searches for every query in queries_file and write the results.

    Args:
        dict_file: The dictionary filename (a pickled dictionary).
        postings_file: The postings file filename.
        queries_file: The query filename, one query per line.
        results_file: The filename to write our output to.
    """
    print('running search on the queries...')
    # Load the pickled dictionary.  NOTE(review): pickle.load on untrusted
    # input can execute arbitrary code -- confirm dict_file is always a
    # locally generated index file.
    with open(dict_file, 'rb') as infile:
        new_dict = pickle.load(infile)
    # Context managers guarantee every file is closed; the original leaked
    # posting_file (it was opened but never closed).
    with open(queries_file, 'r', encoding="utf8") as in_file, \
         open(results_file, 'w', encoding="utf8") as out_file, \
         open(postings_file, 'rb') as posting_file:
        query_list = in_file.read().splitlines()
        # Iterate directly instead of draining the list with pop(0),
        # which is O(n) per call.
        for i, query in enumerate(query_list):
            if not query:
                # Blank query -> blank result line (preserved behavior).
                out_file.write("")
            else:
                out_file.write(process_query(query, new_dict, posting_file,
                                             use_prf=USE_PRF))
            # Separate results with a newline, but do not add a trailing
            # newline after the final result.
            if i < len(query_list) - 1:
                out_file.write('\n')
def run(args):
    """Build or load a VSM model, rank documents for each query, and write
    a CSV ranked list (header ``query_id,retrieved_docs``) to
    ``args.ranked_list``.

    Args:
        args: Parsed CLI arguments; fields used: build, best, rocchio,
            load_model, query_file, ranked_list.
    """
    if args.build:
        model = build(args)
    if args.best:
        # The tuned 'best' configuration always enables Rocchio feedback.
        args.rocchio = True
    try:
        print('loading model from %s' % args.load_model)
        model = VSM(model_path=args.load_model)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any loading failure falls back to a fresh build.
        print('failed to load model, build from raw.')
        model = build(args)
    query_list, query_id = process_query(args.query_file)
    # 'w+' mode kept from the original; the context manager guarantees the
    # output file is closed (the original never closed it).
    with open(args.ranked_list, 'w+') as output_file:
        print('query_id,retrieved_docs', file=output_file)
        for i, query in enumerate(query_list):
            # The score half of the ranking is not needed here.
            doc_id, _doc_score = model.get_ranking(query, args.rocchio)
            # Only the top 100 document ids are reported per query.
            print('%s,%s' % (query_id[i], ' '.join(doc_id[:100])),
                  file=output_file)
def run():
    """Grid-search VSM hyper-parameters on the training queries, printing
    the mean average precision (MAP) achieved by every configuration."""
    import pandas as pd

    model = VSM(model_path='./model.pl')
    query_list, query_id = process_query('../queries/query-train.xml')
    raw = pd.read_csv('../queries/ans_train.csv')[['retrieved_docs']].values
    # Each answer row is a whitespace-separated list of relevant doc ids.
    answer = [row[0].strip().split(' ') for row in raw]
    for cfg in product_dict(params):
        # Apply this parameter combination to the shared model instance.
        model.k1 = cfg['k1']
        model.b = cfg['b']
        model.k3 = cfg['k3']
        model.tf_type = cfg['tf']
        model.idf_type = cfg['idf']
        model.doc_len_norm = cfg['norm']
        # idf depends on idf_type, so it must be recomputed per config.
        model._compute_idf()
        per_query = []
        for i, query in enumerate(query_list):
            doc_id, doc_score = model.get_ranking(
                query, cfg['rocchio'], cfg['n'], cfg['k'])
            # Score only the top 100 retrieved documents.
            per_query.append(MAP(doc_id[:100], answer[i]))
        mean_map = sum(per_query) / len(per_query)
        model_str = 'k1=%.2f,b=%.2f,k3=%d,idf_type=%s,rocchio=%s,n=%d,k=%d,score=%.5f' % (
            cfg['k1'], cfg['b'], cfg['k3'], cfg['idf'],
            cfg['rocchio'], cfg['n'], cfg['k'], mean_map)
        print(model_str)
def _coord_in_range(val, low, high):
    """Parse *val* as a float and return it if it lies within [low, high];
    return None when it does not parse or is out of range."""
    try:
        f = float(val)
    except (TypeError, ValueError):
        return None
    return f if low <= f <= high else None


def query_api(version=1):
    """ Respond to a query string """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")
    # String with query
    q = request.values.get("q", "")
    # q param contains one or more |-separated strings
    mq = q.split("|")[0:_MAX_QUERY_VARIANTS]
    # Retain only nonempty strings in q
    q = list(filter(None, (m.strip()[0:_MAX_QUERY_LENGTH] for m in mq)))
    # If voice is set, return a voice-friendly string
    voice = bool_from_request(request, "voice")
    # Request a particular voice
    voice_id = request.values.get("voice_id")
    # If test is set to True (which is only possible in a debug setting), we
    # (1) add a synthetic location, if not given; and
    # (2) bypass the cache
    test = Settings.DEBUG and bool_from_request(request, "test")
    # Obtain the client's location, if present
    lat = request.values.get("latitude")
    lon = request.values.get("longitude")
    # Additional client info
    client_id = request.values.get("client_id")
    client_type = request.values.get("client_type")
    client_version = request.values.get("client_version")
    # When running behind an nginx reverse proxy, the client's remote
    # address is passed to the web application via the "X-Real-IP" header
    client_ip = request.remote_addr or request.headers.get("X-Real-IP")
    # Query is marked as private and shouldn't be logged
    private = bool_from_request(request, "private")
    # Attempt to convert the (lat, lon) location coordinates to floats
    location_present = bool(lat) and bool(lon)
    # For testing, insert a synthetic location if not already present
    if not location_present and test:
        lat, lon = _MIDEIND_LOCATION
        location_present = True
    if location_present:
        # Validate both coordinates via the shared helper instead of two
        # duplicated try/float/range blocks; the location is used only
        # when both parse and fall within the valid geographic range.
        lat = _coord_in_range(lat, -90.0, 90.0)
        lon = _coord_in_range(lon, -180.0, 180.0)
        location_present = lat is not None and lon is not None
    # Auto-uppercasing can be turned off by sending autouppercase: false
    # in the query JSON
    auto_uppercase = bool_from_request(request, "autouppercase", True)
    # Send the query to the query processor
    result = process_query(
        q,
        voice,
        auto_uppercase=auto_uppercase,
        location=(lat, lon) if location_present else None,
        remote_addr=client_ip,
        client_type=client_type,
        client_id=client_id,
        client_version=client_version,
        bypass_cache=test,
        private=private,
    )
    # Get URL for response synthesized speech audio
    if voice:
        # If the result contains a "voice" key, return it
        audio = result.get("voice")
        url = get_synthesized_text_url(audio, voice_id=voice_id) if audio else None
        if url:
            result["audio"] = url
        response = result.get("response")
        if response:
            if "sources" in response:
                # A list of sources is not needed for voice results
                del response["sources"]
            if "answers" in response:
                answers = response["answers"]
                # If there is a multi-item answer list
                # in the response, delete all but the first
                # item in the list to simplify the response
                if isinstance(answers, list):
                    del answers[1:]
    else:
        if "voice" in result:
            # Voice result not needed, so don't send it to the client
            del result["voice"]
    return better_jsonify(**result)
try: parser.parse_args(['--help']) except SystemExit: continue elif response == 'flask': app.run(debug=False) else: try: args = parser.parse_args(response.split(' ')) except: print("invalid input detected please retry") continue if parser.error_message: print(parser.error_message) parser.error_message = '' continue if args.linechart: draw_line_chart(args, data_base_path) else: result, error = process_query(args, data_base_path) if error: print(error) continue if args.bar: draw_bar_chart(result, args) else: if len(result) > 0: print_results(result, args) else: print("No records found") print("Bye!")