"""Interactive console lookup of single terms in a prebuilt inverted index.

Reads ``WEBPAGES_RAW/bookkeeping.json`` and ``index.json`` relative to the
current working directory, then repeatedly prompts for one term and prints
the bookkeeping entries of its first ten postings until ``!q`` is entered.
"""
import json
import os
import sys
from pprint import pprint

from Document.get_document import read_json, make_document

if __name__ == '__main__':
    # raw_input only exists on Python 2; Python 3 renamed it to input.
    # Resolving the reader once keeps the script runnable on both.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    bookkeeping_path = os.path.join(os.getcwd(), 'WEBPAGES_RAW', "bookkeeping.json")
    # presumably maps doc_id -> URL; verify against bookkeeping.json
    lookup = read_json(bookkeeping_path)

    with open('index.json') as index_file:
        inverted_index = json.load(index_file)

    while True:
        try:
            query = read_line("Input term you want to find. Otherwise, press !q to quit ")
        except EOFError:
            # stdin was closed (Ctrl-D or an exhausted pipe): treat it like
            # an explicit quit instead of dying with a traceback.
            break

        if query == "!q":
            break

        if query in inverted_index:
            # Only the first ten postings of the term are shown, numbered from 1.
            for i, p in enumerate(inverted_index[query][:10], start=1):
                print("{} : {}".format(i, lookup[p['doc_id']]))
        else:
            print("Query {} not found".format(query))

    print("Thanks for searching")
from __future__ import print_function from flask import Flask, render_template, request import os from flask_pymongo import PyMongo from Document.get_document import read_json from handle_query import Query from time import time BOOKKEEPING_PATH = os.path.join(os.getcwd(), 'WEBPAGES_RAW', "bookkeeping.json") LOOKUP = read_json(BOOKKEEPING_PATH) DEBUG = True #FOR PRINTING app = Flask(__name__) app.config["TEMPLATES_AUTO_RELOAD"] = True app.config["MONGO_HOST"] = "127.0.0.1" app.config["MONGO_DBNAME"] = "***REDACTED***" app.config["MONGO_USERNAME"] = "******" app.config["MONGO_PASSWORD"] = "******" app.config["MONGO_CONNECT"] = True mongo = PyMongo(app) @app.route('/') def hello_world(): return render_template('index.html') #https://stackoverflow.com/questions/12277933/send-data-from-a-textbox-into-flask @app.route('/', methods=['GET', 'POST']) def query_post(): if DEBUG:
def insert_documents(number=None):
    """Run the indexing pipeline and print progress and per-stage timings.

    Stages, in order: build documents from ``WEBPAGES_RAW/bookkeeping.json``
    (relative to the current working directory), tokenize them, reduce the
    index, calculate TF-IDF and build one normalized vector per document.

    NOTE: the database persistence step at the end is commented out, so the
    computed index and vectors are discarded and the function returns
    ``None``.

    :param number: forwarded unchanged to ``make_document``; presumably caps
        how many documents are processed (``None`` for all) -- confirm
        against ``make_document``.
    """
    # Suppress UserWarnings raised from bs4; this alters the process-wide
    # warnings filter, not just this call.
    warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

    bookkeeping_path = os.path.join(os.getcwd(), 'WEBPAGES_RAW', "bookkeeping.json")

    # One timestamp per stage boundary; consecutive differences are reported
    # at the end.
    times = [time()]

    docs = make_document(read_json(bookkeeping_path), number)
    times.append(time())
    print("(1 / 5) Made Documents. Tokenizing...")

    td_list = []
    # Count from 1 so the progress line reports how many documents have
    # actually been tokenized (a 0-based index would be one short).
    for count, doc in enumerate(docs, start=1):
        td = TokenizeDocument(doc)
        td.parse()
        td_list.append(td)
        if count % 1000 == 0:
            print("\t -> Tokenized {} Documents".format(count))
    times.append(time())
    print("(2 / 5) Tokenized Documents. Reducing...")

    red_index = ReduceIndex(td_list)
    red_index.reduce()
    times.append(time())
    print("(3 / 5) Reduced Index. Calculating...")

    red_index.calc_tf_idf()
    times.append(time())
    print("(4 / 5) TF IDF Calculated. Vectorizing...")

    dv_list = []
    for doc_id, terms in red_index.doc_terms.items():
        dv = DocumentVector(doc_id, terms, red_index.doc_metric[doc_id])
        dv.make_vector_frame()
        dv.normalize()

        frame = dv.vector_frame
        # Sanity check: print any rows whose tf_idf is still above 1 after
        # normalize().
        check = frame[frame["tf_idf"] > 1]
        if not check.empty:
            print(check)

        dv_list.append({
            "doc_id": doc_id,
            "term": frame["term"].values.tolist(),
            "tf_idf": frame["tf_idf"].values.tolist(),
        })
    times.append(time())
    print("(5 / 5) Vector Space created.")

    # NOTE: persistence is currently disabled; the block below is kept as a
    # record of the intended database writes.
    # db = HandleDB()
    #
    # db.database["reduced_terms"].drop()
    # db.database["term_count"].drop()
    # db.database["document_vector"].drop()
    #
    # db.insert_dict(red_index.reduced_terms,
    #                key="term",
    #                value="posting",
    #                collection="reduced_terms")
    # db.insert_dict(red_index.term_count,
    #                key="term",
    #                value="count",
    #                collection="term_count")
    #
    # print(dv_list)
    # db.insert_list(dv_list, collection="document_vector")

    # Closes the (disabled) persistence stage, so the last reported section
    # only measures the time between these two timestamps.
    times.append(time())

    # Pair each timestamp with its successor to get one duration per stage.
    for section, (begin, end) in enumerate(zip(times, times[1:]), start=1):
        print("Section {} took {} seconds.".format(section, round(end - begin, 3)))

    seconds = round(times[-1] - times[0], 3)
    print("This process took {} seconds ({})".format(
        seconds, timedelta(seconds=seconds)))