コード例 #1
0
import json, os, sys
from pprint import pprint
from Document.get_document import read_json, make_document



if __name__ == '__main__':
    # Interactive term lookup against a prebuilt inverted index.
    #
    # Loads the bookkeeping table (WEBPAGES_RAW/bookkeeping.json, resolved
    # against the current working directory) and the inverted index
    # (index.json), then repeatedly prompts for a term and prints the
    # bookkeeping entries of up to the first ten postings for that term.

    # raw_input only exists on Python 2; Python 3 renamed it to input.
    # Resolve the prompt function once so the loop runs on either version.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    bookkeeping_path = os.path.join(os.getcwd(), 'WEBPAGES_RAW', "bookkeeping.json")
    lookup = read_json(bookkeeping_path)
    with open('index.json') as index_file:
        inverted_index = json.load(index_file)

    while True:
        query = read_line("Input term you want to find. Otherwise, press !q to quit ")
        if query == "!q":
            break
        if query in inverted_index:
            # Each posting is indexed by its 'doc_id', which is the key into
            # the bookkeeping table (presumably doc id -> URL; confirm
            # against read_json).
            for i, p in enumerate(inverted_index[query][:10], start=1):
                print("{} : {}".format(i, lookup[p['doc_id']]))
        else:
            print("Query {} not found".format(query))
    print("Thanks for searching")
コード例 #2
0
from __future__ import print_function
from flask import Flask, render_template, request
import os
from flask_pymongo import PyMongo
from Document.get_document import read_json
from handle_query import Query
from time import time

# Path of the bookkeeping file, resolved against the current working
# directory at import time.
BOOKKEEPING_PATH = os.path.join(
    os.getcwd(), 'WEBPAGES_RAW', "bookkeeping.json")
# Bookkeeping table loaded once when the module is imported.
LOOKUP = read_json(BOOKKEEPING_PATH)
DEBUG = True  # toggles debug printing

app = Flask(__name__)
# All settings are applied before PyMongo(app) is constructed below.
app.config.update({
    "TEMPLATES_AUTO_RELOAD": True,
    "MONGO_HOST": "127.0.0.1",
    "MONGO_DBNAME": "***REDACTED***",
    "MONGO_USERNAME": "******",
    "MONGO_PASSWORD": "******",
    "MONGO_CONNECT": True,
})
mongo = PyMongo(app)


@app.route('/')
def hello_world():
    """Render and return the ``index.html`` template for requests to ``/``."""
    page = render_template('index.html')
    return page


#https://stackoverflow.com/questions/12277933/send-data-from-a-textbox-into-flask
@app.route('/', methods=['GET', 'POST'])
def query_post():
    if DEBUG:
コード例 #3
0
def insert_documents(number=None):
    """Run the five-stage indexing pipeline and print progress and timings.

    Stages, in order: build documents from the bookkeeping file, tokenize
    each document, reduce the tokenized documents into an index, compute
    TF-IDF on that index, and build one normalized vector per document.

    NOTE: the database persistence step is currently commented out, so the
    collected vectors are built but not stored anywhere by this function.

    Args:
        number: Forwarded unchanged to ``make_document`` (presumably a cap on
            how many documents to build -- confirm against that helper).

    Returns:
        None. All results are reported via ``print``.
    """
    warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
    bookkeeping_path = os.path.join(
        os.getcwd(), 'WEBPAGES_RAW', "bookkeeping.json")

    times = []

    def mark():
        # Record a wall-clock checkpoint; consecutive checkpoints delimit
        # the sections reported at the end.
        times.append(time())

    mark()
    docs = make_document(read_json(bookkeeping_path), number)
    mark()
    print("(1 / 5) Made Documents. Tokenizing...")

    tokenized = []
    for count, document in enumerate(docs):
        tokenizer = TokenizeDocument(document)
        tokenizer.parse()
        tokenized.append(tokenizer)
        # Progress line every 1000 documents (skipping the very first).
        if count > 0 and count % 1000 == 0:
            print("\t -> Tokenized {} Documents".format(count))
    mark()
    print("(2 / 5) Tokenized Documents. Reducing...")

    red_index = ReduceIndex(tokenized)
    red_index.reduce()
    mark()
    print("(3 / 5) Reduced Index. Calculating...")

    red_index.calc_tf_idf()
    mark()
    print("(4 / 5) TF IDF Calculated. Vectorizing...")

    dv_list = []
    for doc_id, terms in red_index.doc_terms.items():
        vector = DocumentVector(doc_id, terms, red_index.doc_metric[doc_id])
        vector.make_vector_frame()
        vector.normalize()
        frame = vector.vector_frame
        # Print any rows whose tf_idf is still above 1 after normalize()
        # (looks like a sanity check on the normalization -- confirm).
        outliers = frame[frame["tf_idf"] > 1]
        if not outliers.empty:
            print(outliers)
        entry = {
            "doc_id": doc_id,
            "term": frame["term"].values.tolist(),
            "tf_idf": frame["tf_idf"].values.tolist(),
        }
        dv_list.append(entry)

    mark()
    print("(5 / 5) Vector Space created.")

    # Persistence is disabled; kept for reference.
    #
    # db = HandleDB()
    #
    # db.database["reduced_terms"].drop()
    # db.database["term_count"].drop()
    # db.database["document_vector"].drop()
    #
    # db.insert_dict(red_index.reduced_terms,
    #                key="term",
    #                value="posting",
    #                collection="reduced_terms")
    # db.insert_dict(red_index.term_count,
    #                key="term",
    #                value="count",
    #                collection="term_count")
    # # print(dv_list)
    # db.insert_list(dv_list, collection="document_vector")

    # With persistence disabled, this last checkpoint closes a section that
    # does no work.
    mark()

    for section, (begin, end) in enumerate(zip(times, times[1:]), start=1):
        elapsed = round(end - begin, 3)
        print("Section {} took {} seconds.".format(section, elapsed))
    seconds = round(times[-1] - times[0], 3)
    print("This process took {} seconds ({})".format(
        seconds, timedelta(seconds=seconds)))