Example #1
import argparse

import config
# The generate_* module locations are assumed here; adjust to the project layout.
import generate_lexicon
import generate_forward_index
import generate_inverted_index
from indexing.lexicon import Lexicon
from indexing.inverted_index import InvertedIndex
from search.search import Search


def main():
	parser = argparse.ArgumentParser()
	subparser = parser.add_subparsers(dest='subparser')

	lexicon_argparser = subparser.add_parser("generate_lexicon")
	lexicon_argparser.add_argument('--b_range', type=str, help="Start and end of the batch number range to create/update the lexicon from, e.g. 1,3")
	lexicon_argparser.add_argument('--d', type=int, default=0, help="Print demo results.")

	forward_index_argparser = subparser.add_parser("generate_forward_index")
	forward_index_argparser.add_argument('--b_range', type=str, help="Start and end of the batch number range to create/update the forward index from, e.g. 1,3")
	forward_index_argparser.add_argument('--d', type=int, default=0, help="Print demo results.")

	inverted_index_argparser = subparser.add_parser("generate_inverted_index")
	inverted_index_argparser.add_argument('--b', type=str, help="Comma-separated forward index batch names to create the inverted index from.")
	inverted_index_argparser.add_argument('--d', type=int, default=0, help="Print demo results.")

	search = subparser.add_parser("search")
	search.add_argument("--q", type=str, help="Search Query.")

	args = parser.parse_args()

	if args.subparser == 'generate_lexicon':
		batch_range = list(map(int, args.b_range.split(",")))
		generate_lexicon.main(*batch_range, demo=args.d)
	elif args.subparser == 'generate_forward_index':
		batch_range = list(map(int, args.b_range.split(",")))
		generate_forward_index.main(*batch_range, demo=args.d)
	elif args.subparser == 'generate_inverted_index':
		batches = args.b.split(',')
		generate_inverted_index.main(batches, demo=args.d)
	elif args.subparser == 'search':
		lexicon = Lexicon(config.LEXICON_PATH)
		inverted_index = InvertedIndex(config.INVERTED_INDEX_BARRELS_PATH, config.INVERTED_INDEX_BARRELS_TEMP_PATH, len(lexicon), config.INVERTED_INDEX_BARREL_SIZE)
		search = Search(lexicon, inverted_index)
		print(search.search(args.q))
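
For reference, a minimal sketch of how the subcommands above map onto the underlying entry points; the batch numbers and barrel name are illustrative:

# Equivalent direct calls to what the CLI dispatches to:
generate_lexicon.main(1, 3, demo=1)                    # generate_lexicon --b_range 1,3 --d 1
generate_forward_index.main(1, 3, demo=0)              # generate_forward_index --b_range 1,3
generate_inverted_index.main(["batch_001"], demo=0)    # generate_inverted_index --b batch_001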
Example #2
    def __init__(self):
        self.lexicon = Lexicon(DICT_PATH)

        self.indexedDocs = self.loadIndexedDocs()
        self.metadata = self.loadMetadata()

        self.forwardIndexer = ForwardIndexer(self.indexedDocs)
        self.invertedIndexer = InvertedIndexer()
def main():
    lexicon = Lexicon(config.LEXICON_PATH)

    # Build the lexicon from the first dataset batch, then update it with the full dataset.
    lexicon.generate_lexicon(list(config.dataset_files(1)))
    lexicon.generate_lexicon(list(config.dataset_files()))
    lexicon_dict = lexicon.get_lexicon_dict()
    print(len(lexicon_dict))
    word_id = lexicon.get_word_id("Dear")
    print(word_id)
    missing_word_id = lexicon.get_word_id("blablabla")  # a word that should not be in the lexicon
    print(missing_word_id)
import os
import pickle
import concurrent.futures

import config
from indexing.lexicon import Lexicon
# The ForwardIndex module location is assumed here; adjust to the project layout.
from indexing.forward_index import ForwardIndex


def main(batch_start, batch_end, demo=False):

	lexicon = Lexicon(config.LEXICON_PATH)

	forward_index = ForwardIndex(config.FORWARD_INDEX_BARRELS_PATH, lexicon)

	with concurrent.futures.ThreadPoolExecutor() as executor:

		threads = []

		# A single batch runs on one worker; otherwise the range is split in half across two workers.
		if batch_start == batch_end:
			batch_1_thread = executor.submit(forward_index.add_to_forward_index, config.dataset_files(batch_start, batch_start + 1), f"batch_00{batch_start}")
			threads.append(batch_1_thread)
		else:
			mid = (batch_start + batch_end) // 2
			batch_1_thread = executor.submit(forward_index.add_to_forward_index, config.dataset_files(batch_start, mid), f"batch_00{batch_start}")
			batch_2_thread = executor.submit(forward_index.add_to_forward_index, config.dataset_files(mid, batch_end), f"batch_00{mid}")
			threads.append(batch_1_thread)
			threads.append(batch_2_thread)

		for f in concurrent.futures.as_completed(threads):
			print(f"{f.result()} forward_index created.")

	if not demo: return
	
	### DEMO PRINTING ###

	print('-'*32)

	PRINT_BARREL = 0
	PRINT_N = 2

	print("### DEMO TEST ###")
	print(f"{PRINT_N} entrie(s) from barrel {PRINT_BARREL}:")

	with open(os.path.join(config.FORWARD_INDEX_BARRELS_PATH, f"batch_00{PRINT_BARREL}"), 'rb') as forward_index_file:
		forward_index = pickle.load(forward_index_file)

		for i, doc_id in enumerate(forward_index):

			if i >= PRINT_N: break

			print(f"\t{doc_id}:")
			for word_id in forward_index[doc_id]:
				print(f"\t\t{word_id}: {forward_index[doc_id][word_id]}")

	print('-'*32)
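
A quick way to exercise this entry point; when batch_start differs from batch_end the range is split in half across two worker futures (the batch numbers here are illustrative):

if __name__ == '__main__':
	# Builds barrels batch_000 and batch_001, then prints two demo entries from barrel 0.
	main(0, 2, demo=True)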
import os
import pickle
import concurrent.futures

import config
from indexing.lexicon import Lexicon
from indexing.inverted_index import InvertedIndex


def main(forward_index_batches, demo=False):
    lexicon = Lexicon(config.LEXICON_PATH)

    inverted_index = InvertedIndex(config.INVERTED_INDEX_BARRELS_PATH,
                                   config.INVERTED_INDEX_BARRELS_TEMP_PATH,
                                   len(lexicon),
                                   config.INVERTED_INDEX_BARREL_SIZE)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        threads = []
        for fib in forward_index_batches:
            thread = executor.submit(
                inverted_index.invert_forward_index,
                os.path.join(config.FORWARD_INDEX_BARRELS_PATH, fib))
            threads.append(thread)  # keep the future so it can be awaited below

        for f in concurrent.futures.as_completed(threads):
            print(f"{f.result()} created.")

    inverted_index.merge_buckets()

    if not demo: return

    ### DEMO PRINTING ###

    print('-' * 32)

    PRINT_BARREL = 3
    PRINT_N = 30

    print("### DEMO TEST ###")
    print(f"{PRINT_N} entries from barrel {PRINT_BARREL}:")

    with open(
            os.path.join(config.INVERTED_INDEX_BARRELS_PATH,
                         f"{PRINT_BARREL:03}_inverted"),
            'rb') as inverted_index_file:
        inverted_index = pickle.load(inverted_index_file)

        for i, word_id in enumerate(inverted_index):
            if i >= PRINT_N: break

            print(f"\t{word_id}:")

            for doc in inverted_index[word_id]:
                print(f"\t\t{doc}: {inverted_index[word_id][doc]}")
def main(batch_start, batch_end, demo=False):
    lexicon = Lexicon(config.LEXICON_PATH)

    lexicon.generate_lexicon(config.dataset_files(batch_start, batch_end))

    print(f"Lexicon created with {len(lexicon)} words.")
    print('-' * 32)

    if not demo: return

    ### DEMO PRINTING ###

    PRINT_N = 10
    print("### DEMO TEST ###")
    print(f"{PRINT_N} words from the lexicon are: ")

    lexicon_dict = lexicon.get_lexicon_dict()

    for i, word in enumerate(lexicon_dict):
        if i >= PRINT_N: break
        print(f"\t{word}: {lexicon_dict[word]}")

    print('-' * 32)
def main():
    lexicon = Lexicon(config.LEXICON_PATH)
    forward_index = ForwardIndex(config.FORWARD_INDEX_BARRELS_PATH, lexicon)

    forward_index.add_to_forward_index(list(config.dataset_files(0, 1)),
                                       'aftab_test_forward_1')
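
To sanity-check the barrel written above, the same pickle-loading pattern as the demo sections applies; a minimal sketch, assuming the barrel file name matches the batch name passed to add_to_forward_index:

import os
import pickle

import config

# Load the barrel and report how many documents it holds.
with open(os.path.join(config.FORWARD_INDEX_BARRELS_PATH, 'aftab_test_forward_1'), 'rb') as barrel_file:
    barrel = pickle.load(barrel_file)
    print(f"{len(barrel)} documents in barrel.")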
import os
import json

from flask import Flask, render_template
from flask_restful import Api, Resource  # flask_restful assumed for Api/Resource
from flask_cors import CORS, cross_origin

import config
from indexing.lexicon import Lexicon
from indexing.inverted_index import InvertedIndex
from search.search import Search

# Flask app, API and CORS setup
app = Flask(__name__)
api = Api(app)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'

# Indexes
lexicon = Lexicon(config.LEXICON_PATH)
inverted_index = InvertedIndex(config.INVERTED_INDEX_BARRELS_PATH,
                               config.INVERTED_INDEX_BARRELS_TEMP_PATH,
                               len(lexicon), config.INVERTED_INDEX_BARREL_SIZE)
search = Search(lexicon, inverted_index)


# serves the search frontend page
class Setup(Resource):
    @cross_origin()
    def get(self):
        return render_template('index.html')


class Document(Resource):
    def get(self, doc_id):