Example #1

class Indexer:
    def __init__(self):
        self.lexicon = Lexicon(DICT_PATH)

        self.indexedDocs = self.loadIndexedDocs()
        self.metadata = self.loadMetadata()

        self.forwardIndexer = ForwardIndexer(self.indexedDocs)
        self.invertedIndexer = InvertedIndexer()
Example #2

def main():
    lexicon = Lexicon(config.LEXICON_PATH)

    # Generate the lexicon, first from a single batch, then from the full dataset.
    lexicon.generate_lexicon(list(config.dataset_files(1)))
    lexicon.generate_lexicon(list(config.dataset_files()))

    lexicon_dict = lexicon.get_lexicon_dict()
    print(len(lexicon_dict))

    # Look up a word that should exist, then one that should not.
    word_id = lexicon.get_word_id("Dear")
    print(word_id)
    missing_word_id = lexicon.get_word_id("blablabla")
    print(missing_word_id)
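
A minimal sketch of guarding such a lookup; the None sentinel is an assumption, since the snippet only prints whatever get_word_id returns for an unknown word:

word_id = lexicon.get_word_id("blablabla")
if word_id is None:  # sentinel assumed; adjust if the lexicon returns e.g. -1
    print("word not in lexicon")
else:
    print(word_id)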
Example #3
def main():
	parser = argparse.ArgumentParser()
	subparser = parser.add_subparsers(dest='subparser')

	lexicon_argparser = subparser.add_parser("generate_lexicon")
	lexicon_argparser.add_argument('--b_range', type=str, help="Batch number range (start,end) to create/update the lexicon from, e.g. 1,3")
	lexicon_argparser.add_argument('--d', type=int, default=0, help="Print demo results.")

	forward_index_argparser = subparser.add_parser("generate_forward_index")
	forward_index_argparser.add_argument('--b_range', type=str, help="Batch number range (start,end) to create/update the forward index from, e.g. 1,3")
	forward_index_argparser.add_argument('--d', type=int, default=0, help="Print demo results.")

	inverted_index_argparser = subparser.add_parser("generate_inverted_index")
	inverted_index_argparser.add_argument('--b', type=str, help="Forward index batches to create the inverted index from, comma-separated.")
	inverted_index_argparser.add_argument('--d', type=int, default=0, help="Print demo results.")

	search = subparser.add_parser("search")
	search.add_argument("--q", type=str, help="Search Query.")

	args = parser.parse_args()

	if args.subparser == 'generate_lexicon':
		batch_range = list(map(int, args.b_range.split(",")))
		generate_lexicon.main(*batch_range, demo=args.d)
	elif args.subparser == 'generate_forward_index':
		batch_range = list(map(int, args.b_range.split(",")))
		generate_forward_index.main(*batch_range, demo=args.d)
	elif args.subparser == 'generate_inverted_index':
		batches = args.b.split(',')
		generate_inverted_index.main(batches, demo=args.d)
	elif args.subparser == 'search':
		lexicon = Lexicon(config.LEXICON_PATH)
		inverted_index = InvertedIndex(config.INVERTED_INDEX_BARRELS_PATH, config.INVERTED_INDEX_BARRELS_TEMP_PATH, len(lexicon), config.INVERTED_INDEX_BARREL_SIZE)
		search = Search(lexicon, inverted_index)
		print(search.search(args.q))
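
For completeness, a standard entry-point guard makes the subcommands above callable from a shell; the sample invocations are illustrative and the script name cli.py is a placeholder, not taken from the source:

# Hypothetical invocations (cli.py is a placeholder name):
#   python cli.py generate_lexicon --b_range 1,3 --d 1
#   python cli.py generate_inverted_index --b batch_001,batch_002
#   python cli.py search --q "hello world"
if __name__ == '__main__':
	main()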
Example #4

def main(batch_start, batch_end, demo=False):
    lexicon = Lexicon(config.LEXICON_PATH)

    lexicon.generate_lexicon(config.dataset_files(batch_start, batch_end))

    print(f"Lexicon created with {len(lexicon)} words.")
    print('-' * 32)

    if not demo: return

    ### DEMO PRINTING ###

    PRINT_N = 10
    print("### DEMO TEST ###")
    print(f"{PRINT_N} words from the lexicon are: ")

    lexicon_dict = lexicon.get_lexicon_dict()

    for i, word in enumerate(lexicon_dict):
        if i >= PRINT_N: break
        print(f"\t{word}: {lexicon_dict[word]}")

    print('-' * 32)
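
As a side note, the break-at-PRINT_N loop above can be written with itertools.islice; an equivalent sketch:

from itertools import islice

# islice yields the first PRINT_N keys without an explicit counter
for word in islice(lexicon_dict, PRINT_N):
    print(f"\t{word}: {lexicon_dict[word]}")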
Example #5

def main(batch_start, batch_end, demo=False):

	lexicon = Lexicon(config.LEXICON_PATH)

	forward_index = ForwardIndex(config.FORWARD_INDEX_BARRELS_PATH, lexicon)

	with concurrent.futures.ThreadPoolExecutor() as executor:

		threads = []

		if batch_start == batch_end:
			batch_1_thread = executor.submit(forward_index.add_to_forward_index, config.dataset_files(batch_start, batch_start + 1), f"batch_00{batch_start}")
			threads.append(batch_1_thread)
		else:
			mid = (batch_start + batch_end) // 2
			batch_1_thread = executor.submit(forward_index.add_to_forward_index, config.dataset_files(batch_start, mid), f"batch_00{batch_start}")
			batch_2_thread = executor.submit(forward_index.add_to_forward_index, config.dataset_files(mid, batch_end), f"batch_00{mid}")
			threads.append(batch_1_thread)
			threads.append(batch_2_thread)

		for f in concurrent.futures.as_completed(threads):
			print(f"{f.result()} forward_index created.")

	if not demo: return
	
	### DEMO PRINTING ###

	print('-'*32)

	PRINT_BARREL = 0
	PRINT_N = 2

	print("### DEMO TEST ###")
	print(f"{PRINT_N} entrie(s) from barrel {PRINT_BARREL}:")

	with open(os.path.join(config.FORWARD_INDEX_BARRELS_PATH, f"batch_00{PRINT_BARREL}"), 'rb') as forward_index_file:
		forward_index = pickle.load(forward_index_file)

		for i, doc_id in enumerate(forward_index):

			if i >= PRINT_N: break

			print(f"\t{doc_id}:")
			for word_id in forward_index[doc_id]:
				print(f"\t\t{word_id}: {forward_index[doc_id][word_id]}")

	print('-'*32)
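
The snippet splits the [batch_start, batch_end) range into at most two submissions. A sketch of the same idea generalized to n near-equal slices; the helper name split_ranges is ours, not from the source:

import math

def split_ranges(start, end, n):
    # Split [start, end) into at most n contiguous, near-equal sub-ranges.
    step = max(1, math.ceil((end - start) / n))
    bounds = list(range(start, end, step)) + [end]
    return list(zip(bounds[:-1], bounds[1:]))

# split_ranges(0, 10, 3) -> [(0, 4), (4, 8), (8, 10)]
# Each (lo, hi) pair could then be submitted just like the two batches above.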
Example #6

def main(forward_index_batches, demo=False):
    lexicon = Lexicon(config.LEXICON_PATH)

    inverted_index = InvertedIndex(config.INVERTED_INDEX_BARRELS_PATH,
                                   config.INVERTED_INDEX_BARRELS_TEMP_PATH,
                                   len(lexicon),
                                   config.INVERTED_INDEX_BARREL_SIZE)

    with concurrent.futures.ThreadPoolExecutor() as executor:
        threads = []
        for fib in forward_index_batches:
            thread = executor.submit(
                inverted_index.invert_forward_index,
                os.path.join(config.FORWARD_INDEX_BARRELS_PATH, fib))
            # Collect the future; otherwise as_completed below has nothing to wait on.
            threads.append(thread)

        for f in concurrent.futures.as_completed(threads):
            print(f"{f.result()} created.")

    inverted_index.merge_buckets()

    if not demo: return

    ### DEMO PRINTING ###

    print('-' * 32)

    PRINT_BARREL = 3
    PRINT_N = 30

    print("### DEMO TEST ###")
    print(f"{PRINT_N} entries from barrel {PRINT_BARREL}:")

    with open(
            os.path.join(config.INVERTED_INDEX_BARRELS_PATH,
                         f"{PRINT_BARREL:03}_inverted"),
            'rb') as inverted_index_file:
        inverted_index = pickle.load(inverted_index_file)

        for i, word_id in enumerate(inverted_index):
            if i >= PRINT_N: break

            print(f"\t{word_id}:")

            for doc in inverted_index[word_id]:
                print(f"\t\t{doc}: {inverted_index[word_id][doc]}")
Example #7

def main():
    lexicon = Lexicon(config.LEXICON_PATH)
    forward_index = ForwardIndex(config.FORWARD_INDEX_BARRELS_PATH, lexicon)

    forward_index.add_to_forward_index(list(config.dataset_files(0, 1)),
                                       'aftab_test_forward_1')
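
Presumably this writes a pickled barrel named aftab_test_forward_1 under FORWARD_INDEX_BARRELS_PATH, mirroring the demo in the forward-index example; a sketch of reading it back:

import os
import pickle

with open(os.path.join(config.FORWARD_INDEX_BARRELS_PATH,
                       'aftab_test_forward_1'), 'rb') as f:
    barrel = pickle.load(f)
print(f"{len(barrel)} documents in the test barrel")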
Example #8

import os
import json

from flask import Flask, render_template
from flask_cors import CORS, cross_origin
from flask_restful import Api, Resource

import config
from indexing.lexicon import Lexicon
from indexing.inverted_index import InvertedIndex
from search.search import Search

# flask app & Api
app = Flask(__name__)
api = Api(app)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'

# Indexes
lexicon = Lexicon(config.LEXICON_PATH)
inverted_index = InvertedIndex(config.INVERTED_INDEX_BARRELS_PATH,
                               config.INVERTED_INDEX_BARRELS_TEMP_PATH,
                               len(lexicon), config.INVERTED_INDEX_BARREL_SIZE)
search = Search(lexicon, inverted_index)


# serves the front-end page
class Setup(Resource):
    @cross_origin()
    def get(self):
        return render_template('index.html')


class Document(Resource):
    def get(self, doc_id):
        ...  # truncated in the source
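
The Document resource is cut off in the source. Typical flask_restful wiring for these resources might look as follows; the route paths and the debug run are assumptions:

# Hypothetical routes (not taken from the source):
api.add_resource(Setup, '/')
api.add_resource(Document, '/document/<int:doc_id>')

if __name__ == '__main__':
    app.run(debug=True)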
Example #9
class Indexer:
    """docstring for Indexer"""
    def __init__(self):
        self.lexicon = Lexicon(DICT_PATH)

        self.indexedDocs = self.loadIndexedDocs()
        self.metadata = self.loadMetadata()

        self.forwardIndexer = ForwardIndexer(self.indexedDocs)
        self.invertedIndexer = InvertedIndexer()

    def addFile(self, dictDir, file):
        """
		arguments:
			- dictDir: the path of the directory containing the
			dictionaries for the forward and the inverted index
			- file: the path to the file that is to be added

		This function updates the lexicon to accommodate the
		new file and adds the file to the forward and inverted
		indexes.

		return: None
		"""

        # if document is already indexed, return
        if self.indexedDocs.get(file[-21:]) is not None:
            print(datetime.now(), "Document already present in index.")
            return

        print(file)
        print(datetime.now(), "Adding document to index.")

        # get author name, title, all texts, url, weightedShares and file path of given file
        author, title, tokens, url, shares, filepath = readFile(file)

        # clean the texts for short and long barreling
        shortTokens = clean(author + " " + title)
        tokens = clean(tokens)

        # add tokens to lexicon
        self.lexicon.processFile(tokens)
        self.lexicon.processFile(shortTokens)

        # get unique, sorted wordIDs present in the file
        wordIDs = sorted({self.lexicon.lexDict[token] for token in tokens})
        shortWordIDs = sorted(
            {self.lexicon.lexDict[token] for token in shortTokens})

        # get all barrels that are to be updated
        barrels = sorted({getBarrel(wordID) for wordID in wordIDs})
        shortBarrels = sorted(
            {getBarrel(wordID) for wordID in shortWordIDs})

        # add data to long and short forward barrels
        shortForwardBarrels, _ = self.forwardIndexer.addFile(dictDir,
                                                             self.lexicon,
                                                             shortTokens,
                                                             shortBarrels,
                                                             short=True)
        forwardBarrels, docID = self.forwardIndexer.addFile(
            dictDir, self.lexicon, tokens, barrels)

        # add data to long and short inverted barrels
        self.invertedIndexer.addFile(dictDir,
                                     shortWordIDs,
                                     docID,
                                     shortBarrels,
                                     shortForwardBarrels,
                                     short=True)
        self.invertedIndexer.addFile(dictDir, wordIDs, docID, barrels,
                                     forwardBarrels)

        print(datetime.now(), "Document added to index.")

        # add documentID into indexedDocs so it is not indexed again
        self.indexedDocs[file[-21:]] = docID

        # store document's metadata
        self.addMetadata(docID, author, title, url, shares, filepath)
        print(docID)
        forwardBarrels.clear()

    def indexDataset(self):
        """
		This function will iterate over the dataset provided
		in DATASET_PATH and will index it. The indexes and
		lexicon will be written to the DICT_PATH directory.

		return: None
		"""
        shortForwardBarrels = dict()
        forwardBarrels = dict()

        print(datetime.now(), "Generating lexicon and forward index.")

        for folder in os.listdir(DATA_PATH):
            FILE_PATH = DATA_PATH + '/' + folder

            for file in tqdm(os.listdir(FILE_PATH)):
                path = FILE_PATH + '/' + file

                # make sure document is not already indexed
                if self.indexedDocs.get(path[-21:]) is not None:
                    continue

                # get author name, title, all texts, url, weightedShares and file path of given file
                author, title, tokens, url, shares, filepath = readFile(path)

                # make tokens for long and short barreling
                shortTokens = clean(author + " " + title)
                tokens = clean(tokens)

                # add tokens to lexicon
                self.lexicon.processFile(shortTokens)
                self.lexicon.processFile(tokens)

                # index short barrels
                self.forwardIndexer.processFile(self.lexicon,
                                                shortForwardBarrels,
                                                shortTokens,
                                                short=True)

                # index long barrels
                self.forwardIndexer.processFile(self.lexicon, forwardBarrels,
                                                tokens)

                # record that document has been indexed
                self.indexedDocs[path[-21:]] = self.forwardIndexer.docID - 1

                # store document's metadata
                self.addMetadata(self.forwardIndexer.docID - 1, author, title,
                                 url, shares, filepath)

        # dump short barrels
        print(datetime.now(), "Writing short forward index to file.")
        self.forwardIndexer.dump(DICT_PATH,
                                 shortForwardBarrels,
                                 overwrite=False,
                                 short=True)

        # dump long barrels
        print(datetime.now(), "Writing long forward index to file.")
        self.forwardIndexer.dump(DICT_PATH, forwardBarrels, overwrite=False)
        forwardBarrels.clear()

        # invert short barrels
        print(datetime.now(), "Generating short inverted index.")
        for file in os.listdir(os.path.join(DICT_PATH,
                                            'short_forward_barrels')):
            # barrel number is parsed out of the filename
            self.invertedIndexer.processFile(DICT_PATH,
                                             file,
                                             int(file[8:-5]),
                                             short=True)

        # invert long barrels
        print(datetime.now(), "Generating long inverted index.")
        for file in os.listdir(os.path.join(DICT_PATH, 'forward_barrels')):
            self.invertedIndexer.processFile(DICT_PATH, file, int(file[8:-5]))

        print(datetime.now(), "Indexing complete.")

    def addMetadata(self, docID, author, title, url, shares, filepath):
        # store arguments in metadata dictionary
        self.metadata[str(docID)] = [title, author, url, shares, filepath]

    def loadIndexedDocs(self):

        # load and return indexedDocs
        try:
            with open(os.path.join(DICT_PATH, 'indexed_docs.json'),
                      'r',
                      encoding="utf8") as f:
                indexedDocs = json.load(f)
        except FileNotFoundError:
            indexedDocs = dict()
        return indexedDocs

    def loadMetadata(self):

        # load and return metadata
        try:
            with open(os.path.join(DICT_PATH, 'metadata.json'),
                      'r',
                      encoding="utf8") as f:
                metadata = json.load(f)
        except FileNotFoundError:
            metadata = dict()
        return metadata
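
A minimal usage sketch for the class above, assuming DICT_PATH and DATA_PATH are configured as in the source; the single-file path in the comment is a placeholder:

indexer = Indexer()

# Index the entire dataset under DATA_PATH in one pass...
indexer.indexDataset()

# ...or add one document incrementally (placeholder path):
# indexer.addFile(DICT_PATH, os.path.join(DATA_PATH, 'folder', 'article.json'))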