def parse_folder(path):
    """
    Parses all .bib files in given folder.

    Files are submitted to a process pool (one task per file); the parsed
    items are merged, sorted by const.DEFAULT_ORDER_BY, indexed and finalized.

    Returns a tuple (parsed_items, search_index) containing all items found.
    Raises Exception if path is not an existing directory.
    """
    if not os.path.isdir(path):
        raise Exception("Path to folder expected")

    files = utils.search_in_folder(
        path,
        lambda candidate: candidate.endswith(".bib")
    )

    parsed_items = []
    # The context manager guarantees that the pool is shut down even when
    # one of the workers raises and future.result() re-raises it here.
    with concurrent.futures.ProcessPoolExecutor(
        max_workers=multiprocessing.cpu_count()
    ) as executor:
        futures = [
            executor.submit(
                BibParser()._parse_file,
                os.path.join(path, filename)
            )
            for filename in files
        ]
        # Collect results in submission order
        for future in futures:
            parsed_items.extend(future.result())

    parsed_items = sorted(
        parsed_items,
        key=BibItem.key_to_key_func(const.DEFAULT_ORDER_BY)
    )

    item_index = search_index.Index(parsed_items)
    fin_ctx = FinalizingContext(item_index)
    for item in parsed_items:
        item.finalize_item_set(fin_ctx)
    # NOTE(review): presumably finalization changes the items, so the index
    # is refreshed afterwards - confirm against search_index.Index.update
    item_index.update(parsed_items)
    return (parsed_items, item_index)
def test_inverted_index_search(self):
    """
    Checks that the keywords subindex contains both the direct key
    and its prefixed (inverted) counterpart, and that the inverted key
    resolves to the single expected item
    """
    parsed = bib_parser.BibParser()._parse_string(TEST_ITEMS)
    item_index = index.Index(parsed)

    direct_key = "cinquecento"
    inverted_key = const.INVERTED_INDEX_KEY_PREFIX + direct_key

    keywords_subindex = item_index["keywords"]
    for expected_key in (direct_key, inverted_key):
        self.assertIn(expected_key, keywords_subindex)

    matched = item_index["keywords"][inverted_key]
    self.assertEqual(len(matched), 1)
    first_match = utils.first(matched)
    self.assertEqual(first_match.id(), "id_2")
def test_search_items(self):
    """
    Checks that parsed items can be found both through search predicates
    (author, year range) and through keyword index lookups
    """
    items = bib_parser.BibParser()._parse_string(TEST_ITEMS)
    item_index = index.Index(items)

    def count(iterable):
        # materializes the iterable in order to measure it
        return len(list(iterable))

    by_author = search.search_for_iterable("author", "Петров")
    self.assertEqual(count(filter(by_author, items)), 1)

    # each (year_from, year_to) pair is expected to match exactly one item
    year_ranges = (
        (1825, 1825),  # exact match
        (1500, 1600),  # partial intersection
        (1499, 1501),  # inner containment
        (1400, 1600),  # outer containment
    )
    for year_from, year_to in year_ranges:
        by_years = search.and_([
            search.search_for("year_from", year_from),
            search.search_for("year_to", year_to)
        ])
        self.assertEqual(count(filter(by_years, items)), 1)

    # single keyword lookup
    self.assertEqual(count(item_index["keywords"]["grumbling"]), 1)

    # intersection of two keyword lookups
    both_keywords = (
        item_index["keywords"]["cinquecento"] &
        item_index["keywords"]["historical dance"]
    )
    self.assertEqual(count(both_keywords), 1)
def test_parse_string(self):
    """
    Tests if string can be successfully parsed by BibParser
    """
    items = bib_parser.BibParser()._parse_string(TEST_ITEMS)
    item_index = index.Index(items)

    # Keys starting with "!" are excluded from the language set
    # NOTE(review): "!" is presumably const.INVERTED_INDEX_KEY_PREFIX - confirm
    languages = {
        langid
        for langid in item_index["langid"].keys()
        if not langid.startswith("!")
    }
    keywords = set(item_index["keywords"].keys())

    self.assertEqual(len(items), 2)
    self.assertEqual(languages, EXPECTED_LANGUAGES)
    self.assertEqual(keywords, EXPECTED_KEYWORDS)

    # The title of the parsed item must not contain curly braces;
    # assertNotIn reports both operands on failure
    item1 = next(iter(item_index["id"]["id_1"]))
    title = item1.title()
    self.assertNotIn('{', title)
    self.assertNotIn('}', title)