class BookInventory(object):
    """Catalog of books backed by a SearchEngine.

    Args:
        filename (str): Path of the catalog file, one book per line.

    Attributes:
        filename (str): Path of the catalog file, one book per line.
        engine (SearchEngine): Engine that stores the Book objects and
            answers queries.
    """

    # Positions of the fields inside one preprocessed catalog entry.
    _BOOK_META_ID_INDEX = 0
    _BOOK_META_TITLE_INDEX = 1
    _BOOK_META_AUTHOR_INDEX = 2

    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    @timed
    def load_books(self):
        """Read the catalog line by line and hand every book to the engine.

        Each line goes through BookDataPreprocessor.preprocess (presumably
        yielding a sequence of string fields); the id, title and author are
        taken from the fixed positions above, and the searchable metadata is
        every field from the title onwards joined with single spaces. Once
        the file is exhausted the engine is started.
        """
        preprocessor = BookDataPreprocessor()
        title_at = self._BOOK_META_TITLE_INDEX
        with open(self.filename) as source:
            for raw_line in source:
                fields = preprocessor.preprocess(raw_line)
                searchable_text = ' '.join(fields[title_at:])
                book_id = fields[self._BOOK_META_ID_INDEX].strip()
                book_title = fields[title_at].strip()
                book_author = fields[self._BOOK_META_AUTHOR_INDEX].strip()
                self.engine.add_object(
                    Book(book_id, book_title, book_author, searchable_text))
        self.engine.start()

    @timed
    def search_books(self, query, n_results=10):
        """Run a query against the indexed books.

        Args:
            query (str): Query string with one or more terms.
            n_results (int): Maximum number of results requested from the
                engine.

        Returns:
            str: The str() of every hit, one per line, or the no-results
            message when the query is empty or nothing matched.
        """
        hits = self.engine.search(query, n_results) if len(query) > 0 else ''
        if len(hits) == 0:
            return self._NO_RESULTS_MESSAGE
        return '\n'.join(str(hit) for hit in hits)

    def books_count(self):
        """Return the count reported by the underlying engine."""
        return self.engine.count()
class wordInventory(object):
    """Inventory of text documents indexed by a SearchEngine.

    Args:
        filename (str): Directory that is walked to find the documents.

    Attributes:
        filename (str): Directory that is walked to find the documents.
        engine (SearchEngine): Engine that indexes the documents and
            answers queries.
        engine2 (SearchEngine): Second engine instance; no method of this
            class uses it, it is kept only so the attribute still exists.
    """

    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()
        self.engine2 = SearchEngine()

    @timed
    def load_words(self):
        """Index every file found under ``self.filename``.

        Each file is read completely into memory and wrapped in a ``Word``
        whose id is a counter starting at 1, assigned in ``os.walk`` order.
        When all files are added the engine is started and then pickled to
        disk through :meth:`saveToFile`.
        """
        logger.info('Loading words from file...')
        iid = 1
        for parent, _dirnames, fnames in os.walk(self.filename):
            for fname in fnames:
                # Build the path from the directory os.walk is currently
                # visiting instead of a hard-coded './Reuters/' prefix, so
                # the configured directory (and its sub-directories) work.
                with open(os.path.join(parent, fname)) as handle:
                    word = handle.read()
                self.engine.add_object(Word(iid, word))
                iid += 1
        self.engine.start()
        self.saveToFile()

    @timed
    def search_words(self, query, n_results=10):
        """Run a query and print the raw engine result to stdout.

        Args:
            query (str): Query string with one or more terms.
            n_results (int): Maximum number of results requested from the
                engine.

        Returns:
            None when at least one result was found (the results are only
            printed, not returned); the no-results message string when the
            query is empty or nothing matched.
        """
        if not query:
            return self._NO_RESULTS_MESSAGE
        result = self.engine.search(query, n_results)
        # Single-argument form prints identically on Python 2 and 3.
        print(result)
        if result:
            return None
        return self._NO_RESULTS_MESSAGE

    def saveToFile(self):
        """Pickle the engine to ``test.engine`` in the working directory."""
        # Binary mode keeps the pickle stream free of newline translation,
        # and the context manager closes the handle even if dump() fails.
        with open('test.engine', 'wb') as fileObject:
            pickle.dump(self.engine, fileObject)

    def words_count(self):
        """Return the count reported by the underlying engine.

        Returns:
            int: Number of words indexed.
        """
        return self.engine.count()
class BookInventory(object):
    """Searchable inventory of books loaded from a catalog file.

    Args:
        filename (str): File name containing book inventory data.

    Attributes:
        filename (str): File name containing book inventory data.
        engine (SearchEngine): Object that receives the books and executes
            the searches.
    """

    # Field positions within a preprocessed catalog entry.
    _BOOK_META_ID_INDEX = 0
    _BOOK_META_TITLE_INDEX = 1
    _BOOK_META_AUTHOR_INDEX = 2

    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    @timed
    def load_books(self):
        """Load every book of the catalog file into the engine.

        The file object is iterated directly, so entries are consumed one
        line at a time. Every line is preprocessed, turned into a ``Book``
        (id, title, author, plus metadata made of all fields from the title
        onwards joined by spaces) and added to the engine, which is started
        after the last entry.
        """
        logger.info('Loading books from file...')
        cleaner = BookDataPreprocessor()
        id_pos = self._BOOK_META_ID_INDEX
        title_pos = self._BOOK_META_TITLE_INDEX
        author_pos = self._BOOK_META_AUTHOR_INDEX
        with open(self.filename) as catalog_file:
            for line in catalog_file:
                parts = cleaner.preprocess(line)
                text_for_index = ' '.join(parts[title_pos:])
                record = Book(
                    parts[id_pos].strip(),
                    parts[title_pos].strip(),
                    parts[author_pos].strip(),
                    text_for_index,
                )
                self.engine.add_object(record)
        self.engine.start()

    @timed
    def search_books(self, query, n_results=10):
        """Search the indexed books for the given terms.

        Args:
            query (str): Query string with one or more terms.
            n_results (int): Desired number of results.

        Returns:
            str: One line per match (the ``str()`` of each engine result),
            or the no-results message if the query is empty or no book
            matched.
        """
        if len(query) > 0:
            found = self.engine.search(query, n_results)
        else:
            found = ''
        if len(found) == 0:
            return self._NO_RESULTS_MESSAGE
        rendered = [str(entry) for entry in found]
        return '\n'.join(rendered)

    def books_count(self):
        """Return the number of books reported by the engine.

        Returns:
            int: Number of books indexed.
        """
        return self.engine.count()
class SearchEngineTests(unittest.TestCase):
    """Unit tests exercising SearchEngine indexing and querying."""

    def setUp(self):
        """Give every test its own empty SearchEngine."""
        self.engine = SearchEngine()

    def _indexed_samples(self):
        """Index the three shared sample documents and return them in order."""
        samples = [
            Indexable(1, "this is an indexable metadata"),
            Indexable(2, "this is an indexable super metadata"),
            Indexable(3, "this is another indexable metadata"),
        ]
        self.build_sample_index(samples)
        return samples

    def test_indexed_doc_count(self):
        """The engine reports as many objects as were indexed."""
        self._indexed_samples()
        self.assertEqual(self.engine.count(), 3)

    def test_existent_term_search(self):
        """Searching known terms yields all samples with the expected scores."""
        first, second, third = self._indexed_samples()
        wanted = [
            IndexableResult(1.414214, first),
            IndexableResult(0.906589, second),
            IndexableResult(0.906589, third),
        ]
        found = self.engine.search("indexable metadata")
        self.assertListEqual(found, wanted)

    def test_non_existent_term_search(self):
        """Searching an unknown term yields an empty list."""
        self._indexed_samples()
        wanted = []
        found = self.engine.search("asdasdasdas")
        self.assertListEqual(found, wanted)

    def test_search_result_limit(self):
        """A result limit of 1 keeps only the first sample's result."""
        best = self._indexed_samples()[0]
        wanted = [IndexableResult(1.414214, best)]
        found = self.engine.search("indexable metadata", 1)
        self.assertListEqual(found, wanted)

    def build_sample_index(self, objects):
        """Add every given object to the engine, then start the engine."""
        for item in objects:
            self.engine.add_object(item)
        self.engine.start()
class wordInventory(object):
    """Inventory of text documents indexed by a SearchEngine.

    Args:
        filename (str): Directory that is walked to find the documents.

    Attributes:
        filename (str): Directory that is walked to find the documents.
        engine (SearchEngine): Engine that indexes the documents and
            answers queries. Replaced wholesale when a pickled engine is
            loaded from disk.
    """

    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    @staticmethod
    def _engine_path(isBinaryWord):
        """Return the pickle file name used for the given engine flavour."""
        return 'test.engine' if isBinaryWord else 'test_noBinary.engine'

    @timed
    def init_engine(self, isFromFile=True, isBinaryWord=False):
        """Prepare the engine, either from a pickle or by indexing files.

        Args:
            isFromFile (bool): When true, restore a previously pickled
                engine instead of reading the documents again.
            isBinaryWord (bool): Forwarded unchanged to ``Word``, to
                ``engine.start`` and to the save/load helpers, where it
                selects which pickle file is used.

        When indexing, every file under ``self.filename`` is read completely
        and wrapped in a ``Word`` whose id is a counter starting at 1,
        assigned in ``os.walk`` order; the engine is then started and
        pickled to disk.
        """
        if isFromFile:
            self.loadFromeFile(isBinaryWord)
            return

        logger.info('Loading words from file...')
        iid = 1
        for parent, _dirnames, fnames in os.walk(self.filename):
            for fname in fnames:
                # Build the path from the directory os.walk is currently
                # visiting instead of a hard-coded './Reuters/' prefix, so
                # the configured directory (and its sub-directories) work.
                with open(os.path.join(parent, fname)) as handle:
                    word = handle.read()
                self.engine.add_object(Word(iid, word, isBinaryWord))
                iid += 1
        self.engine.start(isBinaryWord)
        self.saveToFile(isBinaryWord)

    @timed
    def search_words(self, query, n_results=10, choice=2, SYSNONYM=False):
        """Run a query and print the matching documents to stdout.

        Args:
            query (str): Query string with one or more terms.
            n_results (int): Maximum number of results requested from the
                engine.
            choice (int): 1 uses ``engine.search_bool`` and prints
                ``index   filename``; 2 uses ``engine.search`` and prints
                ``index   filename   score``. Any other value performs no
                search.
            SYSNONYM (bool): Forwarded unchanged to the engine search call.

        Returns:
            None when at least one result was found (the results are only
            printed, not returned); the no-results message string when the
            query is empty, ``choice`` is not 1 or 2, or nothing matched.
        """
        if not query:
            return self._NO_RESULTS_MESSAGE

        # Only the top-level listing is needed, so take the first os.walk
        # entry instead of materialising the whole tree.
        # NOTE: results are mapped to names by position, which relies on
        # this listing having the same order as when the ids were assigned.
        fnames = next(os.walk(self.filename), (None, [], []))[2]

        result = ''
        if choice == 1:
            result = self.engine.search_bool(query, n_results, SYSNONYM)
            for res in result:
                # Three spaces reproduce the output of `print a, " ", b`.
                print('%s   %s' % (res, fnames[res]))
        elif choice == 2:
            result = self.engine.search(query, n_results, SYSNONYM)
            for res in result:
                # Document ids start at 1, list positions at 0.
                position = res.indexable.iid - 1
                print('%s   %s   %s' % (position, fnames[position], res.score))

        if result:
            return None
        return self._NO_RESULTS_MESSAGE

    def saveToFile(self, isBinaryWord):
        """Pickle the engine to the file matching ``isBinaryWord``."""
        # Binary mode keeps the pickle stream free of newline translation,
        # and the context manager closes the handle even if dump() fails.
        with open(self._engine_path(isBinaryWord), 'wb') as fileObject:
            pickle.dump(self.engine, fileObject)

    def loadFromeFile(self, isBinaryWord=False):
        """Replace ``self.engine`` with the one pickled for ``isBinaryWord``."""
        # SECURITY: unpickling can execute arbitrary code; only load engine
        # files that this program wrote itself.
        with open(self._engine_path(isBinaryWord), 'rb') as fileObject:
            self.engine = pickle.load(fileObject)

    def words_count(self):
        """Return the count reported by the underlying engine.

        Returns:
            int: Number of words indexed.
        """
        return self.engine.count()
class SearchEngineTests(unittest.TestCase):
    """Checks counting, searching and result limiting of SearchEngine."""

    # Metadata of the sample documents; ids are their 1-based positions.
    _SAMPLE_METADATA = (
        'this is an indexable metadata',
        'this is an indexable super metadata',
        'this is another indexable metadata',
    )

    def setUp(self):
        """Start each test from a brand-new SearchEngine."""
        self.engine = SearchEngine()

    def _load_samples(self):
        """Create the sample Indexables, index them and return the list."""
        docs = [
            Indexable(iid, text)
            for iid, text in enumerate(self._SAMPLE_METADATA, 1)
        ]
        self.build_sample_index(docs)
        return docs

    def test_indexed_doc_count(self):
        """Three indexed objects must give a count of three."""
        self._load_samples()
        self.assertEqual(self.engine.count(), 3)

    def test_existent_term_search(self):
        """Known terms return every sample with its expected score."""
        docs = self._load_samples()
        scores = (1.414214, 0.906589, 0.906589)
        expected_results = [
            IndexableResult(score, doc) for score, doc in zip(scores, docs)
        ]
        self.assertListEqual(
            self.engine.search('indexable metadata'), expected_results)

    def test_non_existent_term_search(self):
        """An unknown term returns no results at all."""
        self._load_samples()
        expected_results = []
        self.assertListEqual(
            self.engine.search('asdasdasdas'), expected_results)

    def test_search_result_limit(self):
        """Limiting to one result returns only the first sample's result."""
        docs = self._load_samples()
        expected_results = [IndexableResult(1.414214, docs[0])]
        self.assertListEqual(
            self.engine.search('indexable metadata', 1), expected_results)

    def build_sample_index(self, objects):
        """Feed the objects to the engine one by one and start it."""
        for sample in objects:
            self.engine.add_object(sample)
        self.engine.start()