コード例 #1
0
class BookInventory(object):
    """Catalog of books that can be queried through a search engine.

    Args:
      filename (str): Path of the catalog file, read one entry per line.

    Attributes:
      filename (str): Path of the catalog file.
      engine (SearchEngine): Engine that receives every loaded book.

    """

    # Positions of the fields inside a preprocessed catalog entry.
    _BOOK_META_ID_INDEX = 0
    _BOOK_META_TITLE_INDEX = 1  # original annotation: "question"
    _BOOK_META_AUTHOR_INDEX = 2  # original annotation: "answer"
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    @timed
    def load_books(self):
        """Read the catalog line by line and hand each book to the engine.

        Every line goes through ``BookDataPreprocessor.preprocess``; the
        fields from the title position onwards are joined with spaces to
        form the book metadata. ``engine.start()`` is called once the whole
        file has been consumed.

        """
        preprocessor = BookDataPreprocessor()
        with open(self.filename) as catalog:
            for line in catalog:
                fields = preprocessor.preprocess(line)
                metadata = ' '.join(fields[self._BOOK_META_TITLE_INDEX:])
                book = Book(
                    fields[self._BOOK_META_ID_INDEX].strip(),
                    fields[self._BOOK_META_TITLE_INDEX].strip(),
                    fields[self._BOOK_META_AUTHOR_INDEX].strip(),
                    metadata,
                )
                self.engine.add_object(book)

        self.engine.start()

    @timed
    def search_books(self, query, n_results=10):
        """Run ``query`` against the engine and format the hits as text.

        Args:
          query (str): Query string with one or more terms.
          n_results (int): Desired number of results.

        Returns:
          str: One line per hit (``str()`` of each result), or the
            no-results message when the query or the result set is empty.

        """
        if len(query) == 0:
            return self._NO_RESULTS_MESSAGE

        hits = self.engine.search(query, n_results)
        if len(hits) == 0:
            return self._NO_RESULTS_MESSAGE
        return '\n'.join(str(hit) for hit in hits)

    def books_count(self):
        """Return whatever ``engine.count()`` reports."""
        return self.engine.count()
コード例 #2
0
class wordInventory(object):
    """Inventory of text documents indexed by a search engine.

    Args:
      filename (str): Directory whose files are loaded as documents.

    Attributes:
      filename (str): Directory whose files are loaded as documents.
      engine (SearchEngine): Engine that indexes the document contents.
      engine2 (SearchEngine): Second engine instance. No method of this
        class uses it; it is kept so existing attribute access still works.

    """
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'
    _ENGINE_DUMP_FILE = 'test.engine'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()
        self.engine2 = SearchEngine()

    @timed
    def load_words(self):
        """Index every file found under ``self.filename`` and persist the engine.

        Each file is read completely and wrapped in a ``Word`` whose id is a
        1-based counter following ``os.walk`` order. After all documents are
        added the engine is started and pickled via ``saveToFile``.

        """
        logger.info('Loading words from file...')
        # Build each path from the directory os.walk is visiting instead of a
        # hard-coded './Reuters/' prefix, so the files that were listed are
        # the files that get opened.
        paths = [os.path.join(parent, fname)
                 for parent, _dirnames, fnames in os.walk(self.filename)
                 for fname in fnames]

        for iid, path in enumerate(paths, 1):
            # Context manager closes every handle; the corpus may hold
            # thousands of files.
            with open(path) as document:
                word = document.read()
            self.engine.add_object(Word(iid, word))

        self.engine.start()
        self.saveToFile()

    @timed
    def search_words(self, query, n_results=10):
        """Search the indexed documents and print the raw result.

        Args:
          query (str): Query string with one or more terms.
          n_results (int): Desired number of results.

        Returns:
          None when the engine returned at least one result (the result
          object is printed to stdout instead of being returned); otherwise
          the no-results message (str).

        """
        if not query:
            return self._NO_RESULTS_MESSAGE

        result = self.engine.search(query, n_results)
        # Single-argument print() behaves the same on Python 2 and 3.
        print(result)

        if len(result) > 0:
            return None
        return self._NO_RESULTS_MESSAGE

    def saveToFile(self):
        """Pickle ``self.engine`` into ``test.engine`` in the working directory."""
        # Binary mode is what pickle expects, and the context manager
        # guarantees the data is flushed and the handle closed.
        with open(self._ENGINE_DUMP_FILE, 'wb') as engine_file:
            pickle.dump(self.engine, engine_file)

    def words_count(self):
        """Return the number of indexed documents as reported by the engine.

        Returns:
          The value of ``self.engine.count()``.
        """
        return self.engine.count()
コード例 #3
0
ファイル: book.py プロジェクト: ahaldar/simple-search-engine
class BookInventory(object):
    """Searchable inventory of books loaded from a catalog file.

    Args:
      filename (str): File name containing book inventory data.

    Attributes:
      filename (str): File name containing book inventory data.
      engine (SearchEngine): Object that receives and searches the books.

    """

    # Positions of the fields inside a preprocessed catalog entry.
    _BOOK_META_ID_INDEX = 0
    _BOOK_META_TITLE_INDEX = 1
    _BOOK_META_AUTHOR_INDEX = 2
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    @timed
    def load_books(self):
        """Load books from the catalog file into the engine.

        The file object is iterated directly, so entries are consumed one
        line at a time rather than read into memory at once. Each line is
        preprocessed, turned into a ``Book`` and added to the engine, which
        is started after the last entry.

        """
        logger.info('Loading books from file...')
        preprocessor = BookDataPreprocessor()
        with open(self.filename) as catalog:
            for line in catalog:
                fields = preprocessor.preprocess(line)
                # Metadata is everything from the title field onwards.
                metadata = ' '.join(fields[self._BOOK_META_TITLE_INDEX:])
                book = Book(
                    fields[self._BOOK_META_ID_INDEX].strip(),
                    fields[self._BOOK_META_TITLE_INDEX].strip(),
                    fields[self._BOOK_META_AUTHOR_INDEX].strip(),
                    metadata,
                )
                self.engine.add_object(book)

        self.engine.start()

    @timed
    def search_books(self, query, n_results=10):
        """Search books matching the provided query terms.

        Args:
          query (str): Query string with one or more terms.
          n_results (int): Desired number of results.

        Returns:
          str: The ``str()`` form of every result joined by newlines, or
            the no-results message when the query is empty or nothing
            matched.

        """
        if len(query) == 0:
            return self._NO_RESULTS_MESSAGE

        hits = self.engine.search(query, n_results)
        if len(hits) == 0:
            return self._NO_RESULTS_MESSAGE
        return '\n'.join(str(hit) for hit in hits)

    def books_count(self):
        """Return the number of books as reported by ``engine.count()``.

        Returns:
          The value returned by the engine's ``count`` method.

        """
        return self.engine.count()
コード例 #4
0
class SearchEngineTests(unittest.TestCase):
    """
    Unit tests exercising the SearchEngine class.
    """

    def setUp(self):
        """
        Create a fresh search engine before each test.
        """
        self.engine = SearchEngine()

    def _index_samples(self):
        """
        Build the three sample documents shared by all tests, index them
        and return them in id order.
        """
        samples = [
            Indexable(1, "this is an indexable metadata"),
            Indexable(2, "this is an indexable super metadata"),
            Indexable(3, "this is another indexable metadata"),
        ]
        self.build_sample_index(samples)
        return samples

    def test_indexed_doc_count(self):
        """
        The engine reports how many objects were indexed.
        """
        self._index_samples()
        self.assertEqual(self.engine.count(), 3)

    def test_existent_term_search(self):
        """
        Searching for indexed terms returns every document with its score.
        """
        first, second, third = self._index_samples()

        expected_results = [
            IndexableResult(1.414214, first),
            IndexableResult(0.906589, second),
            IndexableResult(0.906589, third),
        ]

        self.assertListEqual(self.engine.search("indexable metadata"),
                             expected_results)

    def test_non_existent_term_search(self):
        """
        Searching for an unknown term returns an empty list.
        """
        self._index_samples()

        self.assertListEqual(self.engine.search("asdasdasdas"), [])

    def test_search_result_limit(self):
        """
        The second argument of search limits the number of results.
        """
        first = self._index_samples()[0]

        expected_results = [
            IndexableResult(1.414214, first),
        ]

        self.assertListEqual(self.engine.search("indexable metadata", 1),
                             expected_results)

    def build_sample_index(self, objects):
        """
        Add every object to the engine, then start it.
        """
        for sample in objects:
            self.engine.add_object(sample)
        self.engine.start()
コード例 #5
0
class wordInventory(object):
    """Inventory of text documents indexed by a search engine.

    Args:
      filename (str): Directory whose files are loaded as documents.

    Attributes:
      filename (str): Directory whose files are loaded as documents.
      engine (SearchEngine): Engine that indexes the document contents.

    """
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'
    _BINARY_ENGINE_FILE = 'test.engine'
    _DEFAULT_ENGINE_FILE = 'test_noBinary.engine'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    def _document_paths(self):
        """Return the path of every file under ``self.filename`` in os.walk order."""
        return [os.path.join(parent, fname)
                for parent, _dirnames, fnames in os.walk(self.filename)
                for fname in fnames]

    def _engine_path(self, isBinaryWord):
        """Return the pickle file name that matches the ``isBinaryWord`` flag."""
        if isBinaryWord:
            return self._BINARY_ENGINE_FILE
        return self._DEFAULT_ENGINE_FILE

    @timed
    def init_engine(self, isFromFile=True, isBinaryWord=False):
        """Prepare the engine, either from a pickle file or from the corpus.

        Args:
          isFromFile (bool): When true, restore a previously pickled engine
            via ``loadFromeFile``. Otherwise read every file under
            ``self.filename``, index it, start the engine and pickle it via
            ``saveToFile``.
          isBinaryWord (bool): Forwarded unchanged to ``Word``,
            ``engine.start`` and the save/load helpers, where it selects
            which pickle file is used.

        """
        if isFromFile:
            self.loadFromeFile(isBinaryWord)
            return

        logger.info('Loading words from file...')
        # Document ids are 1-based positions in os.walk order; search_words
        # relies on the same ordering to map an id back to a file name.
        for iid, path in enumerate(self._document_paths(), 1):
            # Paths come from the walked directory (not a hard-coded
            # './Reuters/' prefix) and every handle is closed after reading.
            with open(path) as document:
                word = document.read()
            self.engine.add_object(Word(iid, word, isBinaryWord))

        self.engine.start(isBinaryWord)
        self.saveToFile(isBinaryWord)

    @timed
    def search_words(self, query, n_results=10, choice=2, SYSNONYM=False):
        """Search the indexed documents and print one line per result.

        Args:
          query (str): Query string with one or more terms.
          n_results (int): Desired number of results.
          choice (int): 1 calls ``engine.search_bool`` and uses each result
            as an index into the file list; 2 calls ``engine.search`` and
            prints index, file name and score. Any other value performs no
            search.
          SYSNONYM (bool): Forwarded unchanged to the engine search call.

        Returns:
          None when at least one result was found (results are printed to
          stdout, not returned); otherwise the no-results message (str).

        """
        result = ''
        if query:
            # Same flattened os.walk ordering that init_engine uses to
            # assign ids, so (iid - 1) indexes the matching file name.
            fnames = [os.path.basename(path)
                      for path in self._document_paths()]
            if choice == 1:
                result = self.engine.search_bool(query, n_results, SYSNONYM)
                for res in result:
                    # Single-string print() works on Python 2 and 3 and
                    # keeps the three-space separator of the old output.
                    print('{0}   {1}'.format(res, fnames[res]))
            elif choice == 2:
                result = self.engine.search(query, n_results, SYSNONYM)
                for res in result:
                    doc_index = res.indexable.iid - 1
                    print('{0}   {1}   {2}'.format(
                        doc_index, fnames[doc_index], res.score))

        if len(result) > 0:
            return None
        return self._NO_RESULTS_MESSAGE

    def saveToFile(self, isBinaryWord):
        """Pickle ``self.engine`` into the file selected by ``isBinaryWord``."""
        # Binary mode is what pickle expects, and the context manager
        # guarantees the data is flushed and the handle closed.
        with open(self._engine_path(isBinaryWord), 'wb') as engine_file:
            pickle.dump(self.engine, engine_file)

    def loadFromeFile(self, isBinaryWord=False):
        """Replace ``self.engine`` with the engine pickled by ``saveToFile``."""
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # engine files that this program wrote itself.
        with open(self._engine_path(isBinaryWord), 'rb') as engine_file:
            self.engine = pickle.load(engine_file)

    def words_count(self):
        """Return the number of indexed documents as reported by the engine.

        Returns:
          The value of ``self.engine.count()``.
        """
        return self.engine.count()
コード例 #6
0
class BookInventory(object):
    """Inventory of books that can be loaded from a file and searched.

    Args:
      filename (str): File name containing book inventory data.

    Attributes:
      filename (str): File name containing book inventory data.
      engine (SearchEngine): Object that receives and searches the books.

    """

    # Field positions within a preprocessed catalog entry.
    _BOOK_META_ID_INDEX = 0
    _BOOK_META_TITLE_INDEX = 1
    _BOOK_META_AUTHOR_INDEX = 2
    _NO_RESULTS_MESSAGE = 'Sorry, no results.'

    def __init__(self, filename):
        self.filename = filename
        self.engine = SearchEngine()

    @timed
    def load_books(self):
        """Populate the engine with the books listed in the catalog file.

        The open file is iterated line by line. Every line is preprocessed
        into a list of fields, converted into a ``Book`` and added to the
        engine; the engine is started once the file is exhausted.

        """
        logger.info('Loading books from file...')
        id_pos = self._BOOK_META_ID_INDEX
        title_pos = self._BOOK_META_TITLE_INDEX
        author_pos = self._BOOK_META_AUTHOR_INDEX

        preprocessor = BookDataPreprocessor()
        with open(self.filename) as catalog:
            for raw_entry in catalog:
                fields = preprocessor.preprocess(raw_entry)
                # Metadata spans the title field and everything after it.
                metadata = ' '.join(fields[title_pos:])
                book = Book(fields[id_pos].strip(),
                            fields[title_pos].strip(),
                            fields[author_pos].strip(),
                            metadata)
                self.engine.add_object(book)

        self.engine.start()

    @timed
    def search_books(self, query, n_results=10):
        """Search the indexed books for the provided query terms.

        Args:
          query (str): Query string with one or more terms.
          n_results (int): Desired number of results.

        Returns:
          str: Newline-separated ``str()`` of each result, or the
            no-results message when the query is empty or nothing matched.

        """
        matches = self.engine.search(query, n_results) if len(query) > 0 else ''
        if len(matches) > 0:
            return '\n'.join(map(str, matches))
        return self._NO_RESULTS_MESSAGE

    def books_count(self):
        """Return the number of books as reported by ``engine.count()``.

        Returns:
          The value returned by the engine's ``count`` method.

        """
        return self.engine.count()
コード例 #7
0
class SearchEngineTests(unittest.TestCase):
    """
    Unit tests exercising the SearchEngine class.
    """

    # Metadata of the sample documents; ids are assigned 1, 2, 3 in order.
    _SAMPLE_TEXTS = (
        'this is an indexable metadata',
        'this is an indexable super metadata',
        'this is another indexable metadata',
    )

    def setUp(self):
        """
        Create a fresh search engine before each test.
        """
        self.engine = SearchEngine()

    def _indexed_samples(self):
        """
        Create the sample documents, index them and return them in id order.
        """
        samples = [Indexable(iid, text)
                   for iid, text in enumerate(self._SAMPLE_TEXTS, 1)]
        self.build_sample_index(samples)
        return samples

    def test_indexed_doc_count(self):
        """
        The engine reports how many objects were indexed.
        """
        self._indexed_samples()
        self.assertEqual(self.engine.count(), 3)

    def test_existent_term_search(self):
        """
        Searching for indexed terms returns every document with its score.
        """
        samples = self._indexed_samples()
        scores = (1.414214, 0.906589, 0.906589)
        expected_results = [IndexableResult(score, sample)
                            for score, sample in zip(scores, samples)]

        results = self.engine.search('indexable metadata')
        self.assertListEqual(results, expected_results)

    def test_non_existent_term_search(self):
        """
        Searching for an unknown term returns an empty list.
        """
        self._indexed_samples()

        results = self.engine.search('asdasdasdas')
        self.assertListEqual(results, [])

    def test_search_result_limit(self):
        """
        The second argument of search limits the number of results.
        """
        samples = self._indexed_samples()
        expected_results = [IndexableResult(1.414214, samples[0])]

        results = self.engine.search('indexable metadata', 1)
        self.assertListEqual(results, expected_results)

    def build_sample_index(self, objects):
        """
        Add every object to the engine, then start it.
        """
        for sample in objects:
            self.engine.add_object(sample)
        self.engine.start()