예제 #1
0
파일: search.py 프로젝트: saulcasti/SIW
def main(args):
    """Load a stored index and run each non-blank line of a text file as a query.

    Attributes read from ``args``:
        zip:   truthy to read the index through ``gzip.open`` from
               ``<index>.json.gz``; otherwise ``io.open`` on ``<index>.json``.
        index: index path without its extension.
        texts: path of the query file, read as UTF-8, one query per line.
        limit: forwarded as the second argument of ``indexer.search``.
        show:  upper slice bound applied to each result before printing.

    Returns:
        Always ``0``.
    """
    indexer = Indexer()

    # Pick the opener and the matching file extension in a single branch.
    if args.zip:
        opener, extension = gzip.open, ".json.gz"
    else:
        opener, extension = io.open, ".json"

    with opener(args.index + extension) as index_file:
        indexer.load(index_file)

    query_banner = ">" * 80
    closing_banner = "<" * 80

    with io.open(args.texts, encoding="utf-8") as queries:
        for raw_line in queries:
            query = raw_line.strip()
            if not query:
                # Blank lines are not queries.
                continue

            bag = BagOfWords(query, filter_stopwords=False)
            print(query_banner)
            print("Query:\n    {}".format(bag.text))
            print(query_banner)

            for result, score in indexer.search(bag, args.limit):
                # NOTE(review): if `result` is a text string, on Python 3 the
                # encoded value is formatted as a bytes repr (b'...') --
                # confirm which interpreter this script targets.
                snippet = result[0:args.show].encode("utf-8")
                print("{}:\n    {}".format(score, snippet))

            print(closing_banner)
            print("\n\n")

    return 0
예제 #2
0
class TestSearch(unittest.TestCase):
    """
    Search tests that use the following example as their model:
    https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/
    """

    # Corpus indexed before every test; expectations refer to it by position.
    texts = [
        "The game of life is a game of everlasting learning",
        "The unexamined life is not worth living",
        "Never stop learning",
    ]
    indexer = None

    def setUp(self):
        """Build a fresh index containing every entry of ``texts``."""
        self.indexer = Indexer()
        for raw in self.texts:
            self.indexer.index(self._make_bag(raw.strip()))

    @staticmethod
    def _make_bag(text):
        """Wrap ``text`` in a bag with stemming and stopword filtering off."""
        return BagOfWords(text,
                          enable_stemming=False,
                          filter_stopwords=False)

    def _ranked_hits(self, query):
        """Search for ``query`` and return (text, rounded score) pairs, best first."""
        hits = [(text, round(score, 5))
                for text, score in self.indexer.search(self._make_bag(query), 10)]
        # Stable descending sort on the rounded score, so ties keep search order.
        hits.sort(key=lambda hit: hit[1], reverse=True)
        return hits

    def _assert_ranking(self, query, expected_positions):
        """Check that ``query`` returns exactly ``texts[i]`` for each i, in order."""
        hits = self._ranked_hits(query)
        self.assertEqual(len(hits), len(expected_positions))
        for (found_text, _), position in zip(hits, expected_positions):
            self.assertEqual(found_text, self.texts[position])

    def test_search_1(self):
        """Two query terms: all three documents are returned."""
        self._assert_ranking("life learning", [0, 1, 2])

    def test_search_2(self):
        """Single query term: only the two documents containing it match."""
        self._assert_ranking("learning", [0, 2])

    def test_search_3(self):
        """A non-ASCII term absent from the corpus, followed by a known term."""
        self._assert_ranking("ñu life", [1, 0])

    def test_search_4(self):
        """Same terms as test_search_3 in the opposite order; same ranking."""
        self._assert_ranking("life ñu", [1, 0])

    def test_search_5(self):
        """Terms absent from every document produce no results."""
        self._assert_ranking("foo bar", [])