Ejemplo n.º 1
0
 def setUp(self):
     """Build a fresh ``Indexer`` and index every entry of ``self.texts``.

     Each text is stripped of surrounding whitespace and wrapped in a
     ``BagOfWords`` with stemming and stop-word filtering switched off
     before being handed to the indexer.
     """
     self.indexer = Indexer()
     for raw_text in self.texts:
         self.indexer.index(BagOfWords(raw_text.strip(),
                                       enable_stemming=False,
                                       filter_stopwords=False))
Ejemplo n.º 2
0
    def test_load(self):
        """Check that ``Indexer.load`` restores the expected index.

        ``self.expected`` is serialised as JSON into an in-memory stream,
        loaded into a new indexer, and the indexer's ``to_dict()`` output is
        compared against the expected ``docs_index`` and ``terms_index``.
        """
        indexer = Indexer()
        expected = self.expected

        stream = StringIO()
        json.dump(expected, stream)
        stream.seek(0)  # rewind so load() reads from the start

        indexer.load(stream)
        actual = indexer.to_dict()

        self.assertSequenceEqual(expected["docs_index"], actual["docs_index"])
        self.assertDictEqual(expected["terms_index"], actual["terms_index"])
Ejemplo n.º 3
0
def test_create_index(dictdoc):
    """Index ``dictdoc`` with an ``Indexer`` built from the global ``index_name``.

    Prints a start marker, passes ``dictdoc`` to ``Indexer.indexDict``,
    commits, closes the indexer and prints an end marker.

    The indexer is closed even when indexing or committing raises; the
    exception still propagates and the end marker is not printed.
    """
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print('start')
    indexer = Indexer(index_name)
    try:
        indexer.indexDict(dictdoc)
        indexer.commit()
    finally:
        # Always release the indexer, including on failure.
        indexer.close()
    print('end create index')
Ejemplo n.º 4
0
def delete_index(dictdoc):
    """Run ``Indexer.indexDict_delete`` on ``dictdoc`` for the global ``index_name``.

    Prints a start marker, passes ``dictdoc`` to ``indexDict_delete``,
    commits, closes the indexer and prints an end marker.

    The indexer is closed even when the delete or the commit raises; the
    exception still propagates and the end marker is not printed.
    """
    # Parenthesised single-argument print works on Python 2 and Python 3.
    print('start delete index')
    indexer = Indexer(index_name)
    try:
        indexer.indexDict_delete(dictdoc)
        indexer.commit()
    finally:
        # Always release the indexer, including on failure.
        indexer.close()
    # The message previously read 'end create indexz' (copy-paste slip).
    print('end delete index')
Ejemplo n.º 5
0
    def test_index_creation(self):
        """Index ``self.texts`` and compare ``to_dict()`` with ``self.expected``.

        Every text is stripped and indexed as a ``BagOfWords`` with stemming
        and stop-word filtering disabled; the resulting ``docs_index`` and
        ``terms_index`` must match the expected ones.
        """
        # unittest: None removes the length limit on failure diffs.
        self.maxDiff = None

        indexer = Indexer()
        for raw_text in self.texts:
            indexer.index(BagOfWords(raw_text.strip(),
                                     enable_stemming=False,
                                     filter_stopwords=False))
        actual = indexer.to_dict()
        expected = self.expected

        self.assertSequenceEqual(expected["docs_index"], actual["docs_index"])
        self.assertDictEqual(expected["terms_index"], actual["terms_index"])
Ejemplo n.º 6
0
def main(args):
    """Load a saved index and print search results for each query line.

    The index is read from ``args.index`` + ``".json.gz"`` through
    ``gzip.open`` when ``args.zip`` is truthy, otherwise from
    ``args.index`` + ``".json"`` through ``io.open``.  Every non-empty,
    stripped line of the UTF-8 file ``args.texts`` is turned into a
    ``BagOfWords`` query and passed to ``Indexer.search`` together with
    ``args.limit``; each result is printed truncated to ``args.show``
    characters alongside its score.

    Returns:
        0, always.
    """
    indexer = Indexer()
    if args.zip:
        open_func, index_ext = gzip.open, ".json.gz"
    else:
        open_func, index_ext = io.open, ".json"
    with open_func(args.index + index_ext) as index_file:
        indexer.load(index_file)

    opening_rule = ">" * 80
    closing_rule = "<" * 80
    with io.open(args.texts, encoding="utf-8") as queries:
        for raw_line in queries:
            query = raw_line.strip()
            if not query:
                # Blank lines carry no query.
                continue
            bag = BagOfWords(query, filter_stopwords=False)
            print(opening_rule)
            print("Query:\n    {}".format(bag.text))
            print(opening_rule)
            for result, score in indexer.search(bag, args.limit):
                snippet = result[:args.show].encode("utf-8")
                print("{}:\n    {}".format(score, snippet))
            print(closing_rule)
            print("\n\n")
    return 0
Ejemplo n.º 7
0
def main(args):
    """Index every line of ``args.texts`` and dump the index to disk.

    Each line of the UTF-8 file ``args.texts`` is stripped, wrapped in a
    ``BagOfWords`` (stop-word filtering disabled) and indexed.  The index
    is then written with ``Indexer.dump`` to ``args.index`` + ``".json.gz"``
    through ``gzip.open`` when ``args.zip`` is truthy, otherwise to
    ``args.index`` + ``".json"`` through ``io.open``; the output file is
    opened in ``"wb"`` mode in both cases.

    Returns:
        0, always.
    """
    indexer = Indexer()

    with io.open(args.texts, encoding="utf-8") as source:
        for raw_line in source:
            indexer.index(BagOfWords(raw_line.strip(), filter_stopwords=False))

    if args.zip:
        opener, suffix = gzip.open, ".json.gz"
    else:
        opener, suffix = io.open, ".json"
    with opener(args.index + suffix, mode="wb") as sink:
        indexer.dump(sink)
    return 0
Ejemplo n.º 8
0
    def test_dump(self):
        """Check that ``Indexer.dump`` writes JSON matching ``self.expected``.

        ``self.texts`` is indexed (stemming and stop-word filtering off), the
        index is dumped into an in-memory stream, and the JSON read back from
        that stream must carry the expected ``docs_index`` and
        ``terms_index``.
        """
        indexer = Indexer()
        for raw_text in self.texts:
            indexer.index(BagOfWords(raw_text.strip(),
                                     enable_stemming=False,
                                     filter_stopwords=False))

        stream = StringIO()
        indexer.dump(stream)
        stream.seek(0)  # rewind before parsing what was just written
        actual = json.load(stream)
        expected = self.expected

        self.assertSequenceEqual(expected["docs_index"], actual["docs_index"])
        self.assertDictEqual(expected["terms_index"], actual["terms_index"])
Ejemplo n.º 9
0
class TestSearch(unittest.TestCase):
    """
    This test uses the following example as its model:
    https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/
    """

    texts = [
        "The game of life is a game of everlasting learning",
        "The unexamined life is not worth living",
        "Never stop learning",
    ]
    indexer = None

    @staticmethod
    def _make_bag(text):
        """Wrap ``text`` in a BagOfWords with stemming and stop-words off."""
        return BagOfWords(text,
                          enable_stemming=False,
                          filter_stopwords=False)

    def _ranked_texts(self, query):
        """Search for ``query`` and return the hit texts, best score first.

        Scores are rounded to 5 decimals before sorting; the sort is stable,
        so hits with equal rounded scores keep the order in which
        ``search`` produced them.
        """
        hits = [(text, round(score, 5))
                for text, score in self.indexer.search(self._make_bag(query),
                                                       10)]
        hits.sort(key=lambda hit: hit[1], reverse=True)
        return [text for text, _ in hits]

    def setUp(self):
        # Each test gets its own index built from the class-level texts.
        self.indexer = Indexer()
        for raw_text in self.texts:
            self.indexer.index(self._make_bag(raw_text.strip()))

    def test_search_1(self):
        ranking = self._ranked_texts("life learning")

        self.assertEqual(len(ranking), 3)
        self.assertEqual(ranking[0], self.texts[0])
        self.assertEqual(ranking[1], self.texts[1])
        self.assertEqual(ranking[2], self.texts[2])

    def test_search_2(self):
        ranking = self._ranked_texts("learning")

        self.assertEqual(len(ranking), 2)
        self.assertEqual(ranking[0], self.texts[0])
        self.assertEqual(ranking[1], self.texts[2])

    def test_search_3(self):
        # The query includes a term ("ñu") that none of the texts contain.
        ranking = self._ranked_texts("ñu life")

        self.assertEqual(len(ranking), 2)
        self.assertEqual(ranking[0], self.texts[1])
        self.assertEqual(ranking[1], self.texts[0])

    def test_search_4(self):
        # Same terms as test_search_3, in the opposite order.
        ranking = self._ranked_texts("life ñu")

        self.assertEqual(len(ranking), 2)
        self.assertEqual(ranking[0], self.texts[1])
        self.assertEqual(ranking[1], self.texts[0])

    def test_search_5(self):
        # Neither query term occurs in any text.
        ranking = self._ranked_texts("foo bar")

        self.assertEqual(len(ranking), 0)
Ejemplo n.º 10
0
def create_index(index_name, dictdoc):
    """Index ``dictdoc`` with an ``Indexer`` built from ``index_name``.

    Passes ``dictdoc`` to ``Indexer.indexDict``, commits, and closes the
    indexer.  The indexer is closed even when indexing or committing
    raises; the exception still propagates to the caller.

    Args:
        index_name: Value passed to the ``Indexer`` constructor.
        dictdoc: Value passed to ``Indexer.indexDict``.
    """
    indexer = Indexer(index_name)
    try:
        indexer.indexDict(dictdoc)
        indexer.commit()
    finally:
        # Always release the indexer, including on failure.
        indexer.close()
Ejemplo n.º 11
0
def delete_index(index_name, dictdoc):
    """Run ``Indexer.indexDict_delete`` on ``dictdoc`` for ``index_name``.

    Passes ``dictdoc`` to ``indexDict_delete``, commits, and closes the
    indexer.  The indexer is closed even when the delete or the commit
    raises; the exception still propagates to the caller.

    Args:
        index_name: Value passed to the ``Indexer`` constructor.
        dictdoc: Value passed to ``Indexer.indexDict_delete``.
    """
    indexer = Indexer(index_name)
    try:
        indexer.indexDict_delete(dictdoc)
        indexer.commit()
    finally:
        # Always release the indexer, including on failure.
        indexer.close()