Example #1
from flask import Flask, request, render_template
from searchEngine import SearchEngine

app = Flask(__name__)

@app.route('/search', methods=['GET', 'POST'])  # route path assumed; not shown in the original snippet
def search():
    # On a form submission, run the query through the engine and render the results.
    if request.method == 'POST':
        searchTerm = request.form['searchTermInput']
        engine = SearchEngine()
        resultList = engine.search(searchTerm.lower())
        return render_template("search.html", resultList=resultList, listSize=len(resultList))
    return render_template("search.html")
Example #2
        # Take the numeric file id from the .csv filename, then bulk-index the
        # generated documents with the Elasticsearch helpers.
        filename = int(os.path.split(file)[1].split(".csv")[0])
        helpers.bulk(es, genData(snippets, filename))

    # Interactive query loop: the user types "<query type>, <query text>".
    while True:

        print("\n\n0. Standard query")
        print("1. Allows positional indexing")
        print("2. Allows wildcard terms")
        print("3. Allows both wildcards and positional indexing")
        print("Type a query and mention the type of query. Ex - \"0, Standard query\" ")
        print("Ctrl + D to exit")

        # Split only on the first comma so the query text itself may contain commas.
        queryType, query = [w.strip() for w in input().split(',', 1)]

        start = time.time()
        res = engine.search(query, int(queryType))
        end = time.time()

        # Shape the results like an Elasticsearch response: query time, hit count, hits.
        modified_output = {"took": end - start, "total": len(res), "hits": []}
        for doc_q in res:
            for doc in doc_q:
                # Map each hit back to its row in the source CSV file.
                res_file = os.path.join(dataPath, str(doc[0][0]) + ".csv")
                with open(res_file) as fd:
                    reader = csv.DictReader(fd)
                    
                    # Walk the CSV to the row this hit points at.
                    for row_no, row in enumerate(reader):
                        if row_no == doc[0][1] - 2:
                            current_row = row
                            break
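                # Hypothetical continuation (the original snippet is cut off here):
                # attach the matched CSV row to the Elasticsearch-style response.
                modified_output["hits"].append(current_row)

        print(modified_output)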
# test search engine methods

from searchEngine import SearchEngine
se = SearchEngine("index")

print(se.search("cristina lopes", 5))
print(se.search("machine learning", 5))
print(se.search("ACM", 5))
print(se.search("master of software engineering", 5))

# import pickle
#
# file = open("pIndex1.pkl", "rb")
# d = pickle.load(file)
# file.close()
#
# print(len(d))

# for k, v in d.items():
#     #print(k,v)
#     print(k, ": ", v)

# LINKED LIST OR SET OF POSTINGS?
# IF YOU USE A SET, YOU NEED TO IMPLEMENT __EQ__, __HASH__, ETC.
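# A minimal sketch of that (illustrative names, not this project's classes):
# a hashable Posting so postings can be stored in a set.
class Posting:
    def __init__(self, doc_id, position):
        self.doc_id = doc_id
        self.position = position

    def __eq__(self, other):
        if not isinstance(other, Posting):
            return NotImplemented
        return (self.doc_id, self.position) == (other.doc_id, other.position)

    def __hash__(self):
        return hash((self.doc_id, self.position))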

# USE STEMMING TO CUT DOWN ON # OF ENTRIES IN INDICES
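# For example (a sketch; NLTK's PorterStemmer is an assumption, not something
# the original code uses): different surface forms collapse to one entry.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
print([stemmer.stem(w) for w in ["index", "indexing", "indexed"]])  # all collapse to 'index'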

# MERGING STRATEGY
# have an index for every letter
# create a partial index
# go through that sorted index and load to memory each letter
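# Rough sketch of that merging strategy (all names below are illustrative):
# dump one sorted partial index per run, then merge a single letter at a time
# so only a small slice of the full index is in memory at once.
import pickle
from collections import defaultdict

def dump_partial_index(index, path):
    # index is {term: [postings]}; store it sorted so letters can be scanned in order
    with open(path, "wb") as f:
        pickle.dump(dict(sorted(index.items())), f)

def merge_by_letter(partial_paths, letter):
    # Load only the terms starting with `letter` from every partial index.
    merged = defaultdict(list)
    for path in partial_paths:
        with open(path, "rb") as f:
            partial = pickle.load(f)
        for term, postings in partial.items():
            if term.startswith(letter):
                merged[term].extend(postings)
    return merged

# e.g. index_a = merge_by_letter(["pIndex1.pkl", "pIndex2.pkl"], "a")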
import os
import unittest

import indexator
import searchEngine
from indexator import Indexator
from searchEngine import SearchEngine


class Test(unittest.TestCase):
    def setUp(self):
        with open('test0.txt', 'w') as f:
            f.write('All we need is,\n all we need is,\n all we need is')
        with open('test1.txt', 'w') as f:
            f.write('Blood, blood,\n blood')
        with open('test2.txt', 'w') as f:
            f.write('All we need is, all we need is,\n all we need is')
        with open('test.txt', 'w') as f:
            f.write('All we need is, all we need is, all we need is')
        with open('testtest.txt', 'w') as f:
            f.write('Blood, blood, blood')
        with open('testtesttest.txt', 'w') as f:
            f.write('All we need is, all we need is,\n all we need is')
        with open('testSentence.txt', 'w') as f:
            f.write(
                'What do we need? All we need is blood. Pain pain pain pain')
        indexer = Indexator('TestDatabase')
        indexer.indexize('test0.txt')
        indexer.indexize('test1.txt')
        indexer.indexize('test2.txt')
        self.searchEngine = SearchEngine("TestDatabase")

    # unittests for search
    def test_input_type_number(self):
        with self.assertRaises(ValueError):
            self.searchEngine.search(13)

    def test_input_type_not_exists(self):
        self.assertEqual(self.searchEngine.search('вискас'), {})

    def test_we(self):
        expected = {
            'test0.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(5, 7, 2),
                indexator.Position(5, 7, 3)
            ],
            'test2.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(20, 22, 1),
                indexator.Position(5, 7, 2)
            ]
        }
        self.assertEqual(self.searchEngine.search('we'), expected)

    def test_blood(self):
        expected = {
            'test1.txt':
            [indexator.Position(7, 12, 1),
             indexator.Position(1, 6, 2)]
        }
        self.assertEqual(self.searchEngine.search("blood"), expected)

    # unittests for searchQuery
    def test_query_input_type_number(self):
        with self.assertRaises(ValueError):
            self.searchEngine.searchQuery(13)

    def test_query_input_type_not_exists(self):
        self.assertEqual(self.searchEngine.searchQuery('вискас'), {})

    def test_we_is(self):
        expected = {
            'test0.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(5, 7, 2),
                indexator.Position(5, 7, 3),
                indexator.Position(12, 14, 1),
                indexator.Position(13, 15, 2),
                indexator.Position(13, 15, 3)
            ],
            'test2.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(20, 22, 1),
                indexator.Position(5, 7, 2),
                indexator.Position(12, 14, 1),
                indexator.Position(28, 30, 1),
                indexator.Position(13, 15, 2)
            ]
        }
        self.assertEqual(self.searchEngine.searchQuery('we is'), expected)

    def test_need(self):
        expected = {
            'test0.txt': [
                indexator.Position(7, 11, 1),
                indexator.Position(8, 12, 2),
                indexator.Position(8, 12, 3)
            ],
            'test2.txt': [
                indexator.Position(7, 11, 1),
                indexator.Position(23, 27, 1),
                indexator.Position(8, 12, 2)
            ]
        }
        self.assertEqual(self.searchEngine.searchQuery('need'), expected)

    # unittests for contexts
    def test_context(self):
        pos = indexator.Position(20, 22, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            2, 'test.txt', pos)
        self.assertEqual(context.string, "is, all we need is")

    def test_context_line_not_exists(self):
        pos = indexator.Position(20, 22, 2)
        with self.assertRaises(ValueError):
            searchEngine.ContextWindow.makeWindowGreatAgain(2, 'test.txt', pos)

    def test_context_large_size(self):
        pos = indexator.Position(20, 22, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            8, 'test.txt', pos)
        self.assertEqual(context.string,
                         "All we need is, all we need is, all we need is")

    def test_context_zero_size(self):
        pos = indexator.Position(20, 22, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            0, 'test.txt', pos)
        self.assertEqual(context.string, "we")

    def test_context_two_windows(self):
        poss = [indexator.Position(20, 22, 1), indexator.Position(32, 35, 1)]
        contexts = [
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[0]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[1])
        ]
        contextUnion = searchEngine.ContextWindow().unionWindows(contexts)
        targetTokensPositions = [
            indexator.Position(20, 22, 1),
            indexator.Position(32, 35, 1)
        ]
        expected = searchEngine.ContextWindow.initWithData(
            "All we need is, all we need is, all we need is",
            targetTokensPositions, 43, 12, "is, all we need is, all we need",
            "test.txt", 1)
        expectedList = []
        expectedList.append(expected)
        self.assertEqual(contextUnion, expectedList)

    def test_context_many_windows(self):
        poss = [
            indexator.Position(20, 22, 1),
            indexator.Position(32, 35, 1),
            indexator.Position(7, 12, 1),
            indexator.Position(20, 22, 1),
            indexator.Position(28, 30, 1),
            indexator.Position(1, 4, 2)
        ]
        contexts = [
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[0]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[1]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                1, 'testtest.txt', poss[2]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                8, 'testtesttest.txt', poss[3]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'testtesttest.txt', poss[4]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'testtesttest.txt', poss[5])
        ]
        contextUnion = searchEngine.ContextWindow().unionWindows(contexts)

        targetTokensPositions1 = [
            indexator.Position(20, 22, 1),
            indexator.Position(32, 35, 1)
        ]
        expected1 = searchEngine.ContextWindow.initWithData(
            "All we need is, all we need is, all we need is",
            targetTokensPositions1, 43, 12, "is, all we need is, all we need",
            "test.txt", 1)

        targetTokensPositions2 = [indexator.Position(7, 12, 1)]
        expected2 = searchEngine.ContextWindow.initWithData(
            "Blood, blood, blood", targetTokensPositions2, 19, 0,
            "Blood, blood, blood", "testtest.txt", 1)

        targetTokensPositions3 = [
            indexator.Position(20, 22, 1),
            indexator.Position(28, 30, 1)
        ]
        expected3 = searchEngine.ContextWindow.initWithData(
            "All we need is, all we need is,\n", targetTokensPositions3, 30, 0,
            "All we need is, all we need is", "testtesttest.txt", 1)

        targetTokensPositions4 = [indexator.Position(1, 4, 2)]
        expected4 = searchEngine.ContextWindow.initWithData(
            " all we need is", targetTokensPositions4, 12, 1, "all we need",
            "testtesttest.txt", 2)

        expectedList = []
        expectedList.append(expected1)
        expectedList.append(expected2)
        expectedList.append(expected3)
        expectedList.append(expected4)
        self.assertEqual(contextUnion, expectedList)

    def test_context_expand_to_sentence(self):
        pos = indexator.Position(24, 28, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            1, 'testSentence.txt', pos)
        context.expandToSentence()
        targetTokensPositions = [indexator.Position(24, 28, 1)]
        expected = searchEngine.ContextWindow.initWithData(
            "What do we need? All we need is blood. Pain pain pain pain",
            targetTokensPositions, 38, 17, "All we need is blood.",
            "testSentence.txt", 1)
        self.assertEqual(context, expected)

    def test_context_expand_to_sentence_two_tokens(self):
        poss = [indexator.Position(21, 23, 1), indexator.Position(24, 28, 1)]
        contexts = [
            searchEngine.ContextWindow.makeWindowGreatAgain(
                1, 'testSentence.txt', poss[0]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                1, 'testSentence.txt', poss[1])
        ]
        contextUnion = searchEngine.ContextWindow().unionWindows(contexts)
        contextUnion[0].expandToSentence()
        context = contextUnion[0]
        targetTokensPositions = [
            indexator.Position(21, 23, 1),
            indexator.Position(24, 28, 1)
        ]
        expected = searchEngine.ContextWindow.initWithData(
            "What do we need? All we need is blood. Pain pain pain pain",
            targetTokensPositions, 38, 17, "All we need is blood.",
            "testSentence.txt", 1)
        self.assertEqual(context, expected)

    # def test_query_context(self):
    #     expected = {
    #         'test.txt': [
    #             indexator.Position(4, 6, 1),
    #             indexator.Position(5, 7, 2),
    #             indexator.Position(5, 7, 3),
    #             indexator.Position(12, 14, 1),
    #             indexator.Position(13, 15, 2),
    #             indexator.Position(13, 15, 3)],
    #         'test2.txt': [
    #             indexator.Position(4, 6, 1),
    #             indexator.Position(20, 22, 1),
    #             indexator.Position(5, 7, 2),
    #             indexator.Position(12, 14, 1),
    #             indexator.Position(28, 30, 1),
    #             indexator.Position(13, 15, 2)]}
    #     print(searchEngine.ContextWindow.makeWindowGreatAgain(
    #         3, 'test0.txt', indexator.Position(12, 14, 1),))
    #     self.assertEqual(self.searchEngine.searchQueryWindow('blood pain', 3), expected)

    def tearDown(self):
        # Drop the engine first so its database files can be removed below.
        self.searchEngine.__del__()
        files = os.listdir(path=".")
        for file in files:
            if file.startswith('TestDatabase'):
                os.remove(file)
            if file.startswith('test'):
                os.remove(file)


if __name__ == '__main__':
    unittest.main()