Example #1
def search():
    if request.method == 'POST':
        searchTerm = request.form['searchTermInput']
        engine = SearchEngine()
        resultList = engine.search(searchTerm.lower())
        return render_template("search.html", resultList=resultList, listSize=len(resultList))
    return render_template("search.html")
Example #2
def index():
    if request.method == 'POST':
        startingURL = request.form['urlInput']
        engine = SearchEngine()
        numPagesIndexed, numWordsIndexed = engine.indexFrom(startingURL, MAX_DEPTH)
        return render_template("index.html", numPagesIndexed=numPagesIndexed, 
            numWordsIndexed=numWordsIndexed)
    return render_template("index.html")
Example #3
def main():
    searchEn = SearchEngine()
    searchEn.startSearchEngine()
    window = sg.Window("My Search Engine", layout)
    while True:
        event, values = window.read()
        if event is None:
            searchEn.closeConnection()
            break
        if event == "search":
            searchEn.searchInterface(values["IN"])
    window.close()
Example #4
def setUp(self):
    with open('test0.txt', 'w') as f:
        f.write('All we need is,\n all we need is,\n all we need is')
    with open('test1.txt', 'w') as f:
        f.write('Blood, blood,\n blood')
    with open('test2.txt', 'w') as f:
        f.write('All we need is, all we need is,\n all we need is')
    with open('test.txt', 'w') as f:
        f.write('All we need is, all we need is, all we need is')
    with open('testtest.txt', 'w') as f:
        f.write('Blood, blood, blood')
    with open('testtesttest.txt', 'w') as f:
        f.write('All we need is, all we need is,\n all we need is')
    with open('testSentence.txt', 'w') as f:
        f.write(
            'What do we need? All we need is blood. Pain pain pain pain')
    indexer = Indexator('TestDatabase')
    indexer.indexize('test0.txt')
    indexer.indexize('test1.txt')
    indexer.indexize('test2.txt')
    self.searchEngine = SearchEngine("TestDatabase")
Example #5
def main():
    print("Start crawling, please wait")
    crawler = WebCrawler("https://s2.smu.edu/~fmoore/index.htm", 200)
    crawler.setup()
    crawler.crawl()
    crawler.buildTFMatrix()
    crawler.printTFMatrix()
    crawler.topNWords(20)
    crawler.printInfo()
    
    print("Crawling completed, results are save to result.txt and tf_matrix.csv")
    print("Starting query search")

    engine = SearchEngine(crawler)
    engine.loadThesaurus("thesaurus.csv")
    while True:
        query = input("Please input query or stop to terminate query search:")
        # convert to lower case
        query = query.lower()
        if query == "stop":
            print("Thanks for using!")
            break
        engine.engine(query)
        print("Done")
        print('+++++++++++++++++++++++++++++++++++++++++')
Example #6
import os

from elasticsearch import Elasticsearch, helpers
# SearchEngine and getSnippets are defined elsewhere in this project


def genData(snippets, filename):
    # Yield one bulk action per snippet; the extra "id" and "snippet" keys
    # become fields of the document indexed into "news_prog".
    for row, snippet in enumerate(snippets):
        yield {
            "_index": "news_prog",
            "id": (filename, row + 2),
            "snippet": snippet
        }

if __name__ == "__main__":
    dirPath = os.path.dirname(os.path.realpath(__file__))
    dataPath = os.path.realpath(os.path.join(dirPath, "data"))
    files = [os.path.join(dataPath, file)
                for file in sorted(os.listdir(dataPath))]

    # Our Search Engine
    engine = SearchEngine()

    # Elasticsearch
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

    for file in files:
        snippets = getSnippets(file)
        filename = int(os.path.split(file)[1].split(".csv")[0])
        helpers.bulk(es, genData(snippets, filename))

    while True:

        print("\n\n0. Standard query")
        print("1. Allows positional indexing")
        print("2. Allows wildcard terms")
        print("3. Allows both wildcards and positional indexing")
Example #7
from invertedIndex import InvertedIndex
import datetime
from searchEngine import SearchEngine
from pathlib import Path
import json

#x = datetime.datetime.now()
'''
invertIndex = InvertedIndex("DEV")
invertIndex.readIndex(r"DEV")
'''
#y = datetime.datetime.now()

search = SearchEngine("DEV")
search.searchInterfaceCommandLine()
Example #8
# from rss import Rss
#
# module = Rss('chosun')
# module.scrap()

# from crawler import Crawler
#
# module = Crawler('chosun')
# result = module.scrap('http://news.chosun.com/site/data/html_dir/2018/10/08/2018100802172.html')
# print(result)

from searchEngine import SearchEngine

module = SearchEngine('chosun')
module.do_search('화재')
Example #9
import urllib, urllib2, requests
import markupsafe
import os, json, datetime, logging

import model

import webapp2
from webapp2_extras import sessions
from google.appengine.api import channel

from searchimagery import ScriptEngine
from searchEngine import SearchEngine
from geolocation import Geolocation
from YoutubeSearch import YtubeSearch

geolocation = Geolocation()
searchEngine = SearchEngine()
dbWrapper = model.DataStoreWrapper


class QueryHandler(webapp2.RequestHandler):
    def dispatch(self):
        # Get a session store for this request.
        self.session_store = sessions.get_store(request=self.request)
        try:
            # Dispatch the request.
            webapp2.RequestHandler.dispatch(self)
        finally:
            # Save all sessions.
            self.session_store.save_sessions(self.response)

    def getSession(self):
from flask import Flask, render_template, request, redirect
from searchEngine import SearchEngine

app = Flask(__name__)
se = SearchEngine("index")


@app.route("/")
def redir():
    return render_template("index.html")


# POST REQUEST TO GET RESULTS
# CHECK IF USER ACTUALLY SUBMITS A QUERY LATER!
@app.route("/results", methods=['POST', 'GET'])
def getResults():
    if request.method == "POST":
        query = request.form["query"]
        results = se.search(query, 5)
        return render_template("results.html", results=results)


@app.route("/back")
def goBack():
    return render_template("index.html")


# main function
if __name__ == "__main__":
    app.run()
# test search engine methods

from searchEngine import SearchEngine
se = SearchEngine("index")

print(se.search("cristina lopes", 5))
print(se.search("machine learning", 5))
print(se.search("ACM", 5))
print(se.search("master of software engineering", 5))

# import pickle
#
# file = open("pIndex1.pkl", "rb")
# d = pickle.load(file)
# file.close()
#
# print(len(d))

# for k, v in d.items():
#     #print(k,v)
#     print(k, ": ", v)

# LINKED LIST OR SET OF POSTINGS?
# IF YOU USE A SET, YOU NEED TO IMPLEMENT __EQ__, __HASH__, ETC.

# USE STEMMING TO CUT DOWN ON THE NUMBER OF ENTRIES IN THE INDICES

# MERGING STRATEGY (see the sketch below)
# keep a final index for every letter
# create partial indexes first
# go through each sorted partial index and load one letter's entries into memory at a time
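The notes above describe a letter-by-letter merge of partial indexes. A minimal sketch of that idea (mergePartialIndexes is a hypothetical helper, assuming pickled token-to-postings dictionaries like the pIndex1.pkl file mentioned earlier):

import pickle
from collections import defaultdict
from string import ascii_lowercase


def mergePartialIndexes(partialPaths, outDir):
    # Sketch only: merge pickled partial indexes (dicts of token -> postings)
    # one leading letter at a time, so only that letter's postings sit in memory.
    for letter in ascii_lowercase:
        merged = defaultdict(list)
        for path in partialPaths:
            with open(path, "rb") as f:
                partial = pickle.load(f)
            for token, postings in partial.items():
                if token.startswith(letter):
                    merged[token].extend(postings)
        # one final index file per letter, as the notes suggest
        with open(f"{outDir}/index_{letter}.pkl", "wb") as f:
            pickle.dump(dict(merged), f)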
import os
import unittest

import indexator
import searchEngine
# assuming Indexator and SearchEngine live in those project modules
from indexator import Indexator
from searchEngine import SearchEngine


class Test(unittest.TestCase):
    def setUp(self):
        with open('test0.txt', 'w') as f:
            f.write('All we need is,\n all we need is,\n all we need is')
        with open('test1.txt', 'w') as f:
            f.write('Blood, blood,\n blood')
        with open('test2.txt', 'w') as f:
            f.write('All we need is, all we need is,\n all we need is')
        with open('test.txt', 'w') as f:
            f.write('All we need is, all we need is, all we need is')
        with open('testtest.txt', 'w') as f:
            f.write('Blood, blood, blood')
        with open('testtesttest.txt', 'w') as f:
            f.write('All we need is, all we need is,\n all we need is')
        with open('testSentence.txt', 'w') as f:
            f.write(
                'What do we need? All we need is blood. Pain pain pain pain')
        indexer = Indexator('TestDatabase')
        indexer.indexize('test0.txt')
        indexer.indexize('test1.txt')
        indexer.indexize('test2.txt')
        self.searchEngine = SearchEngine("TestDatabase")

    # unittests for search
    def test_input_type_number(self):
        with self.assertRaises(ValueError):
            self.searchEngine.search(13)

    def test_input_type_not_exists(self):
        self.assertEqual(self.searchEngine.search('вискас'), {})

    def test_we(self):
        expected = {
            'test0.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(5, 7, 2),
                indexator.Position(5, 7, 3)
            ],
            'test2.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(20, 22, 1),
                indexator.Position(5, 7, 2)
            ]
        }
        self.assertEqual(self.searchEngine.search('we'), expected)

    def test_blood(self):
        expected = {
            'test1.txt':
            [indexator.Position(7, 12, 1),
             indexator.Position(1, 6, 2)]
        }
        self.assertEqual(self.searchEngine.search("blood"), expected)

    # unittests for searchQuery
    def test__query_input_type_number(self):
        with self.assertRaises(ValueError):
            self.searchEngine.searchQuery(13)

    def test_query_input_type_not_exists(self):
        self.assertEqual(self.searchEngine.searchQuery('вискас'), {})

    def test_we_is(self):
        expected = {
            'test0.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(5, 7, 2),
                indexator.Position(5, 7, 3),
                indexator.Position(12, 14, 1),
                indexator.Position(13, 15, 2),
                indexator.Position(13, 15, 3)
            ],
            'test2.txt': [
                indexator.Position(4, 6, 1),
                indexator.Position(20, 22, 1),
                indexator.Position(5, 7, 2),
                indexator.Position(12, 14, 1),
                indexator.Position(28, 30, 1),
                indexator.Position(13, 15, 2)
            ]
        }
        self.assertEqual(self.searchEngine.searchQuery('we is'), expected)

    def test_need(self):
        expected = {
            'test0.txt': [
                indexator.Position(7, 11, 1),
                indexator.Position(8, 12, 2),
                indexator.Position(8, 12, 3)
            ],
            'test2.txt': [
                indexator.Position(7, 11, 1),
                indexator.Position(23, 27, 1),
                indexator.Position(8, 12, 2)
            ]
        }
        self.assertEqual(self.searchEngine.searchQuery('need'), expected)

    # unittests for contexts
    def test_context(self):
        pos = indexator.Position(20, 22, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            2, 'test.txt', pos)
        self.assertEqual(context.string, "is, all we need is")

    def test_context_line_not_exists(self):
        pos = indexator.Position(20, 22, 2)
        with self.assertRaises(ValueError):
            searchEngine.ContextWindow.makeWindowGreatAgain(2, 'test.txt', pos)

    def test_context_large_size(self):
        pos = indexator.Position(20, 22, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            8, 'test.txt', pos)
        self.assertEqual(context.string,
                         "All we need is, all we need is, all we need is")

    def test_context_zero_size(self):
        pos = indexator.Position(20, 22, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            0, 'test.txt', pos)
        self.assertEqual(context.string, "we")

    def test_context_two_windows(self):
        poss = [indexator.Position(20, 22, 1), indexator.Position(32, 35, 1)]
        contexts = [
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[0]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[1])
        ]
        contextUnion = searchEngine.ContextWindow().unionWindows(contexts)
        targetTokensPositions = [
            indexator.Position(20, 22, 1),
            indexator.Position(32, 35, 1)
        ]
        expected = searchEngine.ContextWindow.initWithData(
            "All we need is, all we need is, all we need is",
            targetTokensPositions, 43, 12, "is, all we need is, all we need",
            "test.txt", 1)
        expectedList = []
        expectedList.append(expected)
        self.assertEqual(contextUnion, expectedList)

    def test_context_many_windows(self):
        poss = [
            indexator.Position(20, 22, 1),
            indexator.Position(32, 35, 1),
            indexator.Position(7, 12, 1),
            indexator.Position(20, 22, 1),
            indexator.Position(28, 30, 1),
            indexator.Position(1, 4, 2)
        ]
        contexts = [
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[0]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'test.txt', poss[1]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                1, 'testtest.txt', poss[2]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                8, 'testtesttest.txt', poss[3]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'testtesttest.txt', poss[4]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                2, 'testtesttest.txt', poss[5])
        ]
        contextUnion = searchEngine.ContextWindow().unionWindows(contexts)

        targetTokensPositions1 = [
            indexator.Position(20, 22, 1),
            indexator.Position(32, 35, 1)
        ]
        expected1 = searchEngine.ContextWindow.initWithData(
            "All we need is, all we need is, all we need is",
            targetTokensPositions1, 43, 12, "is, all we need is, all we need",
            "test.txt", 1)

        targetTokensPositions2 = [indexator.Position(7, 12, 1)]
        expected2 = searchEngine.ContextWindow.initWithData(
            "Blood, blood, blood", targetTokensPositions2, 19, 0,
            "Blood, blood, blood", "testtest.txt", 1)

        targetTokensPositions3 = [
            indexator.Position(20, 22, 1),
            indexator.Position(28, 30, 1)
        ]
        expected3 = searchEngine.ContextWindow.initWithData(
            "All we need is, all we need is,\n", targetTokensPositions3, 30, 0,
            "All we need is, all we need is", "testtesttest.txt", 1)

        targetTokensPositions4 = [indexator.Position(1, 4, 2)]
        expected4 = searchEngine.ContextWindow.initWithData(
            " all we need is", targetTokensPositions4, 12, 1, "all we need",
            "testtesttest.txt", 2)

        expectedList = []
        expectedList.append(expected1)
        expectedList.append(expected2)
        expectedList.append(expected3)
        expectedList.append(expected4)
        self.assertEqual(contextUnion, expectedList)

    def test_context_expand_to_sentence(self):
        pos = indexator.Position(24, 28, 1)
        context = searchEngine.ContextWindow.makeWindowGreatAgain(
            1, 'testSentence.txt', pos)
        context.expandToSentence()
        targetTokensPositions = [indexator.Position(24, 28, 1)]
        expected = searchEngine.ContextWindow.initWithData(
            "What do we need? All we need is blood. Pain pain pain pain",
            targetTokensPositions, 38, 17, "All we need is blood.",
            "testSentence.txt", 1)
        self.assertEqual(context, expected)

    def test_context_expand_to_sentence_two_tokens(self):
        poss = [indexator.Position(21, 23, 1), indexator.Position(24, 28, 1)]
        contexts = [
            searchEngine.ContextWindow.makeWindowGreatAgain(
                1, 'testSentence.txt', poss[0]),
            searchEngine.ContextWindow.makeWindowGreatAgain(
                1, 'testSentence.txt', poss[1])
        ]
        contextUnion = searchEngine.ContextWindow().unionWindows(contexts)
        contextUnion[0].expandToSentence()
        context = contextUnion[0]
        targetTokensPositions = [
            indexator.Position(21, 23, 1),
            indexator.Position(24, 28, 1)
        ]
        expected = searchEngine.ContextWindow.initWithData(
            "What do we need? All we need is blood. Pain pain pain pain",
            targetTokensPositions, 38, 17, "All we need is blood.",
            "testSentence.txt", 1)
        self.assertEqual(context, expected)

    # def test_query_context(self):
    #     expected = {
    #         'test.txt': [
    #             indexator.Position(4, 6, 1),
    #             indexator.Position(5, 7, 2),
    #             indexator.Position(5, 7, 3),
    #             indexator.Position(12, 14, 1),
    #             indexator.Position(13, 15, 2),
    #             indexator.Position(13, 15, 3)],
    #         'test2.txt': [
    #             indexator.Position(4, 6, 1),
    #             indexator.Position(20, 22, 1),
    #             indexator.Position(5, 7, 2),
    #             indexator.Position(12, 14, 1),
    #             indexator.Position(28, 30, 1),
    #             indexator.Position(13, 15, 2)]}
    #     print(searchEngine.ContextWindow.makeWindowGreatAgain(
    #         3, 'test0.txt', indexator.Position(12, 14, 1),))
    #     self.assertEqual(self.searchEngine.searchQueryWindow('blood pain', 3), expected)

    def tearDown(self):
        self.searchEngine.__del__()  # explicitly tear down the engine (presumably releasing its database) before deleting files
        files = os.listdir(path=".")
        for file in files:
            if file.startswith('TestDatabase'):
                os.remove(file)
            if file.startswith('test'):
                os.remove(file)