Example #1
from crawler import Crawler
from indexer import Indexer
from query_processor import QuerryProcessor
from document import Document

if __name__ == '__main__':

    # crawl starting from the seed URL; 'BFS' selects breadth-first traversal
    crawler = Crawler('https://www.in.gr', 20, 5, True, 'BFS')
    crawler.initializeCrawl()

    # index the documents collected during the crawl
    ind = Indexer(Crawler.documents)

    query = input("Enter your search query: ")
    # index the query as a pseudo-document so it also gets a TF-IDF
    # vector and can be compared against the crawled pages
    ind.add_document(Document('search_query', query))
    print('Building Indexer...')
    ind.create_indexer()
    print('Calculating TF-IDFs. May take a while.')
    ind.calculate_scores()

    qp = QuerryProcessor(ind.inverted_index, len(ind.documents))
    docs_with_cos_ = qp.compare_documents()
    docs_with_cos_ = sorted(
        docs_with_cos_, key=lambda x: x[1],
        reverse=True)  # sorting based on cosine similarity scores
    print(f'Showing top results based on your query "{query}":')
    for doc in docs_with_cos_:
        print(doc[0].link)
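
The Indexer and QuerryProcessor internals are not shown above. The sketch below is a rough, self-contained illustration of the ranking idea this example relies on (TF-IDF weighting plus cosine similarity, with the query indexed as one more document); it assumes plain bag-of-words token lists and the common idf = log(N / df) variant, not the project's exact formulas.

import math
from collections import Counter

def tf_idf_vectors(docs):
    """Build a sparse {term: weight} TF-IDF vector per document."""
    n = len(docs)
    df = Counter()                          # document frequency per term
    for tokens in docs:
        df.update(set(tokens))
    vectors = []
    for tokens in docs:
        tf = Counter(tokens)                # raw term frequency
        vectors.append({t: tf[t] * math.log(n / df[t]) for t in tf})
    return vectors

def cosine(a, b):
    """Cosine similarity between two sparse {term: weight} vectors."""
    dot = sum(w * b.get(t, 0.0) for t, w in a.items())
    norm_a = math.sqrt(sum(w * w for w in a.values()))
    norm_b = math.sqrt(sum(w * w for w in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

# Mirror the example's flow: treat the query as one more document,
# then rank the real documents by cosine similarity against it.
docs = [['web', 'crawler', 'search'], ['cooking', 'recipes'], ['web', 'search']]
query = ['web', 'search']
vectors = tf_idf_vectors(docs + [query])
query_vec = vectors[-1]
ranked = sorted(((i, cosine(v, query_vec)) for i, v in enumerate(vectors[:-1])),
                key=lambda pair: pair[1], reverse=True)
print(ranked)  # document indices, best match first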
Example #2
# module paths for these project-local classes are assumed
from frontier import Frontier
from graph import Graph
from indexer import Indexer
from parser import Parser

frontier = Frontier([
    'http://mysql12.f4.htw-berlin.de/crawl/d01.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d06.html',
    'http://mysql12.f4.htw-berlin.de/crawl/d08.html'
])
parser = Parser()
indexer = Indexer()
web_graph = Graph()

for url in frontier:
    # get outgoing links for the graph and content for tokenization
    body, links_on_page = parser.parse(url)

    # add document to indexer
    indexer.add_document(url, body)

    # build our webgraph
    node = web_graph.get_node(url)
    if node is None:
        node = web_graph.add_node(url)

    for out_link in links_on_page:
        web_graph.add_edge(url, out_link)

    # hand links to the frontier to make sure they are all crawled
    frontier.add_urls(links_on_page)

# uncomment to inspect the finished web graph:
# for node in web_graph:
#     print(node)
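
Frontier, Parser, Indexer, and Graph are project-local classes whose code is not shown. The loop above assumes a frontier that can keep growing while it is being iterated (skipping URLs it has already seen) and a directed web graph with get_node/add_node/add_edge. The sketch below is a minimal illustration of those two interfaces under these assumptions, with plain URL strings standing in for node objects; the real classes may well differ.

class Frontier:
    """URL queue that may grow during iteration and skips duplicates."""

    def __init__(self, seed_urls):
        self._queue = list(seed_urls)
        self._seen = set(seed_urls)

    def add_urls(self, urls):
        for url in urls:
            if url not in self._seen:       # crawl each URL at most once
                self._seen.add(url)
                self._queue.append(url)

    def __iter__(self):
        i = 0
        while i < len(self._queue):         # the queue can grow while we iterate
            yield self._queue[i]
            i += 1

class Graph:
    """Directed web graph kept as an adjacency list of URL strings."""

    def __init__(self):
        self._adjacency = {}

    def get_node(self, url):
        return url if url in self._adjacency else None

    def add_node(self, url):
        self._adjacency.setdefault(url, set())
        return url

    def add_edge(self, source, target):
        self.add_node(source)
        self.add_node(target)
        self._adjacency[source].add(target)

    def __iter__(self):
        return iter(self._adjacency)        # enables `for node in web_graph`

# tiny usage check
f = Frontier(['http://example.com/a'])
f.add_urls(['http://example.com/a', 'http://example.com/b'])  # duplicate skipped
print(list(f))  # ['http://example.com/a', 'http://example.com/b']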