Esempio n. 1
0
def check_by_input_file(input_name):
    """Score one YAML fixture and assert the result equals its expected scores.

    The fixture is read from ``test/data/scorer/<input_name>``. Its ``input``
    entry is handed to ``Scorer`` and the value returned by
    ``calculate_scores()`` is compared against the fixture's ``scores`` entry.

    Args:
        input_name: File name of the fixture inside ``test/data/scorer``.

    Raises:
        AssertionError: If the calculated scores differ from the expected ones.
    """
    input_file = os.path.join("test/data/scorer", input_name)

    # Read inside a context manager so the file handle is always closed,
    # even if reading or parsing fails.
    with open(input_file) as fixture:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects and is rejected by newer PyYAML releases.
        # The fixtures are presumably trusted repository files; consider
        # yaml.safe_load once confirmed that none of them rely on
        # Python-specific tags.
        test_data = yaml.load(fixture.read())

    scores = Scorer(test_data['input']).calculate_scores()
    expected_scores = test_data['scores']

    assert scores == expected_scores, "Incorrect scores for '{0}'.".format(input_name)
Esempio n. 2
0
def test_invalid_data():
    """Check that an error raised by the team validator reaches the caller.

    ``scorer.validate_team`` is patched with a stand-in that first verifies it
    was handed the expected per-team data and then raises a known exception.
    The test passes only if that very exception object escapes while the
    ``Scorer`` is constructed or its scores are calculated.
    """
    sentinel = object()
    teams = dict.fromkeys(("TLA1", "TLA2"), sentinel)
    expected_error = Exception('bacon')

    def failing_validator(tla, data):
        assert data is sentinel, "Wrong data passed to validator"
        raise expected_error

    # Extra keyword arguments given to patch() configure the mock it creates,
    # so the side effect is installed as soon as the patch is active.
    with mock.patch('scorer.validate_team', create=True,
                    side_effect=failing_validator):
        raised = False
        try:
            Scorer(teams).calculate_scores()
        except Exception as caught:
            raised = True
            # Identity check: it must be the validator's own exception.
            assert caught is expected_error

        assert raised, "Should have experienced an error from the validator"
# Terms handed to the Indexer as stop words: the document names d01-d08
# followed by common English function words.
STOP_WORDS = [
    'd01', 'd02', 'd03', 'd04', 'd05', 'd06', 'd07', 'd08',
    'a', 'also', 'an', 'and', 'are', 'as', 'at',
    'be', 'by', 'do', 'for', 'have',
    'is', 'in', 'it', 'of', 'or',
    'see', 'so', 'that', 'the', 'this', 'to', 'we',
]


# Build the crawler from every seed page resolved against SEED_URL.
crawler = Crawler([urljoin(SEED_URL, page) for page in SEED_PAGES])

# Page rank is built from the crawler's incoming/outgoing web graphs.
page_rank = PageRank(crawler.webgraph_in, crawler.webgraph_out)
page_rank.build_graph()

# The index is built from the crawled contents, given the stop words.
index = Indexer(crawler.contents, STOP_WORDS)
index.build_index()

scorer = Scorer(index)

print("> SIMPLE SEARCH ENGINE (by Tammo, Tim & Flo)")

# Interactive prompt: read a query, score it, and print the matching URLs
# ordered by score weighted with page rank. Loops until the process is stopped.
while True:
    scores = scorer.calculate_scores(input("\n> query: "))

    if not scores:
        print("your search term does not occur on any page")
        continue

    # Look each URL's rank up once and reuse it for both the rank column and
    # the combined value, instead of querying page_rank twice per URL.
    ranked_scores = []
    for url, score in scores.items():
        rank = page_rank.get_rank(url)
        ranked_scores.append((url, score, rank, score * rank))

    # Highest combined value first; the sort is stable, so ties keep the
    # order in which the scorer returned them.
    ranked_scores.sort(key=lambda element: element[3], reverse=True)

    print("\n               url | score  | rank   | rank * score\n" + "-" * 54)
    for url, score, rank, ranked_score in ranked_scores:
        # Only the last 15 characters of the URL are shown to keep columns aligned.
        print(" ..{} | {:.4f} | {:.4f} | {:.4f}".format(url[-15:], round(score, 6), round(rank, 6), round(ranked_score, 6)))
def _length_status(actual, expected):
    """Return "OK" when *actual*, rounded to 6 decimals, equals *expected*."""
    if round(actual, 6) == expected:
        return "OK"
    return "WRONG"


# Compare the indexed length of three documents against known reference values.
d08_len = index.documents_length['http://mysql12.f4.htw-berlin.de/crawl/d08.html']
print("  d08 length:  " + _length_status(d08_len, 2.727447))

d06_len = index.documents_length['http://mysql12.f4.htw-berlin.de/crawl/d06.html']
print("  d06 length:  " + _length_status(d06_len, 1.974093))

d04_len = index.documents_length['http://mysql12.f4.htw-berlin.de/crawl/d04.html']
print("  d04 length:  " + _length_status(d04_len, 4.312757))


def _scores_match(actual_scores, expected_scores):
    """Return True when every expected (url, score) pair matches *actual_scores*.

    Scores are compared after rounding to 6 decimals. A list (not a generator)
    is built on purpose so that every URL is looked up before the result is
    reduced, i.e. a missing URL always raises KeyError.
    """
    return all([round(actual_scores[url], 6) == expected
                for url, expected in expected_scores])


print("\n# Scorer TEST")

scorer = Scorer(index)

# Reference scores for the query 'tokens'.
tokens_scores = scorer.calculate_scores('tokens')
tokens_scores_check = _scores_match(tokens_scores, (
    ('http://mysql12.f4.htw-berlin.de/crawl/d08.html', 0.119897),
    ('http://mysql12.f4.htw-berlin.de/crawl/d02.html', 0.093106),
    ('http://mysql12.f4.htw-berlin.de/crawl/d04.html', 0.061577),
    ('http://mysql12.f4.htw-berlin.de/crawl/d01.html', 0.051784),
    ('http://mysql12.f4.htw-berlin.de/crawl/d03.html', 0.045677),
))
print("  'tokens' score:  " + ("OK" if tokens_scores_check else "WRONG"))

# Reference scores for the query 'index'.
index_scores = scorer.calculate_scores('index')
index_scores_check = _scores_match(index_scores, (
    ('http://mysql12.f4.htw-berlin.de/crawl/d08.html', 0.250207),
    ('http://mysql12.f4.htw-berlin.de/crawl/d05.html', 0.233073),
    ('http://mysql12.f4.htw-berlin.de/crawl/d04.html', 0.098769),
))
print("  'index' score:  " + ("OK" if index_scores_check else "WRONG"))