Esempio n. 1
0
    def __init__(self):
        """Crawl consecutive example revisions pairwise and persist their diffs.

        For each adjacent (old, new) pair in ``self.examples`` (assumed to be a
        sequence of ``(hash, url)`` tuples — TODO confirm against the class
        definition), both pages are fetched, their contents compared, and
        per-element screenshot diffs stored via ``DbDiff``, including diffs
        reachable only through recorded page actions.
        """
        db = get_session()
        # Start from an empty database so the run's results are not mixed
        # with leftovers from a previous run.
        truncate_db(db)

        config = ProjectConfig("A:/Development/magistrska/DependencyDiff/configs/fri.json", "A:/Development/magistrska/DependencyDiff/configs/fri.results.json")

        # Pairwise iteration: (examples[0], examples[1]), (examples[1], examples[2]), ...
        for old, new in zip(self.examples, self.examples[1::]):
            self.old_hash, self.old_url = old
            self.new_hash, self.new_url = new

            # The old page is created (or fetched) under the site root URL,
            # then its url is switched to the revision-specific one before
            # crawling; it is switched back at the end of the loop body.
            old_page = Page.get_or_create(db, self.project_name, self.old_hash, "https://www.fri.uni-lj.si/en/")
            old_page.url = self.old_url
            new_page = Page.get_or_create(db, self.project_name, self.new_hash, self.new_url)

            # Both crawlers target port 443 (HTTPS).
            old_crawler = Crawler(old_page, "443")
            new_crawler = Crawler(new_page, "443")

            old_crawler.get_page(db, old_page, config, self.old_hash)
            new_crawler.get_page(db, new_page, config, self.new_hash)

            # Fall back to "" when a page produced no content records.
            old_content = old_page.contents[0].content if len(old_page.contents) > 0 else ""
            new_content = new_page.contents[0].content if len(new_page.contents) > 0 else ""

            compare_result = Compare.compare(old_content, new_content)

            if compare_result:
                # Screenshot each differing element on both revisions and
                # persist the pair as a diff record.
                element_diff = Compare.extract_differences(compare_result)
                for element in element_diff:
                    old_diff = old_crawler.screenshot(element)
                    new_diff = new_crawler.screenshot(element)
                    DbDiff.get_or_create(db, old_page.id, new_page.id, element, old_diff[1], new_diff[1], old_diff[0],
                                         new_diff[0])

            # Diff action-triggered states: match actions by element and type
            # across the two revisions (O(n*m) nested scan).
            for old_action in old_page.actions:
                for new_action in new_page.actions:
                    if old_action.element == new_action.element and old_action.type == new_action.type:
                        old_action_content = old_action.content
                        new_action_content = new_action.content

                        # Skip actions whose resulting content is identical to
                        # the base page content on both sides.
                        if old_content != old_action_content or new_content != new_action_content:

                            compare_result = Compare.compare(old_action_content, new_action_content)
                            if compare_result:
                                element_diff = Compare.extract_differences(compare_result)

                                if element_diff:
                                    # Re-drive the browser to the post-action
                                    # state before taking screenshots.
                                    old_crawler.visit_and_action(old_page, old_action, config)
                                    new_crawler.visit_and_action(new_page, new_action, config)
                                    for element in element_diff:
                                        old_diff = old_crawler.screenshot(element)
                                        new_diff = new_crawler.screenshot(element)
                                        DbDiff.get_or_create(db, old_page.id, new_page.id, element, old_diff[1],
                                                             new_diff[1], old_diff[0], new_diff[0], new_action.id)

            # Restore both pages to the canonical site root URL before commit.
            old_page.url = "https://www.fri.uni-lj.si/en/"
            new_page.url = "https://www.fri.uni-lj.si/en/"

            db.commit()
Esempio n. 2
0
def start():
    """Launch one crawler thread per (search, twitter account) pair, then wait.

    Reads connection settings and search definitions from the module-level
    ``config``; each Crawler is started and joined, so this call blocks until
    every crawler thread has finished.
    """
    db = config['database']
    es = create_es_connection(
        db['host'],
        db['port'],
        db['access_key'],
        db['secret_key'],
        db['region'])

    threads = []
    for search in config['searches']:
        ed = ElasticDriver(es, search['name'])
        # Number the crawlers per search, starting at 1, to build unique names.
        for index, account in enumerate(search['twitterAccounts'], start=1):
            driver = TwitterDriver(
                search['keywords'],
                ed,
                search['sensitivity'],
                account['consumer_key'],
                account['consumer_secret'],
                account['access_token_key'],
                account['access_token_secret'],
            )
            threads.append(
                Crawler(f'{search["name"]}-{index}', search['keywords'], es, driver))

    for worker in threads:
        worker.start()
    for worker in threads:
        worker.join()
Esempio n. 3
0
 def __init__(self, coursecode, connection, cycle, term, requisitiontype):
     """Record the reader's configuration and create a fresh Crawler."""
     self.connection = connection
     self.coursecode = coursecode
     self.cycle = cycle
     self.term = term
     self.requisitiontype = requisitiontype
     self.crawler = Crawler()
Esempio n. 4
0
 def __init__(self):
     """Set up DB connection, crawler and an OfferReader; state starts unset.

     NOTE(review): OfferReader is constructed while ``cycle`` and
     ``timeperiod`` are still None — presumably callers assign them later
     and/or rebuild the reader; confirm against OfferReader's contract.
     """
     self.connection = MySQLConnection()
     self.timeperiod = None
     self.cycle = None
     self.coursereader = None
     self.crawler = Crawler()
     # Removed a dead ``self.offerreader = None`` that was immediately
     # overwritten by the assignment below.
     self.offerreader = OfferReader(self.connection, self.cycle,
                                    self.timeperiod)
Esempio n. 5
0
 def testItReturnsTheCrawledUrls(self):
     """crawl() should return exactly the URLs that were queued, in order."""
     crawler = Crawler()
     urlsToCrawl = ['http://google.se', 'http://aftonbladet.se']
     for url in urlsToCrawl:
         crawler.add_to_crawl(url)
     result = crawler.crawl()
     # assertEquals is a deprecated alias of assertEqual (removed in
     # Python 3.12) — use the canonical name.
     self.assertEqual(
         urlsToCrawl, result,
         'Not all urls that was supposed to be crawled was crawled.')
Esempio n. 6
0
    def test_crawler_recurses(self):
        # Arrange
        html = """
<html><body><a href="http://testurl.com/testpage.html">Link text</a></body></html>
"""
        initial_url = 'http://www.initialurl.com/'

        mock_urllib = create_autospec(urllib2)
        crawler = Crawler(mock_urllib)

        # Act
        crawler.crawl([initial_url])

        # Assert
        expected_calls = [
            call.urlopen(initial_url),
            call.urlopen('http://testurl.com/testpage.html')
        ]
        mock_urllib.assert_has_calls(expected_calls)
Esempio n. 7
0
 def __init__(self, connection, cycle, timeperiod):
     """Keep the given connection and scheduling info; no course selected yet."""
     self.cycle = cycle
     self.connection = connection
     self.timeperiod = timeperiod
     self.course = None
     self.crawler = Crawler()
Esempio n. 8
0
 def __init__(self):
     """Start with all collaborators unset except a freshly created Crawler."""
     self.faculty = None
     self.connection = None
     self.timeperiod = None
     self.crawler = Crawler()
Esempio n. 9
0
    def compare_pages(self):
        """Deploy both revisions and persist per-element diffs for matching pages.

        Pages from ``self.old`` and ``self.new`` are matched by URL; for each
        match the base contents are compared (unless a PageDiff already
        exists) and every differing element is screenshotted on both
        revisions. The same is then done for action-reachable states matched
        by element and action type.

        Returns True on completion; returns None early when either revision
        fails to deploy (both are falsy, so truthiness checks by callers keep
        working).
        """
        success = self.setup_project(self.old.hash)
        if not success:
            print(f"failed diff deploying {self.old.hash}")
            return
        success = self.setup_project(self.new.hash)
        if not success:
            print(f"failed diff deploying {self.new.hash}")
            return

        for old in self.old.pages:
            for new in self.new.pages:
                if new.url == old.url:
                    print("diff pages: ", new.url, old.url)
                    # Crawlers are created lazily; the fallbacks below ensure
                    # both exist before the action-diff phase.
                    old_crawler = None
                    new_crawler = None
                    old_content = old.contents[0].content if len(
                        old.contents) > 0 else ""
                    new_content = new.contents[0].content if len(
                        new.contents) > 0 else ""

                    # Skip the base-content diff if it was already recorded.
                    exists = PageDiff.exists(self.db, old.id, new.id)

                    if not exists:
                        compare_result = Compare.compare(
                            old_content, new_content)
                        if compare_result:
                            PageDiff.get_or_create(self.db, old.id, new.id,
                                                   compare_result)

                            element_diff = Compare.extract_differences(
                                compare_result)

                            if element_diff:
                                old_crawler = Crawler(
                                    Page.get_or_create(
                                        self.db,
                                        self.project_config.project_name,
                                        self.old.hash,
                                        Url.clean_url(Constants.DOCKER_URL)),
                                    self.projects[self.old.hash].port)
                                new_crawler = Crawler(
                                    Page.get_or_create(
                                        self.db,
                                        self.project_config.project_name,
                                        self.new.hash,
                                        Url.clean_url(Constants.DOCKER_URL)),
                                    self.projects[self.new.hash].port)

                                old_crawler.visit_page(old,
                                                       self.project_config)
                                new_crawler.visit_page(new,
                                                       self.project_config)

                                for element in element_diff:
                                    old_diff = old_crawler.screenshot(element)
                                    new_diff = new_crawler.screenshot(element)
                                    DbDiff.get_or_create(
                                        self.db, old.id, new.id, element,
                                        old_diff[1], new_diff[1], old_diff[0],
                                        new_diff[0])

                    # Ensure both crawlers exist even when the base-content
                    # phase above was skipped.
                    if old_crawler is None:
                        old_crawler = Crawler(
                            Page.get_or_create(
                                self.db, self.project_config.project_name,
                                self.old.hash,
                                Url.clean_url(Constants.DOCKER_URL)),
                            self.projects[self.old.hash].port)
                    if new_crawler is None:
                        new_crawler = Crawler(
                            Page.get_or_create(
                                self.db, self.project_config.project_name,
                                self.new.hash,
                                Url.clean_url(Constants.DOCKER_URL)),
                            self.projects[self.new.hash].port)

                    visited = False
                    # NOTE(review): window sizes 1000 vs 10 are asymmetric —
                    # presumably tuned for memory; confirm intent.
                    for old_action in self.windowed_query(
                            old.actions, Action.id, 1000):
                        for new_action in self.windowed_query(
                                new.actions, Action.id, 10):
                            if old_action.element == new_action.element and old_action.type == new_action.type:
                                old_action_content = old_action.content
                                new_action_content = new_action.content

                                # Only diff action states that differ from the
                                # base page content on at least one side.
                                if old_content != old_action_content or new_content != new_action_content:
                                    exists = DbDiff.exists(
                                        self.db, old.id, new.id, new_action.id)
                                    if not exists:
                                        compare_result = Compare.compare(
                                            old_action_content,
                                            new_action_content)
                                        if compare_result:
                                            element_diff = Compare.extract_differences(
                                                compare_result)

                                            if element_diff:
                                                old_crawler.visit_and_action(
                                                    old, old_action,
                                                    self.project_config)
                                                # BUGFIX: previously passed
                                                # ``old`` here; the new
                                                # crawler must replay the
                                                # action on the NEW page,
                                                # mirroring the old side.
                                                new_crawler.visit_and_action(
                                                    new, new_action,
                                                    self.project_config)
                                                for element in element_diff:
                                                    old_diff = old_crawler.screenshot(
                                                        element)
                                                    new_diff = new_crawler.screenshot(
                                                        element)
                                                    DbDiff.get_or_create(
                                                        self.db, old.id,
                                                        new.id, element,
                                                        old_diff[1],
                                                        new_diff[1],
                                                        old_diff[0],
                                                        new_diff[0],
                                                        new_action.id)

        return True
Esempio n. 10
0
 def testItCanCrawl(self):
     """Smoke test: a crawler with one queued URL completes a crawl."""
     subject = Crawler()
     subject.add_to_crawl('http://google.se')
     subject.crawl()