def __init__(self):
    """Run a screenshot-diff pass over consecutive example revisions.

    For each adjacent (old, new) pair in ``self.examples`` — presumably
    (hash, url) tuples, TODO confirm against the class definition — this
    crawls both revisions, diffs their page content and their matching
    actions, and stores element-level screenshot diffs in the database.

    NOTE(review): original formatting was lost; statement grouping below
    (in particular the trailing url resets and commit living inside the
    loop) is reconstructed from the variables they reference — confirm.
    """
    db = get_session()
    # Start from a clean database on every run.
    truncate_db(db)
    # Hard-coded absolute Windows paths — this looks like a local test
    # harness, not production code.
    config = ProjectConfig("A:/Development/magistrska/DependencyDiff/configs/fri.json", "A:/Development/magistrska/DependencyDiff/configs/fri.results.json")
    # Pair each example with its successor: (e0, e1), (e1, e2), ...
    for old, new in zip(self.examples, self.examples[1::]):
        self.old_hash, self.old_url = old
        self.new_hash, self.new_url = new
        # The old page is created with a fixed URL, then repointed at the
        # example's URL; the fixed URL is restored at the end of the loop.
        old_page = Page.get_or_create(db, self.project_name, self.old_hash, "https://www.fri.uni-lj.si/en/")
        old_page.url = self.old_url
        new_page = Page.get_or_create(db, self.project_name, self.new_hash, self.new_url)
        # Port is passed as a string — presumably Crawler expects that; verify.
        old_crawler = Crawler(old_page, "443")
        new_crawler = Crawler(new_page, "443")
        old_crawler.get_page(db, old_page, config, self.old_hash)
        new_crawler.get_page(db, new_page, config, self.new_hash)
        # First stored content blob for each page, or "" when none exists.
        old_content = old_page.contents[0].content if len(old_page.contents) > 0 else ""
        new_content = new_page.contents[0].content if len(new_page.contents) > 0 else ""
        # Page-level diff: screenshot every differing element on both sides.
        compare_result = Compare.compare(old_content, new_content)
        if compare_result:
            element_diff = Compare.extract_differences(compare_result)
            for element in element_diff:
                old_diff = old_crawler.screenshot(element)
                new_diff = new_crawler.screenshot(element)
                DbDiff.get_or_create(db, old_page.id, new_page.id, element, old_diff[1], new_diff[1], old_diff[0], new_diff[0])
        # Action-level diff: match actions by element and type across pages.
        for old_action in old_page.actions:
            for new_action in new_page.actions:
                if old_action.element == new_action.element and old_action.type == new_action.type:
                    old_action_content = old_action.content
                    new_action_content = new_action.content
                    # Skip actions whose content is identical to the base page
                    # content on both sides — nothing new to diff.
                    if old_content != old_action_content or new_content != new_action_content:
                        compare_result = Compare.compare(old_action_content, new_action_content)
                        if compare_result:
                            element_diff = Compare.extract_differences(compare_result)
                            if element_diff:
                                # Replay the action on each side before
                                # taking element screenshots.
                                old_crawler.visit_and_action(old_page, old_action, config)
                                new_crawler.visit_and_action(new_page, new_action, config)
                                for element in element_diff:
                                    old_diff = old_crawler.screenshot(element)
                                    new_diff = new_crawler.screenshot(element)
                                    DbDiff.get_or_create(db, old_page.id, new_page.id, element, old_diff[1], new_diff[1], old_diff[0], new_diff[0], new_action.id)
        # Restore the canonical URL on both pages before committing.
        old_page.url = "https://www.fri.uni-lj.si/en/"
        new_page.url = "https://www.fri.uni-lj.si/en/"
        db.commit()
def start():
    """Spawn one Crawler thread per configured Twitter account and wait for all.

    Reads connection settings and search definitions from the module-level
    ``config`` mapping: builds a shared Elasticsearch connection, then for
    each search creates an ElasticDriver plus one TwitterDriver/Crawler pair
    per Twitter account. All threads are started before any is joined, so
    they run concurrently; the call blocks until every crawler finishes.
    """
    es = create_es_connection(
        config['database']['host'],
        config['database']['port'],
        config['database']['access_key'],
        config['database']['secret_key'],
        config['database']['region'])
    threads = []
    for search in config['searches']:
        ed = ElasticDriver(es, search['name'])
        # enumerate(..., start=1) replaces the manual `i = 1 ... i += 1`
        # counter; thread names stay "<search name>-<1-based index>".
        for i, twitterAccount in enumerate(search['twitterAccounts'], start=1):
            td = TwitterDriver(
                search['keywords'],
                ed,
                search['sensitivity'],
                twitterAccount['consumer_key'],
                twitterAccount['consumer_secret'],
                twitterAccount['access_token_key'],
                twitterAccount['access_token_secret']
            )
            threads.append(Crawler(f'{search["name"]}-{i}', search['keywords'], es, td))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
def __init__(self, coursecode, connection, cycle, term, requisitiontype):
    """Capture the course lookup context and build the crawler it will use."""
    self.coursecode = coursecode
    self.connection = connection
    self.cycle = cycle
    self.term = term
    self.requisitiontype = requisitiontype
    # Each instance owns its own crawler.
    self.crawler = Crawler()
def __init__(self):
    """Open a MySQL connection and set up (mostly empty) collaborators.

    ``cycle`` and ``timeperiod`` are still None when handed to OfferReader,
    so the reader starts unconfigured until they are assigned elsewhere.
    """
    self.connection = MySQLConnection()
    self.timeperiod = None
    self.cycle = None
    self.coursereader = None
    self.crawler = Crawler()
    # Fixed: offerreader was assigned None and then immediately overwritten;
    # keep only the real assignment.
    self.offerreader = OfferReader(self.connection, self.cycle, self.timeperiod)
def testItReturnsTheCrawledUrls(self):
    """Crawling must report exactly the queued URLs, in order."""
    crawler = Crawler()
    urlsToCrawl = ['http://google.se', 'http://aftonbladet.se']
    for url in urlsToCrawl:
        crawler.add_to_crawl(url)
    result = crawler.crawl()
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # assertEqual is the supported spelling.
    self.assertEqual(
        urlsToCrawl,
        result,
        'Not all urls that was supposed to be crawled was crawled.')
def test_crawler_recurses(self):
    """The crawler should follow links discovered on fetched pages."""
    # Arrange: a page whose body holds a single outgoing link.
    page_html = """
    <html><body><a href="http://testurl.com/testpage.html">Link text</a></body></html>
    """
    start_url = 'http://www.initialurl.com/'
    urllib_mock = create_autospec(urllib2)
    subject = Crawler(urllib_mock)

    # Act
    subject.crawl([start_url])

    # Assert: both the seed URL and the linked URL were opened.
    urllib_mock.assert_has_calls([
        call.urlopen(start_url),
        call.urlopen('http://testurl.com/testpage.html')
    ])
def __init__(self, connection, cycle, timeperiod):
    """Hold the shared connection and period context; no course loaded yet."""
    self.connection = connection
    self.cycle = cycle
    self.timeperiod = timeperiod
    # Fresh crawler per instance; the current course starts unset.
    self.crawler = Crawler()
    self.course = None
def __init__(self):
    """Start with an empty context; only the crawler is built eagerly."""
    # Connection, period and faculty are injected/assigned later.
    self.crawler = Crawler()
    self.connection = None
    self.timeperiod = None
    self.faculty = None
def compare_pages(self):
    """Diff every URL-matched page pair between the old and new revisions.

    Deploys both revisions, then for each pair of pages sharing a URL:
    records a page-level diff (with per-element screenshots) when the page
    content differs and no PageDiff exists yet, and an action-level diff for
    every element/type-matched action pair whose content differs from the
    base page content.

    Returns:
        True on completion; None when either deployment fails (a message is
        printed in that case).
    """

    def make_crawler(revision_hash):
        # One crawler per revision, pointed at that revision's docker port.
        # Extracted: this 6-line construction was duplicated four times.
        return Crawler(
            Page.get_or_create(
                self.db,
                self.project_config.project_name,
                revision_hash,
                Url.clean_url(Constants.DOCKER_URL)),
            self.projects[revision_hash].port)

    success = self.setup_project(self.old.hash)
    if not success:
        print(f"failed diff deploying {self.old.hash}")
        return
    success = self.setup_project(self.new.hash)
    if not success:
        print(f"failed diff deploying {self.new.hash}")
        return
    for old in self.old.pages:
        for new in self.new.pages:
            if new.url != old.url:
                continue
            print("diff pages: ", new.url, old.url)
            # Crawlers are created lazily: only when a diff actually needs
            # screenshots, and reused for the action pass below.
            old_crawler = None
            new_crawler = None
            old_content = old.contents[0].content if len(old.contents) > 0 else ""
            new_content = new.contents[0].content if len(new.contents) > 0 else ""
            # Page-level diff, skipped when already recorded.
            if not PageDiff.exists(self.db, old.id, new.id):
                compare_result = Compare.compare(old_content, new_content)
                if compare_result:
                    PageDiff.get_or_create(self.db, old.id, new.id, compare_result)
                    element_diff = Compare.extract_differences(compare_result)
                    if element_diff:
                        old_crawler = make_crawler(self.old.hash)
                        new_crawler = make_crawler(self.new.hash)
                        old_crawler.visit_page(old, self.project_config)
                        new_crawler.visit_page(new, self.project_config)
                        for element in element_diff:
                            old_diff = old_crawler.screenshot(element)
                            new_diff = new_crawler.screenshot(element)
                            DbDiff.get_or_create(
                                self.db, old.id, new.id, element,
                                old_diff[1], new_diff[1],
                                old_diff[0], new_diff[0])
            # The action pass always needs both crawlers.
            if old_crawler is None:
                old_crawler = make_crawler(self.old.hash)
            if new_crawler is None:
                new_crawler = make_crawler(self.new.hash)
            # NOTE: window sizes 1000 vs 10 reproduced from the original —
            # confirm the asymmetry is intentional. (An unused
            # `visited = False` local was removed.)
            for old_action in self.windowed_query(old.actions, Action.id, 1000):
                for new_action in self.windowed_query(new.actions, Action.id, 10):
                    # Guard clauses replace the original nested-if pyramid.
                    if old_action.element != new_action.element or old_action.type != new_action.type:
                        continue
                    old_action_content = old_action.content
                    new_action_content = new_action.content
                    # Skip actions identical to the base page content on both sides.
                    if old_content == old_action_content and new_content == new_action_content:
                        continue
                    if DbDiff.exists(self.db, old.id, new.id, new_action.id):
                        continue
                    compare_result = Compare.compare(old_action_content, new_action_content)
                    if not compare_result:
                        continue
                    element_diff = Compare.extract_differences(compare_result)
                    if not element_diff:
                        continue
                    old_crawler.visit_and_action(old, old_action, self.project_config)
                    # Fixed: the new crawler previously replayed the action on
                    # the *old* page; the sibling example runner uses the new
                    # page here, which matches the screenshot pairing below.
                    new_crawler.visit_and_action(new, new_action, self.project_config)
                    for element in element_diff:
                        old_diff = old_crawler.screenshot(element)
                        new_diff = new_crawler.screenshot(element)
                        DbDiff.get_or_create(
                            self.db, old.id, new.id, element,
                            old_diff[1], new_diff[1],
                            old_diff[0], new_diff[0], new_action.id)
    return True
def testItCanCrawl(self):
    """Smoke test: crawling a single queued URL must not raise."""
    subject = Crawler()
    subject.add_to_crawl('http://google.se')
    subject.crawl()