def _parse_link_from_db(self, link):
    url = Url(link['url']['url'])
    url.abstract_url = link['url']['abstract_url']
    url.depth_of_finding = link['url']['depth_of_finding']
    result = Link(url, link['dom_address'], link['html_id'], link['html_class'])
    return result

def __runDetector(self, id, url, is_new):
    craigslistSite = requests.get(url)
    search = bs4.BeautifulSoup(craigslistSite.text, 'html.parser')
    links = search.find_all("a")
    matchFound = False
    for link in links:
        textFormat = str(link)
        if "data-id=" not in textFormat or "sandiego" not in textFormat:
            continue
        matchFound = True
        # Slice the href value out of the serialized anchor tag.
        beginningIndex = textFormat.find("href=") + 6
        endingIndex = textFormat.find(".html") + 5
        hyperlink = textFormat[beginningIndex:endingIndex]
        if is_new:
            inputData = {
                "hyperlink": hyperlink,
                "keywords": id,
                "is_deleted": False,
            }
            # Skip hyperlinks already stored for these keywords.
            if Url.find_url_by_hyperlink(inputData['hyperlink'], inputData['keywords']):
                continue
            Url(**inputData).save()
    if not matchFound:
        # Nothing matched yet: re-schedule this detector to run again later.
        DetectorManager.__job_scheduler.add_job(
            self.__runDetector, 'date', run_date=self.__getRuntime(),
            args=(id, url, is_new))

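# A sketch of the same link filtering using BeautifulSoup attribute access
# instead of slicing str(link). The attribute names (data-id, href) are taken
# from the string checks above; extract_listing_links is a hypothetical helper,
# not part of the original detector.
def extract_listing_links(soup):
    """Return hrefs of anchors (in a bs4.BeautifulSoup) that carry a data-id
    and point at sandiego."""
    return [
        a["href"]
        for a in soup.find_all("a")
        if a.get("data-id") and "sandiego" in a.get("href", "")
    ]
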
def get_asyncrequest_to_id(self, current_session, async_id):
    raw_data = self.async_requests.find_one({
        "session": current_session,
        "_id": async_id
    })
    if raw_data is None:
        return None
    raw_structure = self.async_request_structure.find_one({
        "session": current_session,
        "request_hash": raw_data['request_hash']
    })
    structure = AsyncRequestStructure(raw_structure['request_hash'],
                                      raw_structure['parameters'])
    url = Url(raw_data['url']['url'])
    url.abstract_url = raw_data['url']['abstract_url']
    if "event" in raw_data:
        tmp = TimingRequest(raw_data['method'], url, None, raw_data['event'],
                            parameters=raw_data['parameters'])
    else:
        trigger = self.clickables.find_one({"_id": raw_data['trigger']})
        trigger = self._parse_clickable_from_db_to_model(trigger)
        tmp = AjaxRequest(raw_data['method'], url, trigger,
                          parameters=raw_data['parameters'])
    tmp.request_structure = structure
    return tmp

def _parse_form_from_db(self, form):
    parameters = []
    for p in form['parameters']:
        form_input = FormInput(p['tag'], p['name'], p['input_type'], p['values'])
        parameters.append(form_input)
    action = Url(form['action']['url'])
    action.abstract_url = form['action']['abstract_url']
    return HtmlForm(parameters, action, form['method'], form['dom_address'])

def post(self):
    data = ApiUrl.parser.parse_args()
    url = Url(data['login_url'], data['visit_url'])
    try:
        url.save_to_db()
    except Exception:
        return {'message': 'Failed to save'}, 500
    return url.json(), 201

def test_url_set(self):
    url1 = Url(TEST_URL1, depth_of_finding=3)
    url2 = Url(TEST_URL2, depth_of_finding=25)
    self.database.insert_url_into_db(SESSION, url1)
    self.assertEqual(self.database.urls.count(), 1)
    self.database.insert_url_into_db(SESSION, url1)
    self.assertEqual(self.database.urls.count(), 1)
    self.database.insert_url_into_db(SESSION, url2)
    self.assertEqual(self.database.urls.count(), 2)

def _url_collector(entry_url):
    url = Url(entry_url)
    url_list = url.get_urls_from_entry_url()
    if url_list is None:
        return None
    data = {'entry_url': url.url, 'url_list': url_list}
    data = db.create_document(data)
    return data

def handle_url(self, new_url, requested_url):
    if not isinstance(new_url, Url):
        new_url = Url(new_url)
    if requested_url is not None:
        if not isinstance(requested_url, Url):
            requested_url = Url(requested_url)
        new_url.abstract_url = self.calculate_abstract_url(new_url, requested_url)
    if not self.database_manager.url_exists(new_url):
        new_url.url_structure = self.calculate_url_structure(new_url)
    return new_url

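# Usage sketch for handle_url (the handler instance and the abstract-URL
# format shown are hypothetical): plain strings are wrapped into Url objects,
# the abstract URL is derived only when the referring page is known, and
# url_structure is computed only for URLs not yet in the database.
#
#   new = handler.handle_url("http://shop.example/item?id=7",
#                            "http://shop.example/catalog")
#   new.abstract_url   # e.g. "http://shop.example/item?id=*"
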
def test_url_visit(self):
    url1 = Url(TEST_URL1, depth_of_finding=3)
    url2 = Url(TEST_URL2, depth_of_finding=25)
    self.database.insert_url_into_db(SESSION, url1)
    self.database.insert_url_into_db(SESSION, url2)
    url3 = self.database.get_next_url_for_crawling(SESSION)
    self.database.visit_url(SESSION, url3, 25, 200)
    url4 = self.database.get_next_url_for_crawling(SESSION)
    self.assertEqual(url1, url3)
    self.assertEqual(url2, url4)

def post_url():
    data = request.get_json()
    url = Url(user_id=get_user(), **data)
    db.session.add(url)
    db.session.commit()
    return jsonify(urls=[str(url)])

def put(self, id):
    data = ApiUrl.parser.parse_args()
    url = Url.find_by_id(id)
    if url is None:
        # Upsert: create the record when the id is unknown.
        url = Url(data['login_url'], data['visit_url'])
    else:
        url.login_url = data['login_url']
        url.visit_url = data['visit_url']
    url.save_to_db()
    return url.json()

def get(self):
    inputData = UrlController.__getParser.parse_args()
    urls = Url.find_url_by_keywords(inputData['keywords'])
    return {'urls': [url.json() for url in urls]}

def bfs(starting_url):
    queue = [starting_url]
    while queue:
        url_string = queue.pop(0)
        try:
            if not is_url_visited(url_string):
                url = Url(url_string=url_string)
                add_entry_to_db(url)
                domain = create_domain(url_string=url_string)
                add_entry_to_db(domain)
                queue.extend(get_urls_and_add_server(url_string))
        except Exception:
            # Skip pages that fail to fetch or parse; keep crawling.
            continue

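# A minimal, self-contained sketch of the same BFS loop using
# collections.deque: list.pop(0) is O(n), deque.popleft() is O(1). The
# visited set stands in for is_url_visited/add_entry_to_db, and fetch_links
# is a hypothetical stand-in for get_urls_and_add_server.
from collections import deque

def bfs_sketch(starting_url, fetch_links):
    visited = set()
    queue = deque([starting_url])
    while queue:
        url_string = queue.popleft()
        if url_string in visited:
            continue
        visited.add(url_string)
        try:
            queue.extend(fetch_links(url_string))
        except Exception:
            continue  # skip unreachable pages, keep crawling

# Example: bfs_sketch('https://example.com', lambda u: [])
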
def parse_row(row):
    if len(row) > 0:
        arguments = row.split(',')
        # arguments are:
        #   arguments[0] = latitude
        #   arguments[1] = longitude
        #   arguments[2] = list of urls
        arguments[2] = arguments[2][1:-1]  # strip the surrounding brackets
        urls = arguments[2].split('|')
        return Url(arguments[0], arguments[1], urls)
    else:
        return None

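# Worked example for parse_row on a hypothetical CSV row (format inferred
# from the slicing above: bracket-wrapped, pipe-separated URL list):
#   parse_row('32.71,-117.16,[https://a.example|https://b.example]')
#   -> Url('32.71', '-117.16', ['https://a.example', 'https://b.example'])
# Note that a comma inside any field would break the plain split(',').
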
def add_webpage_to_cluster(self, webpage):
    url = Url(webpage.url)
    clusters = self._persistence_manager.get_clusters(url.url_hash)
    if clusters is None:
        self._persistence_manager.write_clusters(url.url_hash, [webpage.id])
    else:
        # Flatten the stored clusters into one list of page ids.
        tmp = []
        for c in clusters:
            if isinstance(c, list):
                tmp.extend(c)
            else:
                tmp.append(c)
        tmp.append(webpage.id)
        new_clusters = self.hierarchical_clustering(tmp, CLUSTER_THRESHOLD)
        # Wrap bare integers in lists so Mongo stores every singleton cluster
        # as its own list. Building a new list avoids mutating new_clusters
        # while iterating over it, which would skip elements.
        new_clusters = [[c] if isinstance(c, int) else c for c in new_clusters]
        self._persistence_manager.write_clusters(url.url_hash, new_clusters)

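# A standalone check of the singleton normalization used above, on
# hypothetical cluster data:
clusters_before = [[1, 2], 3, [4]]
clusters_after = [[c] if isinstance(c, int) else c for c in clusters_before]
assert clusters_after == [[1, 2], [3], [4]]
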
def attack_single_url(self, url, replacement=False):
    if not replacement:
        attack_url = url
        result, response_code = self._xss.attack(attack_url, "123")
        logging.debug("Result: {}".format(result))
        return
    url = Url(url)
    for parameter_to_attack in url.parameters:
        for vector in self._xss_vector.attack_vectors:
            attack_url = url.scheme + "://" + url.domain + url.path + "?"
            random_val = self._xss_vector.random_number_generator(12)
            for other_parameters in url.parameters:
                if parameter_to_attack == other_parameters:
                    # Inject the vector, replacing its XSS marker with a
                    # random token so reflections can be detected.
                    attack_url += other_parameters + "=" + vector.replace(
                        "XSS", random_val) + "&"
                else:
                    attack_url += other_parameters + "=" + url.parameters[
                        other_parameters][0] + "&"
            attack_url = attack_url[:-1]  # Remove the trailing "&"
            logging.debug("Attack with: {}".format(attack_url))
            result, response_code = self._xss.attack(attack_url, random_val)
            logging.debug("Result: {}".format(result))

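# A sketch of the same parameter substitution using urllib.parse.urlencode,
# which handles encoding edge cases the string concatenation above does not
# (note it percent-encodes the payload, which may or may not be wanted for a
# given attack). build_attack_url and its arguments are hypothetical names
# mirroring the loop variables above; params has the {name: [first_value, ...]}
# shape of url.parameters.
from urllib.parse import urlencode

def build_attack_url(base, params, target, vector, random_val):
    attacked = {
        name: vector.replace("XSS", random_val) if name == target else values[0]
        for name, values in params.items()
    }
    return base + "?" + urlencode(attacked)

# build_attack_url("http://example.com/search",
#                  {"q": ["test"], "page": ["1"]}, "q",
#                  "<script>alert('XSS')</script>", "123456789012")
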
def createUrl(self, url):
    result = Url(url)
    self.session.add(result)
    self.session.commit()
    self._actions.append(lambda: self._urlCreated(url))
    return result

def delete(self):
    inputData = UrlController.__deleteParser.parse_args()
    url = Url.find_url_by_id(inputData['id'])
    if url is None:
        return {"message": "Posting not found."}, 404
    url.deleteUrl()
    return {"message": "Posting deleted successfully."}, 200

def getUrl(self):
    return Url.find_by_id(self.url_id)

def get(self, id=None):
    url = Url.find_by_id(id)
    if url:
        return url.json()
    return {'message': 'Logindata not found'}, 404

def delete(self, id):
    url = Url.find_by_id(id)
    if url:
        url.delete_from_db()

def test_url_set_and_get(self):
    url = Url(TEST_URL1, depth_of_finding=3)
    self.database.insert_url_into_db(SESSION, url)
    url2 = self.database.get_next_url_for_crawling(SESSION)
    self.assertEqual(url, url2)
    self.assertEqual(url2.depth_of_finding, 3)

def get(self):
    urls = Url.find_all()
    return {'urls': [url.json() for url in urls]}

def _parse_url_from_db(self, url):
    result = Url(url['url'], url['depth_of_finding'])
    result.parameters = self.unescape_unloved_signs(result.parameters)
    result.abstract_url = url['abstract_url']
    return result

from unittest import TestCase

import mock

from models.url import Url

url = Url('www.google.com.br')


class MockResponse:
    """Stand-in for requests.Response; text is a property, as on the real object."""

    def __init__(self, json_data, status_code):
        self.json_data = json_data
        self.status_code = status_code

    def json(self):
        return self.json_data

    @property
    def text(self):
        return self.json_data


class UrlTests(TestCase):
    @staticmethod
    def test_set_url_with_sanitize():
        assert url.url == 'http://www.google.com.br'

    @mock.patch('requests.get')
    def test_get_urls_from_entry_url_mock_requests(self, mock_get):
        mock_resp = MockResponse(
            '<html><a href="http://www.google.com">Google</a>'
            '<a href="https://gist.github.com">github</a></html>',
            200)
        mock_get.return_value = mock_resp
        # Assumes get_urls_from_entry_url returns the href values found on the page.
        url_list = url.get_urls_from_entry_url()
        assert 'http://www.google.com' in url_list