def check_conn(ip="localhost", port=27017):
    """Report whether a MongoDB connection to ``ip:port`` can be opened.

    ``port`` is coerced with ``int()``; a value that cannot be converted is
    treated like any other connection failure.

    Returns ``True`` when the connection was opened (it is closed again
    before returning) and ``False`` when anything went wrong; the error is
    logged in that case.
    """
    try:
        probe = Connection(host=ip, port=int(port))
    except Exception as err:
        logging.error(err)
        return False
    # Only reached when the connection succeeded; release it right away.
    probe.close()
    return True
class MongoDBStore(SimpleStore):

    '''MongoDB-based object storage frontend.

    Every entry is stored as one document of the form
    ``{'key': <key>, 'value': Binary(<dumped value>)}`` in the collection
    named by the engine URL (``mongodb://host:port/<database>/<collection>``).
    '''

    init = 'mongodb://'

    def __init__(self, engine, **kw):
        '''Open the connection and collection described by ``engine``.'''
        super(MongoDBStore, self).__init__(engine, **kw)
        spliturl = urlsplit(engine)
        # The URL path must be exactly "/<database>/<collection>".
        _, dbpath, self._colpath = spliturl.path.split('/')
        self._conn = Connection(host=spliturl.hostname, port=spliturl.port)
        self._db = getattr(self._conn, dbpath)
        self._store = getattr(self._db, self._colpath)
        self._store.ensure_index('key', unique=True)

    def __getitem__(self, key):
        try:
            return self.loads(
                self._store.find_one(dict(key=key))['value'], key
            )
        except TypeError:
            # find_one() returned None, i.e. there is no such key.
            raise KeyError(key)

    def __setitem__(self, key, value):
        self._store.save(dict(key=key, value=Binary(self.dumps(value))))

    def __delitem__(self, key):
        self._store.remove(dict(key=key))

    def __len__(self):
        return self._store.count()

    def __iter__(self):
        for key in self._store.find(
            dict(key={'$exists': True}), fields=['key'],
        ):
            yield key['key']

    def close(self):
        '''Close the underlying MongoDB connection.'''
        self._conn.close()

    def clear(self):
        '''Drop the backing collection and re-bind the collection handle.'''
        self._store.drop()
        self._store = getattr(self._db, self._colpath)

    def items(self):
        '''Yield ``(key, stored value)`` pairs.'''
        # NOTE(review): the value is yielded exactly as stored, without going
        # through self.loads() as __getitem__ does -- confirm this is intended.
        for key in self._store.find({'key': {'$exists': True}}):
            yield (key['key'], key['value'])

    def values(self):
        '''Yield every stored value, deserialized with ``self.loads``.'''
        loads = self.loads
        # distinct('key') only returns the key values, not the documents, so
        # the stored payload has to be read from the documents themselves.
        for doc in self._store.find({'key': {'$exists': True}}):
            yield loads(doc['value'], doc['key'])
class MongoDBStore(SimpleStore):

    '''MongoDB-based object storage frontend.

    Every entry is stored as one document of the form
    ``{'key': <key>, 'value': Binary(<dumped value>)}`` in the collection
    named by the engine URL (``mongodb://host:port/<database>/<collection>``).
    '''

    init = 'mongodb://'

    def __init__(self, engine, **kw):
        '''Open the connection and collection described by ``engine``.'''
        super(MongoDBStore, self).__init__(engine, **kw)
        spliturl = urlsplit(engine)
        # The URL path must be exactly "/<database>/<collection>".
        _, dbpath, self._colpath = spliturl.path.split('/')
        self._conn = Connection(host=spliturl.hostname, port=spliturl.port)
        self._db = getattr(self._conn, dbpath)
        self._store = getattr(self._db, self._colpath)
        self._store.ensure_index('key', unique=True)

    def __getitem__(self, key):
        try:
            return self.loads(self._store.find_one(dict(key=key))['value'])
        except TypeError:
            # find_one() returned None, i.e. there is no such key.
            raise KeyError(key)

    def __setitem__(self, key, value):
        self._store.save(dict(key=key, value=Binary(self.dumps(value))))

    def __delitem__(self, key):
        self._store.remove(dict(key=key))

    def __len__(self):
        return self._store.count()

    def __iter__(self):
        for key in self._store.find(
            dict(key={'$exists': True}), fields=['key'],
        ):
            yield key['key']

    def close(self):
        '''Close the underlying MongoDB connection.'''
        self._conn.close()

    def clear(self):
        '''Drop the backing collection and re-bind the collection handle.'''
        self._store.drop()
        self._store = getattr(self._db, self._colpath)

    def items(self):
        '''Yield ``(key, stored value)`` pairs.'''
        # NOTE(review): the value is yielded exactly as stored, without going
        # through self.loads() as __getitem__ does -- confirm this is intended.
        for key in self._store.find({'key': {'$exists': True}}):
            yield (key['key'], key['value'])

    def values(self):
        '''Yield every stored value, deserialized with ``self.loads``.'''
        loads = self.loads
        # distinct('key') only returns the key values, not the documents, so
        # the stored payload has to be read from the documents themselves.
        for doc in self._store.find({'key': {'$exists': True}}):
            yield loads(doc['value'])
def init_messaging_agents():
    """Create the APNs and GCM clients for every registered application.

    Blocks until a MongoDB connection can be opened (retrying every five
    seconds), reads all documents of the ``applications`` collection in the
    master database and builds the clients described by each document.

    Returns:
        A tuple ``(apnsconns, httpconns)``; both are dicts mapping the
        application's ``shortname`` to a (possibly empty) list of clients.

    Raises:
        KeyError: if an application document lacks ``shortname`` or
            ``connections``.
        ValueError: if ``connections`` cannot be converted to an int.
    """
    mongodb = None
    while not mongodb:
        try:
            mongodb = Connection(options.mongohost, options.mongoport)
        except Exception as ex:
            logging.error(ex)
            # wait 5 secs to reconnect
            time.sleep(5)

    httpconns = {}
    apnsconns = {}
    try:
        masterdb = mongodb[options.masterdb]
        apps = masterdb.applications.find()
        for app in apps:
            # Every app is keyed by its shortname, so it is mandatory.
            shortname = app['shortname']

            # APNs setup: open at least one connection per application.
            apnsconns[shortname] = []
            conns = max(int(app['connections']), 1)
            if 'environment' not in app:
                app['environment'] = 'sandbox'
            if 'certfile' in app and 'keyfile' in app:
                for instanceid in range(conns):
                    try:
                        apn = APNClient(app['environment'], app['certfile'],
                                        app['keyfile'], shortname, instanceid)
                    except Exception as ex:
                        # One broken instance must not stop the others.
                        logging.error(ex)
                        continue
                    apnsconns[shortname].append(apn)

            # GCMClient setup
            httpconns[shortname] = []
            if 'gcmprojectnumber' in app and 'gcmapikey' in app:
                try:
                    http = GCMClient(app['gcmprojectnumber'], app['gcmapikey'],
                                     shortname, 0)
                except Exception as ex:
                    logging.error(ex)
                else:
                    httpconns[shortname].append(http)
    finally:
        # Release the connection even when an application document is broken.
        mongodb.close()
    return apnsconns, httpconns
def init_messaging_agents():
    """Create the APNs and GCM clients for every registered application.

    Blocks until a MongoDB connection can be opened (retrying every five
    seconds), reads all documents of the ``applications`` collection in the
    master database and builds the clients described by each document.

    Returns:
        A tuple ``(apnsconns, httpconns)``; both are dicts mapping the
        application's ``shortname`` to a (possibly empty) list of clients.

    Raises:
        KeyError: if an application document lacks ``shortname`` or
            ``connections``.
        ValueError: if ``connections`` cannot be converted to an int.
    """
    mongodb = None
    while not mongodb:
        try:
            mongodb = Connection(options.mongohost, options.mongoport)
        except Exception as ex:
            logging.error(ex)
            # wait 5 secs to reconnect
            time.sleep(5)

    httpconns = {}
    apnsconns = {}
    try:
        masterdb = mongodb[options.masterdb]
        apps = masterdb.applications.find()
        for app in apps:
            # Every app is keyed by its shortname, so it is mandatory.
            shortname = app['shortname']

            # APNs setup: open at least one connection per application.
            apnsconns[shortname] = []
            conns = max(int(app['connections']), 1)
            if 'environment' not in app:
                app['environment'] = 'sandbox'
            if 'certfile' in app and 'keyfile' in app:
                for instanceid in range(conns):
                    try:
                        apn = APNClient(app['environment'], app['certfile'],
                                        app['keyfile'], shortname, instanceid)
                    except Exception as ex:
                        # One broken instance must not stop the others.
                        logging.error(ex)
                        continue
                    apnsconns[shortname].append(apn)

            # GCMClient setup
            httpconns[shortname] = []
            if 'gcmprojectnumber' in app and 'gcmapikey' in app:
                try:
                    http = GCMClient(app['gcmprojectnumber'], app['gcmapikey'],
                                     shortname, 0)
                except Exception as ex:
                    logging.error(ex)
                else:
                    httpconns[shortname].append(http)
    finally:
        # Release the connection even when an application document is broken.
        mongodb.close()
    return apnsconns, httpconns
def init_messaging_agents():
    """Create the push/SMS clients for every registered application.

    Blocks until a MongoDB connection can be opened, reads all documents of
    the ``applications`` collection in the master database and builds the
    APNs, GCM, WNS, MPNS and Clickatell clients for each of them. A client
    that fails to initialise is logged and skipped; the remaining services
    of the same application are still set up.

    Returns:
        A dict with the keys ``'gcm'``, ``'wns'``, ``'apns'``, ``'mpns'`` and
        ``'sms'``; each value maps an application's ``shortname`` to a
        (possibly empty) list of clients.

    Raises:
        KeyError: if an application document lacks ``shortname`` or
            ``connections``.
        ValueError: if ``connections`` cannot be converted to an int.
    """
    import time  # local import: only needed for the reconnect back-off

    services = {
        'gcm': {},
        'wns': {},
        'apns': {},
        'mpns': {},
        'sms': {},
    }
    mongodb = None
    while not mongodb:
        try:
            mongodb = Connection(options.mongohost, options.mongoport)
        except Exception as ex:
            _logger.error(ex)
            # Back off instead of spinning (and flooding the log) while the
            # database is unreachable.
            time.sleep(5)

    try:
        masterdb = mongodb[options.masterdb]
        apps = masterdb.applications.find()
        for app in apps:
            # Every service is keyed by the shortname, so it is mandatory.
            shortname = app['shortname']

            # APNs setup: open at least one connection per application.
            services['apns'][shortname] = []
            conns = max(int(app['connections']), 1)
            if 'environment' not in app:
                app['environment'] = 'sandbox'
            if (file_exists(app.get('certfile', False))
                    and file_exists(app.get('keyfile', False))
                    and app.get('enableapns', False)):
                for instanceid in range(conns):
                    try:
                        apn = APNClient(app['environment'], app['certfile'],
                                        app['keyfile'], shortname, instanceid)
                    except Exception as ex:
                        # One broken instance must not stop the others.
                        _logger.error(ex)
                        continue
                    services['apns'][shortname].append(apn)

            # GCMClient setup
            services['gcm'][shortname] = []
            if 'gcmprojectnumber' in app and 'gcmapikey' in app:
                try:
                    http = GCMClient(app['gcmprojectnumber'], app['gcmapikey'],
                                     shortname, 0)
                except Exception as ex:
                    # Log and fall through: a GCM failure must not prevent
                    # the WNS/MPNS/SMS setup below.
                    _logger.error(ex)
                else:
                    services['gcm'][shortname].append(http)

            # WNS setup
            services['wns'][shortname] = []
            if 'wnsclientid' in app and 'wnsclientsecret' in app:
                try:
                    wns = WNSClient(masterdb, app, 0)
                except Exception as ex:
                    _logger.error(ex)
                else:
                    services['wns'][shortname].append(wns)

            # MPNS setup
            services['mpns'][shortname] = []
            try:
                mpns = MPNSClient(masterdb, app, 0)
            except Exception as ex:
                _logger.error(ex)
            else:
                services['mpns'][shortname].append(mpns)

            # clickatell
            services['sms'][shortname] = []
            try:
                sms = ClickatellClient(masterdb, app, 0)
            except Exception as ex:
                _logger.error(ex)
            else:
                services['sms'][shortname].append(sms)
    finally:
        # Release the connection even when an application document is broken.
        mongodb.close()
    return services
def init_messaging_agents():
    """Create the push/SMS clients for every registered application.

    Blocks until a MongoDB connection can be opened, reads all documents of
    the ``applications`` collection in the master database and builds the
    APNs, GCM, WNS, MPNS and Clickatell clients for each of them. A client
    that fails to initialise is logged and skipped; the remaining services
    of the same application are still set up.

    Returns:
        A dict with the keys ``"gcm"``, ``"wns"``, ``"apns"``, ``"mpns"`` and
        ``"sms"``; each value maps an application's ``shortname`` to a
        (possibly empty) list of clients.

    Raises:
        KeyError: if an application document lacks ``shortname`` or
            ``connections``.
        ValueError: if ``connections`` cannot be converted to an int.
    """
    import time  # local import: only needed for the reconnect back-off

    services = {"gcm": {}, "wns": {}, "apns": {}, "mpns": {}, "sms": {}}
    mongodb = None
    while not mongodb:
        try:
            mongodb = Connection(options.mongohost, options.mongoport)
        except Exception as ex:
            _logger.error(ex)
            # Back off instead of spinning (and flooding the log) while the
            # database is unreachable.
            time.sleep(5)

    try:
        masterdb = mongodb[options.masterdb]
        apps = masterdb.applications.find()
        for app in apps:
            # Every service is keyed by the shortname, so it is mandatory.
            shortname = app["shortname"]

            # APNs setup: open at least one connection per application.
            services["apns"][shortname] = []
            conns = max(int(app["connections"]), 1)
            if "environment" not in app:
                app["environment"] = "sandbox"
            if (
                file_exists(app.get("certfile", False))
                and file_exists(app.get("keyfile", False))
                and app.get("enableapns", False)
            ):
                for instanceid in range(conns):
                    try:
                        apn = APNClient(
                            app["environment"], app["certfile"], app["keyfile"], shortname, instanceid
                        )
                    except Exception as ex:
                        # One broken instance must not stop the others.
                        _logger.error(ex)
                        continue
                    services["apns"][shortname].append(apn)

            # GCMClient setup
            services["gcm"][shortname] = []
            if "gcmprojectnumber" in app and "gcmapikey" in app:
                try:
                    http = GCMClient(app["gcmprojectnumber"], app["gcmapikey"], shortname, 0)
                except Exception as ex:
                    # Log and fall through: a GCM failure must not prevent
                    # the WNS/MPNS/SMS setup below.
                    _logger.error(ex)
                else:
                    services["gcm"][shortname].append(http)

            # WNS setup
            services["wns"][shortname] = []
            if "wnsclientid" in app and "wnsclientsecret" in app:
                try:
                    wns = WNSClient(masterdb, app, 0)
                except Exception as ex:
                    _logger.error(ex)
                else:
                    services["wns"][shortname].append(wns)

            # MPNS setup
            services["mpns"][shortname] = []
            try:
                mpns = MPNSClient(masterdb, app, 0)
            except Exception as ex:
                _logger.error(ex)
            else:
                services["mpns"][shortname].append(mpns)

            # clickatell
            services["sms"][shortname] = []
            try:
                sms = ClickatellClient(masterdb, app, 0)
            except Exception as ex:
                _logger.error(ex)
            else:
                services["sms"][shortname].append(sms)
    finally:
        # Release the connection even when an application document is broken.
        mongodb.close()
    return services
class Database():
    """MongoDB persistence layer for URLs, pages, clickables, forms and requests.

    Almost every method takes a ``current_session`` value that is stored on
    the written documents and used as a filter when reading them back.
    """

    def __init__(self, db_name, drop_dbs=True):
        """Connect to MongoDB and bind the collections of ``db_name``.

        :param db_name: name of the database to use
        :param drop_dbs: when true, every collection used by this class is
            dropped before use
        """
        self.connection = Connection()
        self.database = self.connection[db_name]
        self.pages = self.database.pages
        #self.pages.ensure_index( "id", pymongo.ASCENDING, unique=True)
        self.urls = self.database.urls
        self.url_descriptions = self.database.url_describtion
        self.clickables = self.database.clickables
        self.delta_pages = self.database.delta_pages
        self.forms = self.database.forms
        self.users = self.database.users
        self.clusters = self.database.clusters
        self.attack = self.database.attack
        self.async_requests = self.database.asyncrequests
        self.async_request_structure = self.database.asyncrequeststructure
        self._per_session_url_counter = 0

        if drop_dbs:
            # Clear every collection
            self.pages.drop()
            self.urls.drop()
            self.url_descriptions.drop()
            self.delta_pages.drop()
            self.clickables.drop()
            self.forms.drop()
            self.users.drop()
            self.clusters.drop()
            self.attack.drop()
            self.async_requests.drop()
            self.async_request_structure.drop()

        self.urls.ensure_index("url", pymongo.ASCENDING, unique=True)
        #self.url_descriptions.ensure_index("hash", pymongo.ASCENDING, unique=True)

    def __del__(self):
        # __init__ may have failed before the connection attribute was set;
        # in that case there is nothing to close.
        connection = getattr(self, "connection", None)
        if connection is not None:
            connection.close()

    def prepare_for_new_crawling(self):
        """Reset the counter that orders the URLs inserted for a session."""
        self._per_session_url_counter = 0

    def get_user_to_username(self, username):
        """Return the stored ``session`` of ``username``, or None if unknown."""
        search_doc = {'username': username}
        user = self.users.find_one(search_doc)
        if user is None:
            return None
        return user['session']

    def insert_user_into_db(self, user):
        """Store ``user``; its ``_id`` is the current user count plus one."""
        num_of_users = self.users.count()
        user_id = num_of_users + 1
        doc = self._user_to_doc(user)
        doc['_id'] = user_id
        if user.login_data is not None and user.url_with_login_form is not None:
            doc['url_with_login_form'] = user.url_with_login_form
            doc['login_data'] = user.login_data
        self.users.save(doc)

    def _user_to_doc(self, user):
        """Build the basic user document (level, name and session)."""
        return {
            'user_level': user.user_level,
            'username': user.username,
            'session': user.session,
        }

    def insert_url_into_db(self, current_session, url, is_redirected_url = False):
        """Insert ``url`` for the session unless it is already known.

        A URL counts as known when a document of the session has it either
        as ``url`` or as ``redirected_to``.

        :param current_session: session the URL belongs to
        :param url: URL object to store
        :param is_redirected_url: accepted but not used by this method
        :return: True if the URL was inserted, False if it already existed
        """
        url_string = url.toString()
        if (self.urls.find({"url": url_string, "session": current_session}).count() > 0
                or self.urls.find({"redirected_to": url_string, "session": current_session}).count() > 0):
            return False
        document = self._url_to_doc_without_abstract_url(url)
        document['session'] = current_session
        # The counter records insertion order; it is used to pick the next
        # URL to crawl.
        document["url_counter"] = self._per_session_url_counter
        self._per_session_url_counter += 1
        self.urls.save(document)
        return True

    def _url_to_doc_without_abstract_url(self, url):
        """Build the initial (unvisited) document for ``url``."""
        doc = {}
        doc["url"] = url.complete_url
        #doc["abstract_url"] = url.abstract_url
        doc["url_hash"] = url.url_hash
        doc["page_id"] = None
        doc["visited"] = False
        doc["response_code"] = None
        doc['redirected_to'] = None
        doc['depth_of_finding'] = url.depth_of_finding
        return doc

    def get_next_url_for_crawling(self, current_session):
        """Return the oldest unvisited URL of the session, or None."""
        urls = self.urls.find({"session": current_session, "visited": False}).sort(
            [('url_counter', pymongo.ASCENDING)]).limit(1)
        if urls is None or urls.count() == 0:
            return None
        url = self._parse_url_from_db(urls[0])
        url.url_structure = self.get_url_structure_from_db(current_session, url.url_hash)
        return url

    def get_all_unvisited_urls_sorted_by_hash(self, current_session):
        """Group all unvisited URLs of the session by their hash.

        :return: dict mapping ``url_hash`` to the list of URLs with that hash
        """
        raw_data = self.urls.find({"session": current_session, "visited": False})
        result = {}
        for url in raw_data:
            tmp = self._parse_url_from_db_withou_abstract_url(url)
            tmp.url_structure = self.get_url_structure_from_db(current_session, tmp.url_hash)
            result.setdefault(tmp.url_hash, []).append(tmp)
        return result

    def _parse_url_from_db_withou_abstract_url(self, url):
        """Rebuild a Url from a stored document, ignoring ``abstract_url``."""
        result = Url(url['url'], url['depth_of_finding'])
        result.parameters = self.unescape_unloved_signs(result.parameters)
        return result

    def _parse_url_from_db(self, url):
        """Rebuild a Url from a stored document."""
        result = Url(url['url'], url['depth_of_finding'])
        result.parameters = self.unescape_unloved_signs(result.parameters)
        # Documents built by _url_to_doc_without_abstract_url carry no
        # "abstract_url" field, so only restore it when it was stored.
        if "abstract_url" in url:
            result.abstract_url = url["abstract_url"]
        return result

    def get_urls_from_db_to_hash(self, current_session, url_hash):
        """Return all URLs of the session that have ``url_hash``."""
        urls = self.urls.find({"session": current_session, "url_hash": url_hash})
        result = []
        url_description = self.get_url_structure_from_db(current_session, url_hash)
        for url in urls:
            url = self._parse_url_from_db(url)
            url.url_description = url_description
            result.append(url)
        return result

    def visit_url(self, current_session, url, webpage_id, response_code, redirected_to=None):
        """Mark ``url`` (Url object or plain string) as visited."""
        search_doc = {}
        try:
            search_doc['url'] = url.toString()
        except AttributeError:
            # A plain string was passed instead of a Url object.
            search_doc['url'] = url
        search_doc['session'] = current_session
        update_doc = {'response_code': response_code, 'visited': True, 'page_id': webpage_id,
                      'redirected_to': redirected_to}
        self.urls.update(search_doc, {"$set": update_doc})

    def count_visited_urls_per_hash(self, current_session, url_hash):
        """Count visited URLs of ``url_hash`` with a response code in (100, 1000)."""
        all_urls = self.urls.find({"session": current_session, "url_hash": url_hash, "visited": True})
        counter = 0
        for url in all_urls:
            if url['response_code'] > 100 and url['response_code'] < 1000:
                counter += 1
        return counter

    def get_url_to_id(self, current_session, id):
        """Return the URL string stored for page ``id``, or None."""
        result = self.urls.find_one({"session": current_session, "page_id": id})
        if result is None:
            return None
        return result['url']

    def insert_page_into_db(self, current_session, web_page):
        """Store ``web_page`` together with its clickables and forms."""
        for clickable in web_page.clickables:
            self._insert_clickable_into_db(current_session, web_page.id, clickable)
        for form in web_page.forms:
            self.insert_form(current_session, form, web_page.id)
        document = self._create_webpage_doc(web_page, current_session)
        document['ajax_requests'] = []
        document['session'] = current_session
        self.pages.save(document)

    def get_all_pages(self, current_session):
        """Return every stored page of the session as a WebPage."""
        pages = self.pages.find({"session": current_session})
        return [self._get_web_page_from_db(current_session=current_session, page=page)
                for page in pages]

    def get_webpage_to_url_from_db(self, current_session, url):
        """Return the WebPage stored for ``url``, or None."""
        return self._get_web_page_from_db(current_session, url=url)

    def get_webpage_to_id_from_db(self, current_session, id):
        """Return the WebPage stored under page ``id``, or None."""
        return self._get_web_page_from_db(current_session, page_id=id)

    def url_visited(self, current_session, url):
        """Return whether a page document for ``url`` flagged visited exists."""
        # NOTE(review): this queries the pages collection, but the page
        # documents built in this class carry no "visited" field -- confirm
        # whether the urls collection was meant.
        try:
            url = url.toString()
        except AttributeError:
            # Already a plain string.
            pass
        return self.pages.find_one({"session": current_session, "url": url, "visited": True}) is not None

    def _get_web_page_from_db(self, current_session, page_id=None, url=None, page=None):
        """Load a WebPage by id, by URL, or from an already fetched document.

        :raises AttributeError: if neither ``page``, ``page_id`` nor ``url``
            is given
        :return: the WebPage, or None if no matching document exists
        """
        if page is None:
            if page_id is not None:
                page = self.pages.find_one({"session": current_session, "web_page_id": page_id})
            elif url is not None:
                page = self.pages.find_one({"session": current_session, "url": url})
            else:
                raise AttributeError("You must specifies either page_id or url")
        if page is None:
            return None
        clickables = self.get_all_clickables_to_page_id_from_db(current_session, page['web_page_id'])
        forms = self.get_all_forms_to_page_id_from_db(current_session, page['web_page_id'])
        result = WebPage(page['web_page_id'], page['url'], page['html'], None, page['current_depth'],
                         page['base_url'])
        result.clickables = clickables
        result.forms = forms
        result.links = [self._parse_link_from_db(link) for link in page['links']]
        result.timing_requests = [self.get_asyncrequest_to_id(current_session, request)
                                  for request in page['timing_requests']]
        result.ajax_requests = [self.get_asyncrequest_to_id(current_session, request)
                                for request in page['ajax_requests']]
        return result

    def insert_asyncrequest(self, current_session, ajax_request, web_page_id):
        """Store an asynchronous request and its structure.

        The request structure is upserted per ``(session, request_hash)``.
        The request document references either the triggering clickable
        (``trigger``) or, for requests without a trigger attribute, the
        request's ``event``.

        :return: the result of saving the request document
        """
        url_doc = {"url": ajax_request.url.complete_url,
                   "abstract_url": ajax_request.url.abstract_url,
                   "url_hash": ajax_request.url.url_hash}
        structure_doc = {}
        structure_doc['request_hash'] = ajax_request.request_hash
        structure_doc["session"] = current_session
        structure_doc['parameters'] = ajax_request.request_structure.parameters
        previous_doc = self.async_request_structure.find_one(
            {"session": current_session, "request_hash": ajax_request.request_hash})
        if previous_doc is not None:
            # Re-use the id so save() replaces the existing structure.
            structure_doc["_id"] = previous_doc['_id']
        self.async_request_structure.save(structure_doc)

        doc = {}
        doc["request_hash"] = ajax_request.request_hash
        doc["url"] = url_doc
        doc["method"] = ajax_request.method
        doc["session"] = current_session
        try:
            trigger_id = self.clickables.find_one({"session": current_session,
                                                   "dom_address": ajax_request.trigger.dom_address,
                                                   "web_page_id": web_page_id,
                                                   "event": ajax_request.trigger.event})
            trigger_id = trigger_id["_id"]
            doc["trigger"] = trigger_id
        except AttributeError:
            # The request has no trigger; fall back to its event.
            try:
                doc['event'] = ajax_request.event
            except AttributeError:
                logging.debug("This should never happen...")
        except TypeError:
            # find_one() returned None: the triggering clickable is not stored.
            find_string = ("session: " + str(current_session)
                           + " - dom_address: " + str(ajax_request.trigger.dom_address)
                           + " - web_page_id: " + str(web_page_id)
                           + " - event: " + str(ajax_request.trigger.event))
            logging.debug("Try to find: {}".format(find_string))
        doc['parameters'] = ajax_request.parameters
        return self.async_requests.save(doc)

    def get_asyncrequest_to_id(self, current_session, async_id):
        """Load a stored request as TimingRequest or AjaxRequest, or None."""
        raw_data = self.async_requests.find_one({"session": current_session, "_id": async_id})
        if raw_data is None:
            return None
        raw_structure = self.async_request_structure.find_one(
            {"session": current_session, "request_hash": raw_data['request_hash']})
        structure = AsyncRequestStructure(raw_structure['request_hash'], raw_structure['parameters'])
        url = Url(raw_data['url']['url'])
        url.abstract_url = raw_data['url']['abstract_url']
        if "event" in raw_data:
            # Documents with an "event" field were stored without a trigger.
            tmp = TimingRequest(raw_data['method'], url, None, raw_data['event'],
                                parameters=raw_data['parameters'])
        else:
            trigger = self.clickables.find_one({"_id": raw_data['trigger']})
            trigger = self._parse_clickable_from_db_to_model(trigger)
            tmp = AjaxRequest(raw_data['method'], url, trigger, parameters=raw_data['parameters'])
        tmp.request_structure = structure
        return tmp

    def _parse_clickable_from_db_to_model(self, clickable):
        """Rebuild a Clickable from its stored document."""
        c = Clickable(clickable['event'], clickable['tag'], clickable['dom_address'], clickable['html_id'],
                      clickable['html_class'], clickable['clickable_depth'], clickable['function_id'])
        c.clicked = clickable['clicked']
        c.clickable_type = self._num_to_clickable_type(clickable['clickable_type'])
        c.links_to = clickable['links_to']
        c.clickable_depth = clickable['clickable_depth']
        return c

    def _insert_clickable_into_db(self, current_session, web_page_id, clickable):
        """Store ``clickable`` as belonging to page ``web_page_id``."""
        document = {}
        document["event"] = clickable.event
        document["tag"] = clickable.tag
        document["html_class"] = clickable.html_class
        document["html_id"] = clickable.id
        document["dom_address"] = clickable.dom_address
        document["links_to"] = clickable.links_to
        document["clicked"] = clickable.clicked
        document["clickable_type"] = self._clickable_type_to_num(clickable.clickable_type)
        document["web_page_id"] = web_page_id
        document["function_id"] = clickable.function_id
        document["clickable_depth"] = clickable.clickable_depth
        if hasattr(clickable, "random_char"):
            document['random_char'] = clickable.random_char
        document['session'] = current_session
        self.clickables.save(document)

    def insert_delta_page_into_db(self, current_session, delta_page):
        """Store ``delta_page`` with its clickables, forms and requests.

        The generating clickable must already be stored for the parent page.
        """
        for clickable in delta_page.clickables:
            self._insert_clickable_into_db(current_session, delta_page.id, clickable)
        for form in delta_page.forms:
            self.insert_form(current_session, form, delta_page.id)
        document = self._create_webpage_doc(delta_page, current_session)
        clickable_id = self.clickables.find_one({"session": current_session,
                                                 "web_page_id": delta_page.parent_id,
                                                 "dom_address": delta_page.generator.dom_address,
                                                 "event": delta_page.generator.event})
        clickable_id = clickable_id["_id"]
        document['generator'] = clickable_id
        document["generator_requests"] = [
            self.insert_asyncrequest(current_session, r, delta_page.parent_id)
            for r in delta_page.generator_requests]
        document['delta_depth'] = delta_page.delta_depth
        document['parent_id'] = delta_page.parent_id
        document["ajax_requests"] = [
            self.insert_asyncrequest(current_session, ajax, delta_page.id)
            for ajax in delta_page.ajax_requests]
        document['session'] = current_session
        self.delta_pages.save(document)

    def get_delta_page_to_id(self, current_session, page_id):
        """Return the DeltaPage stored under ``page_id``, or None."""
        page = self.delta_pages.find_one({"session": current_session, "web_page_id": page_id})
        if page is None:
            return None
        return self._parse_delta_page_from_db(current_session, page)

    def _create_webpage_doc(self, web_page, current_session):
        """Build the page document; the page's timing requests are stored as a side effect."""
        document = {}
        document["web_page_id"] = web_page.id
        document["url"] = web_page.url
        document["html"] = web_page.html
        document["links"] = [self._parse_link_to_db_doc(link) for link in web_page.links]
        document['timing_requests'] = [
            self.insert_asyncrequest(current_session, timing_request, web_page.id)
            for timing_request in web_page.timing_requests]
        document["current_depth"] = web_page.current_depth
        document['base_url'] = web_page.base_url
        return document

    def insert_form(self, current_session, form, page_id):
        """Store ``form`` for page ``page_id``.

        If a form with the same hash is already stored for that page, the
        stored parameter values are merged into ``form`` (which is modified
        in place) and the existing document is replaced.
        """
        form_hash = form.get_hash()
        result = self.forms.find_one({"form_hash": form_hash, "session": current_session,
                                      "web_page_id": page_id})
        form_doc = {}
        if result is not None:
            for parameter_from_db_form in result['parameters']:
                for parameter_from_new_form in form.parameter:
                    if parameter_from_db_form['name'] == parameter_from_new_form.name:
                        parameter_from_new_form.values.extend(parameter_from_db_form['values'])
                        # Deduplicates the list while keeping first-seen order
                        parameter_from_new_form.values = sorted(
                            set(parameter_from_new_form.values),
                            key=lambda x: parameter_from_new_form.values.index(x))
            form_doc['_id'] = result["_id"]
        form_doc["web_page_id"] = page_id
        form_doc["method"] = form.method
        action_doc = {"url": form.action.complete_url,
                      "abstract_url": form.action.abstract_url,
                      "url_hash": form.action.url_hash}
        form_doc["action"] = action_doc
        form_doc["dom_address"] = form.dom_address
        form_doc['parameters'] = [self._parse_form_parameter_to_db_doc(parameter)
                                  for parameter in form.parameter]
        form_doc['session'] = current_session
        form_doc['form_hash'] = form_hash
        self.forms.save(form_doc)

    def _parse_link_to_db_doc(self, link):
        """Build the embedded document for ``link``."""
        res = {}
        url = {"url": link.url.complete_url, "abstract_url": link.url.abstract_url,
               "url_hash": link.url.url_hash, "depth_of_finding": link.url.depth_of_finding}
        res['url'] = url
        res['dom_address'] = link.dom_address
        res['html_id'] = link.html_id
        res['html_class'] = link.html_class
        return res

    def _parse_link_from_db(self, link):
        """Rebuild a Link from its embedded document."""
        url = Url(link['url']['url'])
        url.abstract_url = link['url']['abstract_url']
        url.depth_of_finding = link['url']['depth_of_finding']
        return Link(url, link['dom_address'], link['html_id'], link['html_class'])

    def _parse_form_parameter_to_db_doc(self, form_parameter):
        """Build the embedded document for one form parameter."""
        return {
            "tag": form_parameter.tag,
            "name": form_parameter.name,
            "values": form_parameter.values,
            "input_type": form_parameter.input_type,
        }

    def set_clickable_clicked(self, current_session, web_page_id, clickable_dom_address, clickable_event,
                              clickable_depth = None, clickable_type = None, links_to=None):
        """Flag the matching clickable as clicked and update the given fields.

        :return: the result of the update call
        """
        search_doc = {}
        set_doc = {}
        search_doc["web_page_id"] = web_page_id
        search_doc["dom_address"] = clickable_dom_address
        search_doc["event"] = clickable_event
        search_doc['session'] = current_session
        if clickable_type is not None:
            clickable_type = self._clickable_type_to_num(clickable_type)
            set_doc["clickable_type"] = clickable_type
        if links_to is not None:
            set_doc['links_to'] = links_to
        if clickable_depth is not None:
            set_doc['clickable_depth'] = clickable_depth
        set_doc["clicked"] = True
        return self.clickables.update(search_doc, {"$set": set_doc})

    def set_clickable_ignored(self, current_session, web_page_id, clickable_dom_address, clickable_event,
                              clickable_depth = None, clickable_type = None):
        """Flag the matching clickable as ignored and update the given fields.

        :return: the result of the update call
        """
        search_doc = {}
        set_doc = {}
        search_doc["web_page_id"] = web_page_id
        search_doc["dom_address"] = clickable_dom_address
        search_doc["event"] = clickable_event
        search_doc['session'] = current_session
        if clickable_type is not None:
            clickable_type = self._clickable_type_to_num(clickable_type)
            set_doc["clickable_type"] = clickable_type
        if clickable_depth is not None:
            set_doc['clickable_depth'] = clickable_depth
        # NOTE(review): this stores the *string* "False" (which is truthy),
        # not the boolean -- confirm whether readers rely on that value.
        set_doc["clicked"] = "False"
        return self.clickables.update(search_doc, {"$set": set_doc})

    def extend_ajax_requests_to_webpage(self, current_session, webpage, ajax_requests):
        """Store ``ajax_requests`` and add their ids to the page's document.

        Objects with a ``parent_id`` attribute are looked up in the delta
        page collection, all others in the page collection.
        """
        ajax_requests_doc = [self.insert_asyncrequest(current_session, r, webpage.id)
                             for r in ajax_requests]
        search_doc = {"web_page_id": webpage.id, "session": current_session}
        update_doc = {"$addToSet": {"ajax_requests": {"$each": ajax_requests_doc}}}
        if not hasattr(webpage, 'parent_id'):
            self.pages.update(search_doc, update_doc)
        else:
            self.delta_pages.update(search_doc, update_doc)

    def _clickable_type_to_num(self, clickable_type):
        """Map a ClickableType to its stored number (None for other values)."""
        if clickable_type == ClickableType.UIChange:
            return 0
        if clickable_type == ClickableType.Link:
            return 1
        if clickable_type == ClickableType.CreatesNewNavigatables:
            return 2
        if clickable_type == ClickableType.Error:
            return 3
        if clickable_type == ClickableType.SendingAjax:
            return 4
        if clickable_type == ClickableType.IgnoredByCrawler:
            return 5
        if clickable_type == ClickableType.UnsupportedEvent:
            return 6

    def _num_to_clickable_type(self, num):
        """Map a stored number back to its ClickableType (None stays None).

        :raises KeyError: for a number outside 0..6
        """
        if num is None:
            return None
        clickable_types = {
            0: ClickableType.UIChange,
            1: ClickableType.Link,
            2: ClickableType.CreatesNewNavigatables,
            3: ClickableType.Error,
            4: ClickableType.SendingAjax,
            5: ClickableType.IgnoredByCrawler,
            6: ClickableType.UnsupportedEvent
        }
        return clickable_types[num]

    def get_all_clickables_to_page_id_from_db(self, current_session, page_id):
        """Return all clickables stored for page ``page_id``."""
        clickables = self.clickables.find({"web_page_id": page_id, "session": current_session})
        result = []
        for clickable in clickables:
            c = Clickable(clickable['event'], clickable['tag'], clickable['dom_address'],
                          clickable['html_id'], clickable['html_class'],
                          clickable_depth=clickable['clickable_depth'],
                          function_id=clickable['function_id'])
            c.links_to = clickable['links_to']
            c.clickable_type = self._num_to_clickable_type(clickable['clickable_type'])
            c.clicked = clickable['clicked']
            result.append(c)
        return result

    def get_all_forms_to_page_id_from_db(self, current_session, page_id):
        """Return all forms stored for page ``page_id``."""
        forms = self.forms.find({"web_page_id": page_id, "session": current_session})
        return [self._parse_form_from_db(form) for form in forms]

    def _parse_form_from_db(self, form):
        """Rebuild an HtmlForm from its stored document."""
        parameters = [FormInput(p['tag'], p['name'], p['input_type'], p['values'])
                      for p in form['parameters']]
        action = Url(form['action']["url"])
        action.abstract_url = form['action']["abstract_url"]
        return HtmlForm(parameters, action, form['method'], form["dom_address"])

    def get_all_crawled_deltapages_to_url_from_db(self, current_session, url):
        """Return all delta pages stored for ``url``."""
        pages = self.delta_pages.find({"url": url, "session": current_session})
        return [self._parse_delta_page_from_db(current_session, page) for page in pages]

    def _parse_delta_page_from_db(self, current_session, page):
        """Rebuild a DeltaPage (with clickables, forms, links) from its document."""
        clickables = self.get_all_clickables_to_page_id_from_db(current_session, page['web_page_id'])
        forms = self.get_all_forms_to_page_id_from_db(current_session, page['web_page_id'])
        generator = self.clickables.find_one({"_id": page['generator']})
        generator = self._parse_clickable_from_db_to_model(generator)
        generator_requests = [self.get_asyncrequest_to_id(current_session, g)
                              for g in page['generator_requests']]
        result = DeltaPage(page['web_page_id'], page['url'], page['html'], None, page['current_depth'],
                           generator, page['parent_id'], page['delta_depth'])
        result.links = [self._parse_link_from_db(link) for link in page['links']]
        result.forms = forms
        result.clickables = clickables
        result.generator_requests = generator_requests
        return result

    def insert_url_structure_into_db(self, current_session, url_description):
        """Insert or replace the URL structure for its ``(session, url_hash)``."""
        result = self.url_descriptions.find_one({"session": current_session,
                                                 "url_hash": url_description.url_hash})
        document = {}
        if result is not None:
            # Re-use the id so save() replaces the existing structure.
            document["_id"] = result["_id"]
        document["path"] = url_description.path
        document["parameters"] = self.escape_unloved_signs(url_description.parameters)
        document["url_hash"] = url_description.url_hash
        document["session"] = current_session
        self.url_descriptions.save(document)

    def escape_unloved_signs(self, item):
        """Replace every "." in the keys of dict ``item`` with "$$$", in place.

        Anything that is not a dict is returned untouched.

        :return: ``item`` itself
        """
        if isinstance(item, dict):
            # Iterate over a snapshot: the dict is re-keyed while looping.
            for old_key in list(item):
                item[old_key.replace(".", "$$$")] = item.pop(old_key)
        return item

    def unescape_unloved_signs(self, item):
        """Replace every "$$$" in the keys of dict ``item`` with ".", in place.

        Anything that is not a dict is returned untouched.

        :return: ``item`` itself
        """
        if isinstance(item, dict):
            # Iterate over a snapshot: the dict is re-keyed while looping.
            for old_key in list(item):
                item[old_key.replace("$$$", ".")] = item.pop(old_key)
        return item

    def get_url_structure_from_db(self, current_session, url_hash):
        """Return the UrlStructure stored for ``url_hash``, or None."""
        result = self.url_descriptions.find_one({"session": current_session, "url_hash": url_hash})
        if result is None:
            return None
        return UrlStructure(result['path'], result["parameters"], result['url_hash'])

    def url_exists(self, current_session, url):
        """Return whether ``url`` (Url object or string) is stored for the session."""
        try:
            search_url = url.toString()
        except AttributeError:
            search_url = url
        return self.urls.find({"url": search_url, "session": current_session}).count() > 0

    def write_cluster(self, current_session, url_hash, clusters):
        """Replace the clusters stored for ``url_hash``."""
        self.clusters.remove({"session": current_session, "url_hash": url_hash})
        self.clusters.save({"session": current_session, "url_hash": url_hash, "clusters": clusters})

    def get_clusters(self, current_session, url_hash):
        """Return the clusters stored for ``url_hash``, or None."""
        result = self.clusters.find_one({"session": current_session, "url_hash": url_hash})
        try:
            return result["clusters"]
        except TypeError:
            # find_one() returned None.
            return None

    def get_all_url_structures(self, current_session):
        """Return every UrlStructure stored for the session."""
        raw_data = self.url_descriptions.find({"session": current_session})
        return [UrlStructure(url_structure['path'], url_structure["parameters"],
                             url_structure['url_hash'])
                for url_structure in raw_data]

    def get_all_successfully_visited_urls(self, current_session):
        """Return visited URLs whose response code lies in (0, 1000)."""
        raw_data = self.urls.find({"session": current_session, "visited": True})
        result = []
        for url in raw_data:
            if url["response_code"] > 0 and url['response_code'] < 1000:
                result.append(self._parse_url_from_db(url))
        return result

    def get_one_visited_url_per_structure(self, current_session):
        """Return one visited URL (response code in (0, 1500)) per url_hash."""
        raw_data = self.urls.find({"session": current_session, "visited": True})
        result = []
        seen_structures = []
        for url in raw_data:
            if url["response_code"] > 0 and url['response_code'] < 1500:
                tmp = self._parse_url_from_db_withou_abstract_url(url)
                if tmp.url_hash not in seen_structures:
                    result.append(tmp)
                    seen_structures.append(tmp.url_hash)
        return result

    def insert_attack_result(self, current_session, result, attack_url):
        """Store ``result.value`` for ``attack_url``."""
        doc = {"session": current_session, "attack_url": attack_url, "result": result.value}
        self.attack.save(doc)

    def get_asyncrequest_structure(self, current_session, structure_hash= None):
        """Return the AsyncRequestStructure for ``structure_hash``, or None."""
        if structure_hash is None:
            return None #TODO: Implement if I need all
        raw_data = self.async_request_structure.find_one({"session": current_session,
                                                          "request_hash": structure_hash})
        if raw_data is None:
            return None
        return AsyncRequestStructure(raw_data['request_hash'], raw_data['parameters'])

    def get_all_get_forms(self, current_session):
        """Return the session's forms with method "get", without equal duplicates."""
        raw_data = self.forms.find({"session": current_session, "method": "get"})
        result = []
        for form in raw_data:
            tmp = self._parse_form_from_db(form)
            if tmp not in result:
                result.append(tmp)
        return result

    def _exclude_dupplicates_forms(self, forms):
        """Keep the first of all forms sharing action URL and parameters."""
        result = []
        for form in forms:
            already_in = False
            for res in result:
                if (self._form_paramters_equal(form.parameter, res.parameter)
                        and form.action.complete_url == res.action.complete_url):
                    already_in = True
                    break
            if not already_in:
                result.append(form)
        return result

    def get_one_form_per_destination(self, current_session):
        """Return the session's GET forms, one per action URL and parameter set."""
        all_forms = self.get_all_get_forms(current_session)
        return self._exclude_dupplicates_forms(all_forms)

    def _form_paramters_equal(self, p1, p2):
        """Return whether every parameter of ``p1`` has a match in ``p2``.

        Parameters match on input type, name and tag. The check is
        one-directional: extra parameters in ``p2`` are ignored.
        """
        for param in p1:
            is_in = False
            for param2 in p2:
                if (param.input_type == param2.input_type and param.name == param2.name
                        and param.tag == param2.tag):
                    is_in = True
                    break
            if not is_in:
                return False
        return True

    def num_of_ignored_urls(self, current_session, url_hash):
        """Count URLs of ``url_hash`` stored with response code 0."""
        return self.urls.find({"url_hash": url_hash, "session": current_session,
                               "response_code": 0}).count()

    def get_id_to_url(self, current_session, url):
        """Return the page id stored for ``url`` (as URL or redirect target).

        :param current_session: session to search in
        :param url: URL string to look up
        :return: the page id if a document is found, otherwise -1. None
            cannot be used as the "not found" marker because a stored id may
            itself be None (errors, 500 codes or other reasons).
        """
        raw_data = self.urls.find_one({"session": current_session, "url": url})
        if raw_data is not None:
            return raw_data["page_id"]
        raw_data = self.urls.find_one({"session": current_session, "redirected_to": url})
        if raw_data is not None:
            return raw_data["page_id"]
        return -1

    def get_all_urls_to_domain(self, current_session, domain):
        """Return the visited URLs whose URL string contains ``domain``."""
        raw_data = self.urls.find({"session": current_session, "visited": True})
        result = []
        for url in raw_data:
            if domain in url["url"]:
                result.append(self._parse_url_from_db_withou_abstract_url(url))
        return result
def init_messaging_agents():
    """Create the push/SMS clients for every application in the master DB.

    Connects to MongoDB using ``options.mongohost`` / ``options.mongoport``
    (retrying until the connection succeeds), iterates over all documents of
    ``masterdb.applications`` and builds, per application ``shortname``:

    * ``apns``: one ``APNClient`` per configured connection, only when the
      certificate and key files exist and ``enableapns`` is truthy;
    * ``gcm``:  one ``GCMClient`` when project number and API key are set;
    * ``wns``:  one ``WNSClient`` when client id and secret are set;
    * ``mpns``: one ``MPNSClient``;
    * ``sms``:  one ``ClickatellClient``.

    A client whose construction raises is logged and skipped; it no longer
    prevents the remaining services of the same application from being set
    up (the original ``continue`` aborted the whole application).

    Returns:
        dict: ``{service_name: {shortname: [client, ...]}}`` for the service
        names ``gcm``, ``wns``, ``apns``, ``mpns`` and ``sms``.
    """
    import time  # local import: only needed for the retry back-off below

    services = {
        'gcm': {},
        'wns': {},
        'apns': {},
        'mpns': {},
        'sms': {},
    }

    def register(service, shortname, factory, *args):
        """Instantiate ``factory(*args)`` and store it; log on failure."""
        try:
            client = factory(*args)
        except Exception as ex:
            logging.error(ex)
            return
        services[service][shortname].append(client)

    mongodb = None
    while not mongodb:
        try:
            mongodb = Connection(options.mongohost, options.mongoport)
        except Exception as ex:
            logging.error(ex)
            # Back off briefly instead of spinning and flooding the log
            # while the database is unreachable.
            time.sleep(1)

    try:
        masterdb = mongodb[options.masterdb]
        for app in masterdb.applications.find():
            if 'shortname' not in app:
                # Every registry below is keyed by shortname, so an app
                # without one cannot be registered at all.
                logging.error("Skipping application without 'shortname': %s",
                              app.get('_id'))
                continue
            shortname = app['shortname']

            # Give every service an (initially empty) entry up front so a
            # failure in one service never leaves another without a key.
            for registry in services.values():
                registry[shortname] = []

            # APNs setup
            conns = max(1, int(app['connections']))
            # Mutates the app document on purpose: the same dict is handed
            # to the WNS/MPNS/SMS clients further down.
            app.setdefault('environment', 'sandbox')
            if (file_exists(app.get('certfile', False))
                    and file_exists(app.get('keyfile', False))
                    and app.get('enableapns', False)):
                for instanceid in range(conns):
                    register('apns', shortname, APNClient,
                             app['environment'], app['certfile'],
                             app['keyfile'], shortname, instanceid)

            # GCMClient setup
            if 'gcmprojectnumber' in app and 'gcmapikey' in app:
                register('gcm', shortname, GCMClient,
                         app['gcmprojectnumber'], app['gcmapikey'],
                         shortname, 0)

            # WNS setup
            if 'wnsclientid' in app and 'wnsclientsecret' in app:
                register('wns', shortname, WNSClient, masterdb, app, 0)

            # MPNS setup
            register('mpns', shortname, MPNSClient, masterdb, app, 0)

            # clickatell
            register('sms', shortname, ClickatellClient, masterdb, app, 0)
    finally:
        # Release the bootstrap connection even if an application document
        # is malformed and an exception escapes the loop.
        mongodb.close()
    return services
class DatabaseWrapper(NonrelDatabaseWrapper): """ Public API: connection, database, get_collection. """ def __init__(self, *args, **kwargs): self.collection_class = kwargs.pop('collection_class', Collection) super(DatabaseWrapper, self).__init__(*args, **kwargs) self.features = DatabaseFeatures(self) self.ops = DatabaseOperations(self) self.creation = DatabaseCreation(self) self.introspection = DatabaseIntrospection(self) self.validation = DatabaseValidation(self) self.connected = False ''' this dictates if we want to close connection at the end of each request or cache it (old behavior) ''' self.close_connection = kwargs.pop('close_connection', True) ''' number of times to retry the connection on failure ''' self.conn_retries = kwargs.pop('conn_retries', 1) ''' time to sleep between retries ''' self.conn_sleep_interval = kwargs.pop('conn_sleep_interval', 5) del self.connection def get_collection(self, name, **kwargs): if (kwargs.pop('existing', False) and name not in self.database.collection_names()): return None collection = self.collection_class(self.database, name, **kwargs) if settings.DEBUG: collection = CollectionDebugWrapper(collection, self.alias) return collection def __getattr__(self, attr): if attr in ['connection', 'database']: assert not self.connected self._connect() return getattr(self, attr) raise AttributeError(attr) def _connect(self): settings = copy.deepcopy(self.settings_dict) def pop(name, default=None): return settings.pop(name) or default db_name = pop('NAME') host = pop('HOST') port = pop('PORT') user = pop('USER') password = pop('PASSWORD') options = pop('OPTIONS', {}) self.operation_flags = options.pop('OPERATIONS', {}) if not any(k in ['save', 'delete', 'update'] for k in self.operation_flags): # Flags apply to all operations. flags = self.operation_flags self.operation_flags = {'save': flags, 'delete': flags, 'update': flags} # Lower-case all OPTIONS keys. 
for key in options.iterkeys(): options[key.lower()] = options.pop(key) attempts = 0 while True: try: try: self.connection = Connection(host=host, port=port, **options) self.database = self.connection[db_name] except TypeError: exc_info = sys.exc_info() raise ImproperlyConfigured, exc_info[1], exc_info[2] if user and password: if not self.database.authenticate(user, password): raise ImproperlyConfigured("Invalid username or password.") self.connected = True connection_created.send(sender=self.__class__, connection=self) ''' execute a quick sample query so that the failure will happen on the command that is run after the switchover, auth succeeds on secondary but commands cannot be run. This command will throw an exception and hence we will attempt to reconnect again ''' self.database['system.indexes'].find_one() break except Exception as e: print 'MongoConnectionFailure to database %s %s' % (db_name,str(e)) print traceback.format_exc() ''' Make sure we set connected to False just in case we failed on the send ''' self.connected = False ''' initialize these instance variables, so that we can delete them ''' ''' this will ensure that the __get_attr__ class method properly reconnects ''' self.database = None self.connection = None del self.database del self.connection attempts += 1 if attempts < self.conn_retries: time.sleep(self.conn_sleep_interval) print 'MongoConnectionRetry attempt=%d' % attempts continue raise e def _reconnect(self): if self.connected: del self.connection del self.database self.connected = False self._connect() def _commit(self): pass def _rollback(self): pass def close(self): if self.close_connection and self.connected: self.connection.close() del self.database del self.connection self.connected = False pass
class Database():
    """Store and load crawl data (URLs, pages, forms, clickables, ...) in MongoDB.

    Each kind of record lives in its own collection of the database named
    ``db_name``. Documents written by this class carry a ``session`` field
    and most queries filter on it.
    """

    def __init__(self, db_name, drop_dbs=True):
        """Connect with the default ``Connection()`` and bind all collections.

        :param db_name: name of the MongoDB database to use
        :param drop_dbs: when true, every collection used by this class is
            dropped first
        """
        self.connection = Connection()
        self.database = self.connection[db_name]
        self.pages = self.database.pages
        #self.pages.ensure_index( "id", pymongo.ASCENDING, unique=True)
        self.urls = self.database.urls
        # The collection name is spelled this way in the stored data.
        self.url_descriptions = self.database.url_describtion
        self.clickables = self.database.clickables
        self.delta_pages = self.database.delta_pages
        self.forms = self.database.forms
        self.users = self.database.users
        self.clusters = self.database.clusters
        self.attack = self.database.attack
        self.async_requests = self.database.asyncrequests
        self.async_request_structure = self.database.asyncrequeststructure
        self._per_session_url_counter = 0

        if drop_dbs:
            # Clear database
            for collection in (self.pages, self.urls, self.url_descriptions,
                               self.delta_pages, self.clickables, self.forms,
                               self.users, self.clusters, self.attack,
                               self.async_requests,
                               self.async_request_structure):
                collection.drop()

        self.urls.ensure_index("url", pymongo.ASCENDING, unique=True)
        #self.url_descriptions.ensure_index("hash", pymongo.ASCENDING, unique=True)

    def __del__(self):
        # __init__ may have failed before the connection was created; do not
        # raise a second error from the finalizer in that case.
        connection = getattr(self, "connection", None)
        if connection is not None:
            connection.close()

    def prepare_for_new_crawling(self):
        """Reset the counter used to order newly inserted URLs."""
        self._per_session_url_counter = 0

    def get_user_to_username(self, username):
        """Return the stored ``session`` of ``username`` or None if unknown."""
        user = self.users.find_one({'username': username})
        if user is None:
            return None
        return user['session']

    def insert_user_into_db(self, user):
        """Save ``user`` with ``_id`` = current number of users + 1.

        Login data is only stored when both ``login_data`` and
        ``url_with_login_form`` are set on the user.
        """
        doc = self._user_to_doc(user)
        doc['_id'] = self.users.count() + 1
        if user.login_data is not None and user.url_with_login_form is not None:
            doc['url_with_login_form'] = user.url_with_login_form
            doc['login_data'] = user.login_data
        self.users.save(doc)

    def _user_to_doc(self, user):
        """Build the base user document (level, name and session)."""
        return {
            'user_level': user.user_level,
            'username': user.username,
            'session': user.session,
        }

    def insert_url_into_db(self, current_session, url, is_redirected_url=False):
        """
        :param current_session:
        :param url:
        :param is_redirected_url: unused
        :return: True if url is insert, if url exists False
        """
        url_string = url.toString()
        known_as_url = self.urls.find({
            "url": url_string,
            "session": current_session
        }).count() > 0
        if known_as_url or self.urls.find({
                "redirected_to": url_string,
                "session": current_session
        }).count() > 0:
            return False

        document = self._url_to_doc_without_abstract_url(url)
        document['session'] = current_session
        # The counter defines the order in which URLs are crawled later.
        document["url_counter"] = self._per_session_url_counter
        self._per_session_url_counter += 1
        self.urls.save(document)
        return True

    def _url_to_doc_without_abstract_url(self, url):
        """Build the document of a not yet visited URL."""
        return {
            "url": url.complete_url,
            #"abstract_url": url.abstract_url,
            "url_hash": url.url_hash,
            "page_id": None,
            "visited": False,
            "response_code": None,
            'redirected_to': None,
            'depth_of_finding': url.depth_of_finding,
        }

    def get_next_url_for_crawling(self, current_session):
        """Return the unvisited URL with the lowest ``url_counter`` or None."""
        urls = self.urls.find({
            "session": current_session,
            "visited": False
        }).sort([('url_counter', pymongo.ASCENDING)]).limit(1)
        if urls is None or urls.count() == 0:
            return None
        url = self._parse_url_from_db(urls[0])
        url.url_structure = self.get_url_structure_from_db(
            current_session, url.url_hash)
        return url

    def get_all_unvisited_urls_sorted_by_hash(self, current_session):
        """
        @:returns dict(url_hash) = list(urls)
        """
        raw_data = self.urls.find({
            "session": current_session,
            "visited": False
        })
        result = {}
        for raw_url in raw_data:
            url = self._parse_url_from_db_withou_abstract_url(raw_url)
            url.url_structure = self.get_url_structure_from_db(
                current_session, url.url_hash)
            result.setdefault(url.url_hash, []).append(url)
        return result

    def _parse_url_from_db_withou_abstract_url(self, url):
        """Build a ``Url`` from a stored document (``abstract_url`` untouched)."""
        result = Url(url['url'], url['depth_of_finding'])
        result.parameters = self.unescape_unloved_signs(result.parameters)
        return result

    def _parse_url_from_db(self, url):
        """Build a ``Url`` from a stored document including ``abstract_url``.

        NOTE(review): documents written by ``insert_url_into_db`` contain no
        ``abstract_url`` key, which makes the lookup below raise KeyError
        unless another writer adds it - confirm.
        """
        result = Url(url['url'], url['depth_of_finding'])
        result.parameters = self.unescape_unloved_signs(result.parameters)
        result.abstract_url = url["abstract_url"]
        return result

    def get_urls_from_db_to_hash(self, current_session, url_hash):
        """Return all URLs of the session with the given ``url_hash``."""
        raw_urls = self.urls.find({
            "session": current_session,
            "url_hash": url_hash
        })
        url_description = self.get_url_structure_from_db(
            current_session, url_hash)
        result = []
        for raw_url in raw_urls:
            url = self._parse_url_from_db(raw_url)
            # NOTE(review): other methods attach this as ``url_structure``.
            url.url_description = url_description
            result.append(url)
        return result

    def visit_url(self, current_session, url, webpage_id, response_code,
                  redirected_to=None):
        """Mark ``url`` (object with ``toString()`` or plain value) as visited."""
        try:
            url_string = url.toString()
        except AttributeError:
            url_string = url
        search_doc = {'url': url_string, 'session': current_session}
        update_doc = {
            'response_code': response_code,
            'visited': True,
            'page_id': webpage_id,
            'redirected_to': redirected_to
        }
        self.urls.update(search_doc, {"$set": update_doc})

    def count_visited_urls_per_hash(self, current_session, url_hash):
        """Count visited URLs of a hash whose response code is in (100, 1000)."""
        all_urls = self.urls.find({
            "session": current_session,
            "url_hash": url_hash,
            "visited": True
        })
        return sum(1 for url in all_urls
                   if 100 < url['response_code'] < 1000)

    def get_url_to_id(self, current_session, id):
        """Return the URL string stored for page id ``id`` or None."""
        result = self.urls.find_one({
            "session": current_session,
            "page_id": id
        })
        if result is None:
            return None
        return result['url']

    def insert_page_into_db(self, current_session, web_page):
        """Store a web page together with its clickables and forms."""
        for clickable in web_page.clickables:
            self._insert_clickable_into_db(current_session, web_page.id,
                                           clickable)
        for form in web_page.forms:
            self.insert_form(current_session, form, web_page.id)
        document = self._create_webpage_doc(web_page, current_session)
        document['ajax_requests'] = []
        document['session'] = current_session
        self.pages.save(document)

    def get_all_pages(self, current_session):
        """Return every stored page of the session as parsed objects."""
        pages = self.pages.find({"session": current_session})
        return [
            self._get_web_page_from_db(current_session=current_session,
                                       page=page) for page in pages
        ]

    def get_webpage_to_url_from_db(self, current_session, url):
        """Return the page stored under ``url`` or None."""
        return self._get_web_page_from_db(current_session, url=url)

    def get_webpage_to_id_from_db(self, current_session, id):
        """Return the page stored under page id ``id`` or None."""
        return self._get_web_page_from_db(current_session, page_id=id)

    def url_visited(self, current_session, url):
        """Return True if a matching document with ``visited: True`` exists.

        NOTE(review): this queries the ``pages`` collection, but the page
        documents built in this class never contain a ``visited`` field
        (only ``urls`` documents do) - confirm the intended collection.
        """
        try:
            url = url.toString()
        except AttributeError:
            pass  # already a plain value
        return self.pages.find_one({
            "session": current_session,
            "url": url,
            "visited": True
        }) is not None

    def _get_web_page_from_db(self, current_session, page_id=None, url=None,
                              page=None):
        """Build a ``WebPage`` from ``page`` or look it up by id or url.

        :return: the page, or None when the lookup finds nothing
        :raises AttributeError: if none of ``page``, ``page_id``, ``url``
            is given
        """
        if page is None:
            if page_id is not None:
                query = {"session": current_session, "web_page_id": page_id}
            elif url is not None:
                query = {"session": current_session, "url": url}
            else:
                raise AttributeError(
                    "You must specifies either page_id or url")
            page = self.pages.find_one(query)
        if page is None:
            return None

        clickables = self.get_all_clickables_to_page_id_from_db(
            current_session, page['web_page_id'])
        forms = self.get_all_forms_to_page_id_from_db(current_session,
                                                      page['web_page_id'])
        result = WebPage(page['web_page_id'], page['url'], page['html'], None,
                         page['current_depth'], page['base_url'])
        result.clickables = clickables
        result.forms = forms
        result.links = [
            self._parse_link_from_db(link) for link in page['links']
        ]
        # The page document only stores the ids of its async requests.
        result.timing_requests = [
            self.get_asyncrequest_to_id(current_session, request_id)
            for request_id in page['timing_requests']
        ]
        result.ajax_requests = [
            self.get_asyncrequest_to_id(current_session, request_id)
            for request_id in page['ajax_requests']
        ]
        return result

    def insert_asyncrequest(self, current_session, ajax_request, web_page_id):
        """Store an async request and its structure.

        The structure document is replaced if one with the same
        ``request_hash`` already exists for the session. The request is
        linked to its triggering clickable (``trigger``) when the request has
        one and it is found, otherwise to ``ajax_request.event`` if present.

        :return: the value returned by ``save`` for the request document
        """
        url_doc = {
            "url": ajax_request.url.complete_url,
            "abstract_url": ajax_request.url.abstract_url,
            "url_hash": ajax_request.url.url_hash
        }
        structure_doc = {
            'request_hash': ajax_request.request_hash,
            "session": current_session,
            'parameters': ajax_request.request_structure.parameters,
        }
        previous_doc = self.async_request_structure.find_one({
            "session": current_session,
            "request_hash": ajax_request.request_hash
        })
        if previous_doc is not None:
            # Reuse the _id so save() overwrites instead of duplicating.
            structure_doc["_id"] = previous_doc['_id']
        self.async_request_structure.save(structure_doc)

        doc = {
            "request_hash": ajax_request.request_hash,
            "url": url_doc,
            "method": ajax_request.method,
            "session": current_session,
        }
        try:
            trigger = self.clickables.find_one({
                "session": current_session,
                "dom_address": ajax_request.trigger.dom_address,
                "web_page_id": web_page_id,
                "event": ajax_request.trigger.event
            })
            doc["trigger"] = trigger["_id"]
        except AttributeError:
            # The request has no ``trigger`` attribute.
            try:
                doc['event'] = ajax_request.event
            except AttributeError:
                logging.debug("This should never happen...")
        except TypeError:
            # find_one() returned None: the trigger is not in the database.
            # format() is used so a non-string event cannot raise here.
            find_string = (
                "session: {} - dom_address: {} - web_page_id: {} - event: {}"
            ).format(current_session, ajax_request.trigger.dom_address,
                     web_page_id, ajax_request.trigger.event)
            logging.debug("Try to find: {}".format(find_string))
        doc['parameters'] = ajax_request.parameters
        return self.async_requests.save(doc)

    def get_asyncrequest_to_id(self, current_session, async_id):
        """Load an async request by ``_id``; None if it does not exist.

        Returns a ``TimingRequest`` when the stored document has an
        ``event`` key, otherwise an ``AjaxRequest`` with its trigger.
        """
        raw_data = self.async_requests.find_one({
            "session": current_session,
            "_id": async_id
        })
        if raw_data is None:
            return None
        raw_structure = self.async_request_structure.find_one({
            "session": current_session,
            "request_hash": raw_data['request_hash']
        })
        structure = AsyncRequestStructure(raw_structure['request_hash'],
                                          raw_structure['parameters'])
        url = Url(raw_data['url']['url'])
        url.abstract_url = raw_data['url']['abstract_url']
        if "event" in raw_data:
            request = TimingRequest(raw_data['method'], url, None,
                                    raw_data['event'],
                                    parameters=raw_data['parameters'])
        else:
            trigger = self.clickables.find_one({"_id": raw_data['trigger']})
            trigger = self._parse_clickable_from_db_to_model(trigger)
            request = AjaxRequest(raw_data['method'], url, trigger,
                                  parameters=raw_data['parameters'])
        request.request_structure = structure
        return request

    def _parse_clickable_from_db_to_model(self, clickable):
        """Build a ``Clickable`` from a stored clickable document."""
        c = Clickable(clickable['event'], clickable['tag'],
                      clickable['dom_address'], clickable['html_id'],
                      clickable['html_class'], clickable['clickable_depth'],
                      clickable['function_id'])
        c.clicked = clickable['clicked']
        c.clickable_type = self._num_to_clickable_type(
            clickable['clickable_type'])
        c.links_to = clickable['links_to']
        c.clickable_depth = clickable['clickable_depth']
        return c

    def _insert_clickable_into_db(self, current_session, web_page_id,
                                  clickable):
        """Store one clickable that belongs to page ``web_page_id``."""
        document = {
            "event": clickable.event,
            "tag": clickable.tag,
            "html_class": clickable.html_class,
            "html_id": clickable.id,
            "dom_address": clickable.dom_address,
            "links_to": clickable.links_to,
            "clicked": clickable.clicked,
            "clickable_type": self._clickable_type_to_num(
                clickable.clickable_type),
            "web_page_id": web_page_id,
            "function_id": clickable.function_id,
            "clickable_depth": clickable.clickable_depth,
        }
        if hasattr(clickable, "random_char"):
            document['random_char'] = clickable.random_char
        document['session'] = current_session
        self.clickables.save(document)

    def insert_delta_page_into_db(self, current_session, delta_page):
        """Store a delta page with its clickables, forms and requests.

        The generating clickable is looked up on the parent page and must
        already be stored.
        """
        for clickable in delta_page.clickables:
            self._insert_clickable_into_db(current_session, delta_page.id,
                                           clickable)
        for form in delta_page.forms:
            self.insert_form(current_session, form, delta_page.id)
        document = self._create_webpage_doc(delta_page, current_session)

        generator = self.clickables.find_one({
            "session": current_session,
            "web_page_id": delta_page.parent_id,
            "dom_address": delta_page.generator.dom_address,
            "event": delta_page.generator.event
        })
        document['generator'] = generator["_id"]
        # Generator requests are stored against the parent page ...
        document["generator_requests"] = [
            self.insert_asyncrequest(current_session, request,
                                     delta_page.parent_id)
            for request in delta_page.generator_requests
        ]
        document['delta_depth'] = delta_page.delta_depth
        document['parent_id'] = delta_page.parent_id
        # ... while ajax requests are stored against the delta page itself.
        document["ajax_requests"] = [
            self.insert_asyncrequest(current_session, request, delta_page.id)
            for request in delta_page.ajax_requests
        ]
        document['session'] = current_session
        self.delta_pages.save(document)

    def get_delta_page_to_id(self, current_session, page_id):
        """Return the delta page with ``page_id`` or None."""
        page = self.delta_pages.find_one({
            "session": current_session,
            "web_page_id": page_id
        })
        if page is None:
            return None
        return self._parse_delta_page_from_db(current_session, page)

    def _create_webpage_doc(self, web_page, current_session):
        """Build the page document; stores the page's timing requests.

        Side effect: every timing request is written through
        ``insert_asyncrequest`` and only its id is kept in the document.
        """
        document = {
            "web_page_id": web_page.id,
            "url": web_page.url,
            "html": web_page.html,
        }
        document["links"] = [
            self._parse_link_to_db_doc(link) for link in web_page.links
        ]
        document['timing_requests'] = [
            self.insert_asyncrequest(current_session, timing_request,
                                     web_page.id)
            for timing_request in web_page.timing_requests
        ]
        document["current_depth"] = web_page.current_depth
        document['base_url'] = web_page.base_url
        return document

    def insert_form(self, current_session, form, page_id):
        """Store ``form`` for page ``page_id``, merging with a stored copy.

        If a form with the same hash exists for the page, the stored values
        of equally named parameters are appended to the new parameter values
        (duplicates removed, first occurrence kept) and the stored document
        is overwritten. Note that ``form.parameter`` values are modified.
        """
        form_hash = form.get_hash()
        stored_form = self.forms.find_one({
            "form_hash": form_hash,
            "session": current_session,
            "web_page_id": page_id
        })
        form_doc = {}
        if stored_form is not None:
            for stored_parameter in stored_form['parameters']:
                for new_parameter in form.parameter:
                    if stored_parameter['name'] == new_parameter.name:
                        merged = new_parameter.values
                        merged.extend(stored_parameter['values'])
                        # Deduplicates the list, keeping first occurrences.
                        new_parameter.values = sorted(set(merged),
                                                      key=merged.index)
            form_doc['_id'] = stored_form["_id"]

        form_doc["web_page_id"] = page_id
        form_doc["method"] = form.method
        form_doc["action"] = {
            "url": form.action.complete_url,
            "abstract_url": form.action.abstract_url,
            "url_hash": form.action.url_hash
        }
        form_doc["dom_address"] = form.dom_address
        form_doc['parameters'] = [
            self._parse_form_parameter_to_db_doc(parameter)
            for parameter in form.parameter
        ]
        form_doc['session'] = current_session
        form_doc['form_hash'] = form_hash
        self.forms.save(form_doc)

    def _parse_link_to_db_doc(self, link):
        """Convert a link object into its stored document form."""
        return {
            'url': {
                "url": link.url.complete_url,
                "abstract_url": link.url.abstract_url,
                "url_hash": link.url.url_hash,
                "depth_of_finding": link.url.depth_of_finding
            },
            'dom_address': link.dom_address,
            'html_id': link.html_id,
            'html_class': link.html_class,
        }

    def _parse_link_from_db(self, link):
        """Build a ``Link`` from its stored document form."""
        url = Url(link['url']['url'])
        url.abstract_url = link['url']['abstract_url']
        url.depth_of_finding = link['url']['depth_of_finding']
        return Link(url, link['dom_address'], link['html_id'],
                    link['html_class'])

    def _parse_form_parameter_to_db_doc(self, form_parameter):
        """Convert a form parameter into its stored document form."""
        return {
            "tag": form_parameter.tag,
            "name": form_parameter.name,
            "values": form_parameter.values,
            "input_type": form_parameter.input_type,
        }

    def set_clickable_clicked(self, current_session, web_page_id,
                              clickable_dom_address, clickable_event,
                              clickable_depth=None, clickable_type=None,
                              links_to=None):
        """Set ``clicked`` to True; optional fields are only set when given.

        :return: the result of the ``update`` call
        """
        search_doc = {
            "web_page_id": web_page_id,
            "dom_address": clickable_dom_address,
            "event": clickable_event,
            'session': current_session,
        }
        set_doc = {}
        if clickable_type is not None:
            set_doc["clickable_type"] = self._clickable_type_to_num(
                clickable_type)
        if links_to is not None:
            set_doc['links_to'] = links_to
        if clickable_depth is not None:
            set_doc['clickable_depth'] = clickable_depth
        set_doc["clicked"] = True
        return self.clickables.update(search_doc, {"$set": set_doc})

    def set_clickable_ignored(self, current_session, web_page_id,
                              clickable_dom_address, clickable_event,
                              clickable_depth=None, clickable_type=None):
        """Mark a clickable as ignored.

        :return: the result of the ``update`` call
        """
        search_doc = {
            "web_page_id": web_page_id,
            "dom_address": clickable_dom_address,
            "event": clickable_event,
            'session': current_session,
        }
        set_doc = {}
        if clickable_type is not None:
            set_doc["clickable_type"] = self._clickable_type_to_num(
                clickable_type)
        if clickable_depth is not None:
            set_doc['clickable_depth'] = clickable_depth
        # NOTE(review): this stores the *string* "False", not the boolean;
        # kept as-is because readers of the field may depend on it - confirm.
        set_doc["clicked"] = "False"
        return self.clickables.update(search_doc, {"$set": set_doc})

    def extend_ajax_requests_to_webpage(self, current_session, webpage,
                                        ajax_requests):
        """Store ``ajax_requests`` and add their ids to the page document.

        Pages with a ``parent_id`` attribute are updated in ``delta_pages``,
        all others in ``pages``.
        """
        ajax_requests_doc = [
            self.insert_asyncrequest(current_session, request, webpage.id)
            for request in ajax_requests
        ]
        if hasattr(webpage, 'parent_id'):
            collection = self.delta_pages
        else:
            collection = self.pages
        collection.update(
            {
                "web_page_id": webpage.id,
                "session": current_session
            }, {"$addToSet": {
                "ajax_requests": {
                    "$each": ajax_requests_doc
                }
            }})

    def _clickable_type_to_num(self, clickable_type):
        """Map a ``ClickableType`` to its stored number (None if unknown)."""
        # The position in this tuple is the number stored in the database.
        known_types = (
            ClickableType.UIChange,
            ClickableType.Link,
            ClickableType.CreatesNewNavigatables,
            ClickableType.Error,
            ClickableType.SendingAjax,
            ClickableType.IgnoredByCrawler,
            ClickableType.UnsupportedEvent,
        )
        for number, known_type in enumerate(known_types):
            if clickable_type == known_type:
                return number
        return None

    def _num_to_clickable_type(self, num):
        """Map a stored number back to its ``ClickableType``.

        :return: None for None
        :raises KeyError: for a number outside 0-6
        """
        if num is None:
            return None
        clickable_types = {
            0: ClickableType.UIChange,
            1: ClickableType.Link,
            2: ClickableType.CreatesNewNavigatables,
            3: ClickableType.Error,
            4: ClickableType.SendingAjax,
            5: ClickableType.IgnoredByCrawler,
            6: ClickableType.UnsupportedEvent
        }
        return clickable_types[num]

    def get_all_clickables_to_page_id_from_db(self, current_session, page_id):
        """Return all clickables stored for ``page_id`` as ``Clickable``s."""
        clickables = self.clickables.find({
            "web_page_id": page_id,
            "session": current_session
        })
        result = []
        for clickable in clickables:
            c = Clickable(clickable['event'], clickable['tag'],
                          clickable['dom_address'], clickable['html_id'],
                          clickable['html_class'],
                          clickable_depth=clickable['clickable_depth'],
                          function_id=clickable['function_id'])
            c.links_to = clickable['links_to']
            c.clickable_type = self._num_to_clickable_type(
                clickable['clickable_type'])
            c.clicked = clickable['clicked']
            result.append(c)
        return result

    def get_all_forms_to_page_id_from_db(self, current_session, page_id):
        """Return all forms stored for ``page_id`` as ``HtmlForm``s."""
        forms = self.forms.find({
            "web_page_id": page_id,
            "session": current_session
        })
        return [self._parse_form_from_db(form) for form in forms]

    def _parse_form_from_db(self, form):
        """Build an ``HtmlForm`` from a stored form document."""
        parameters = [
            FormInput(p['tag'], p['name'], p['input_type'], p['values'])
            for p in form['parameters']
        ]
        action = Url(form['action']["url"])
        action.abstract_url = form['action']["abstract_url"]
        return HtmlForm(parameters, action, form['method'],
                        form["dom_address"])

    def get_all_crawled_deltapages_to_url_from_db(self, current_session, url):
        """Return all delta pages stored for ``url``."""
        pages = self.delta_pages.find({"url": url, "session": current_session})
        return [
            self._parse_delta_page_from_db(current_session, page)
            for page in pages
        ]

    def _parse_delta_page_from_db(self, current_session, page):
        """Build a ``DeltaPage`` from a stored delta page document."""
        clickables = self.get_all_clickables_to_page_id_from_db(
            current_session, page['web_page_id'])
        forms = self.get_all_forms_to_page_id_from_db(current_session,
                                                      page['web_page_id'])
        generator = self.clickables.find_one({"_id": page['generator']})
        generator = self._parse_clickable_from_db_to_model(generator)
        generator_requests = [
            self.get_asyncrequest_to_id(current_session, request_id)
            for request_id in page['generator_requests']
        ]
        result = DeltaPage(page['web_page_id'], page['url'], page['html'],
                           None, page['current_depth'], generator,
                           page['parent_id'], page['delta_depth'])
        result.links = [
            self._parse_link_from_db(link) for link in page['links']
        ]
        result.forms = forms
        result.clickables = clickables
        result.generator_requests = generator_requests
        return result

    def insert_url_structure_into_db(self, current_session, url_description):
        """Insert or overwrite the URL structure for its ``url_hash``."""
        existing = self.url_descriptions.find_one({
            "session": current_session,
            "url_hash": url_description.url_hash
        })
        document = {}
        if existing is not None:
            document["_id"] = existing["_id"]
        document["path"] = url_description.path
        document["parameters"] = self.escape_unloved_signs(
            url_description.parameters)
        document["url_hash"] = url_description.url_hash
        document["session"] = current_session
        self.url_descriptions.save(document)

    def escape_unloved_signs(self, item):
        """Replace ``.`` with ``$$$`` in all keys of ``item``, in place.

        :param item: anything; only dicts are changed
        :return: the same ``item`` object
        """
        if isinstance(item, dict):
            # Iterate over a snapshot of the keys: popping and inserting
            # while iterating the dict itself raises RuntimeError or visits
            # keys repeatedly.
            for old_key in list(item):
                item[old_key.replace(".", "$$$")] = item.pop(old_key)
        return item

    def unescape_unloved_signs(self, item):
        """Replace ``$$$`` with ``.`` in all keys of ``item``, in place.

        :param item: anything; only dicts are changed
        :return: the same ``item`` object
        """
        if isinstance(item, dict):
            # Snapshot of the keys for the same reason as in
            # escape_unloved_signs.
            for old_key in list(item):
                item[old_key.replace("$$$", ".")] = item.pop(old_key)
        return item

    def get_url_structure_from_db(self, current_session, url_hash):
        """Return the ``UrlStructure`` stored for ``url_hash`` or None."""
        result = self.url_descriptions.find_one({
            "session": current_session,
            "url_hash": url_hash
        })
        if result is None:
            return None
        return UrlStructure(result['path'], result["parameters"],
                            result['url_hash'])

    def url_exists(self, current_session, url):
        """Return True if ``url`` is stored in the session's ``urls``."""
        try:
            search_url = url.toString()
        except AttributeError:
            search_url = url
        return self.urls.find({
            "url": search_url,
            "session": current_session
        }).count() > 0

    def write_cluster(self, current_session, url_hash, clusters):
        """Replace the clusters stored for ``url_hash``."""
        self.clusters.remove({
            "session": current_session,
            "url_hash": url_hash
        })
        self.clusters.save({
            "session": current_session,
            "url_hash": url_hash,
            "clusters": clusters
        })

    def get_clusters(self, current_session, url_hash):
        """Return the clusters stored for ``url_hash`` or None."""
        result = self.clusters.find_one({
            "session": current_session,
            "url_hash": url_hash
        })
        try:
            return result["clusters"]
        except TypeError:
            # find_one() returned None.
            return None

    def get_all_url_structures(self, current_session):
        """Return every ``UrlStructure`` stored for the session."""
        raw_data = self.url_descriptions.find({"session": current_session})
        return [
            UrlStructure(url_structure['path'], url_structure["parameters"],
                         url_structure['url_hash'])
            for url_structure in raw_data
        ]

    def get_all_successfully_visited_urls(self, current_session):
        """Return visited URLs whose response code is in (0, 1000)."""
        raw_data = self.urls.find({
            "session": current_session,
            "visited": True
        })
        return [
            self._parse_url_from_db(url) for url in raw_data
            if 0 < url["response_code"] < 1000
        ]

    def get_one_visited_url_per_structure(self, current_session):
        """Return the first visited URL (code in (0, 1500)) per ``url_hash``."""
        raw_data = self.urls.find({
            "session": current_session,
            "visited": True
        })
        result = []
        seen_structures = set()
        for raw_url in raw_data:
            if 0 < raw_url["response_code"] < 1500:
                url = self._parse_url_from_db_withou_abstract_url(raw_url)
                if url.url_hash not in seen_structures:
                    result.append(url)
                    seen_structures.add(url.url_hash)
        return result

    def insert_attack_result(self, current_session, result, attack_url):
        """Store ``result.value`` for ``attack_url``."""
        self.attack.save({
            "session": current_session,
            "attack_url": attack_url,
            "result": result.value
        })

    def get_asyncrequest_structure(self, current_session, structure_hash=None):
        """Return the request structure for ``structure_hash`` or None."""
        if structure_hash is None:
            return None  #TODO: Implement if I need all
        raw_data = self.async_request_structure.find_one({
            "session": current_session,
            "request_hash": structure_hash
        })
        if raw_data is None:
            return None
        return AsyncRequestStructure(raw_data['request_hash'],
                                     raw_data['parameters'])

    def get_all_get_forms(self, current_session):
        """Return the distinct forms of the session with method ``get``."""
        raw_data = self.forms.find({
            "session": current_session,
            "method": "get"
        })
        result = []
        for raw_form in raw_data:
            form = self._parse_form_from_db(raw_form)
            if form not in result:
                result.append(form)
        return result

    def _exclude_dupplicates_forms(self, forms):
        """Keep the first form per (parameters, action URL) combination."""
        result = []
        for form in forms:
            already_in = any(
                self._form_paramters_equal(form.parameter, kept.parameter)
                and form.action.complete_url == kept.action.complete_url
                for kept in result)
            if not already_in:
                result.append(form)
        return result

    def get_one_form_per_destination(self, current_session):
        """Return the session's GET forms without duplicates."""
        all_forms = self.get_all_get_forms(current_session)
        return self._exclude_dupplicates_forms(all_forms)

    def _form_paramters_equal(self, p1, p2):
        """Return True if every parameter of ``p1`` has a match in ``p2``.

        Two parameters match when ``input_type``, ``name`` and ``tag`` are
        equal. The check is one-directional: extra parameters in ``p2`` are
        ignored.
        """
        return all(
            any(param.input_type == param2.input_type
                and param.name == param2.name and param.tag == param2.tag
                for param2 in p2) for param in p1)

    def num_of_ignored_urls(self, current_session, url_hash):
        """Count URLs of ``url_hash`` stored with response code 0."""
        return self.urls.find({
            "url_hash": url_hash,
            "session": current_session,
            "response_code": 0
        }).count()

    def get_id_to_url(self, current_session, url):
        """
        :param current_session:
        :param url:
        :return: PageId if available, if not -1... I cannot use None here, because it might happen that the Id is None because of
        errors or 500 codes or other reasons...
        """
        # Try the URL itself first, then URLs that redirected to it.
        for field in ("url", "redirected_to"):
            raw_data = self.urls.find_one({
                "session": current_session,
                field: url
            })
            if raw_data is not None:
                return raw_data["page_id"]
        return -1

    def get_all_urls_to_domain(self, current_session, domain):
        """Return visited URLs whose URL string contains ``domain``."""
        raw_data = self.urls.find({
            "session": current_session,
            "visited": True
        })
        return [
            self._parse_url_from_db_withou_abstract_url(url)
            for url in raw_data if domain in url["url"]
        ]