Beispiel #1
0
 def _parse_link_from_db(self, link):
     """Rebuild a Link model (and its embedded Url) from a DB document."""
     url_doc = link['url']
     url = Url(url_doc['url'])
     url.abstract_url = url_doc['abstract_url']
     url.depth_of_finding = url_doc['depth_of_finding']
     return Link(url, link['dom_address'], link['html_id'],
                 link['html_class'])
    def __runDetector(self, id, url, is_new):
        """Scan a Craigslist search page for San Diego listing links.

        Each matching anchor's listing URL is persisted as a Url record
        when ``is_new`` is true, stopping at the first already-known
        listing. When no anchor matches at all, the detector reschedules
        itself to run again at the next configured runtime.
        """
        page = requests.get(url)
        soup = bs4.BeautifulSoup(page.text, 'html.parser')

        match_found = False
        for anchor in soup.find_all("a"):
            text = str(anchor)
            # Listing anchors carry a data-id attribute; restrict matches
            # to the sandiego subdomain.
            if "data-id=" not in text or "sandiego" not in text:
                continue
            match_found = True
            # Slice the href value out of the serialized tag:
            # from just past 'href="' up to and including '.html'.
            start = text.find("href=") + 6
            end = text.find(".html") + 5
            hyperlink = text[start:end]

            if is_new:
                record = {
                    "hyperlink": hyperlink,
                    "keywords": id,
                    "is_deleted": False,
                }
                # Stop as soon as a known listing shows up: everything
                # after it was already processed on a previous run.
                if Url.find_url_by_hyperlink(record['hyperlink'],
                                             record['keywords']):
                    return
                Url(**record).save()

        if not match_found:
            # Nothing matched yet: schedule another detector run.
            DetectorManager.__job_scheduler.add_job(
                self.__runDetector,
                'date',
                run_date=self.__getRuntime(),
                args=(id, url, is_new))
Beispiel #3
0
 def get_asyncrequest_to_id(self, current_session, async_id):
     """Load the async request with the given id and rebuild its model.

     Returns a TimingRequest when the stored document carries an 'event'
     key, otherwise an AjaxRequest with its parsed clickable trigger.
     Returns None when no document matches (current_session, async_id).
     """
     raw_data = self.async_requests.find_one({
         "session": current_session,
         "_id": async_id
     })
     if raw_data is None:
         return None
     # NOTE(review): assumes a structure document always exists for the
     # request's hash — a missing one would raise TypeError below.
     raw_structure = self.async_request_structure.find_one({
         "session":
         current_session,
         "request_hash":
         raw_data['request_hash']
     })
     structure = AsyncRequestStructure(raw_structure['request_hash'],
                                       raw_structure['parameters'])
     url = Url(raw_data['url']['url'])
     url.abstract_url = raw_data['url']['abstract_url']
     if "event" in raw_data:
         # Event-driven request (e.g. a timer): no clickable trigger.
         tmp = TimingRequest(raw_data['method'],
                             url,
                             None,
                             raw_data['event'],
                             parameters=raw_data['parameters'])
         tmp.request_structure = structure
     else:
         # Clickable-driven request: resolve and parse the trigger element.
         trigger = self.clickables.find_one({"_id": raw_data['trigger']})
         trigger = self._parse_clickable_from_db_to_model(trigger)
         tmp = AjaxRequest(raw_data['method'],
                           url,
                           trigger,
                           parameters=raw_data['parameters'])
         tmp.request_structure = structure
     return tmp
Beispiel #4
0
 def _parse_form_from_db(self, form):
     """Reconstruct an HtmlForm model from its stored DB document."""
     inputs = [
         FormInput(p['tag'], p['name'], p['input_type'], p['values'])
         for p in form['parameters']
     ]
     action = Url(form['action']["url"])
     action.abstract_url = form['action']["abstract_url"]
     return HtmlForm(inputs, action, form['method'], form["dom_address"])
Beispiel #5
0
 def _parse_form_from_db(self, form):
     """Reconstruct an HtmlForm model from its stored DB document.

     Each entry in form['parameters'] becomes a FormInput; the form's
     action URL is rebuilt together with its abstract form.
     """
     parameters = []
     for p in form['parameters']:
         form_input = FormInput(p['tag'], p['name'], p['input_type'],
                                p['values'])
         parameters.append(form_input)
     action = Url(form['action']["url"])
     action.abstract_url = form['action']["abstract_url"]
     return HtmlForm(parameters, action, form['method'],
                     form["dom_address"])
Beispiel #6
0
    def post(self):
        """Create a new Url record from the parsed request arguments.

        Returns the saved record as JSON with 201, or an error payload
        with 500 when persistence fails.
        """
        data = ApiUrl.parser.parse_args()
        url = Url(data['login_url'], data['visit_url'])
        try:
            url.save_to_db()
        # Narrowed from a bare except: don't swallow SystemExit or
        # KeyboardInterrupt. (A leftover debug print was also removed.)
        except Exception:
            return {'message' : 'Failed to save'}, 500

        return url.json(), 201
Beispiel #7
0
    def test_url_set(self):
        """Inserting the same URL twice must not create a duplicate row."""
        first = Url(TEST_URL1, depth_of_finding=3)
        second = Url(TEST_URL2, depth_of_finding=25)

        # First insert adds a row; re-inserting the identical URL is a no-op.
        self.database.insert_url_into_db(SESSION, first)
        self.assertEqual(self.database.urls.count(), 1)
        self.database.insert_url_into_db(SESSION, first)
        self.assertEqual(self.database.urls.count(), 1)
        # A distinct URL adds a second row.
        self.database.insert_url_into_db(SESSION, second)
        self.assertEqual(self.database.urls.count(), 2)
Beispiel #8
0
def _url_collector(entry_url):
    """Collect the URLs reachable from entry_url and persist them.

    Returns the stored document, or None when no links were found.
    """
    url = Url(entry_url)
    collected = url.get_urls_from_entry_url()

    if collected is None:
        return None

    return db.create_document({'entry_url': url.url, 'url_list': collected})
    def handle_url(self, new_url, requested_url):
        """Coerce new_url to a Url model and enrich it with derived data.

        If requested_url (presumably the page the link was found on —
        confirm against callers) is given, the abstract URL is computed
        relative to it. The URL structure is only (re)computed for URLs
        not yet present in the database.
        """
        if not isinstance(new_url, Url):
            new_url = Url(new_url)
        if requested_url is not None:
            if not isinstance(requested_url, Url):
                requested_url = Url(requested_url)
            new_url.abstract_url = self.calculate_abstract_url(new_url, requested_url)

        if not self.database_manager.url_exists(new_url):
            new_url.url_structure = self.calculate_url_structure(new_url)
        return new_url
Beispiel #10
0
    def test_url_visit(self):
        """Crawling order follows insertion order; visiting advances it."""
        shallow = Url(TEST_URL1, depth_of_finding=3)
        deep = Url(TEST_URL2, depth_of_finding=25)

        self.database.insert_url_into_db(SESSION, shallow)
        self.database.insert_url_into_db(SESSION, deep)

        first_out = self.database.get_next_url_for_crawling(SESSION)
        self.database.visit_url(SESSION, first_out, 25, 200)
        second_out = self.database.get_next_url_for_crawling(SESSION)

        # First inserted URL is handed out first; after it is visited,
        # the second inserted URL comes up next.
        self.assertEqual(shallow, first_out)
        self.assertEqual(deep, second_out)
Beispiel #11
0
def post_url():
    """Create a Url owned by the current user from the JSON request body."""
    payload = request.get_json()
    new_url = Url(user_id=get_user(), **payload)
    db.session.add(new_url)
    db.session.commit()

    return jsonify(urls=[str(new_url)])
Beispiel #12
0
 def get_asyncrequest_to_id(self, current_session, async_id):
     """Fetch one async request by id and rebuild its model object.

     Returns a TimingRequest when the stored document has an 'event' key,
     otherwise an AjaxRequest with its parsed clickable trigger; returns
     None when no document matches (current_session, async_id).
     """
     raw_data = self.async_requests.find_one({"session": current_session, "_id": async_id})
     if raw_data is None:
         return None
     # NOTE(review): assumes a structure document exists for the request's
     # hash — a missing one would raise TypeError below.
     raw_structure = self.async_request_structure.find_one({"session": current_session, "request_hash": raw_data['request_hash']})
     structure = AsyncRequestStructure(raw_structure['request_hash'], raw_structure['parameters'])
     url = Url(raw_data['url']['url'])
     url.abstract_url = raw_data['url']['abstract_url']
     if "event" in raw_data:
         # Event-driven request (e.g. a timer): no clickable trigger.
         tmp = TimingRequest(raw_data['method'], url, None, raw_data['event'], parameters=raw_data['parameters'])
         tmp.request_structure = structure
     else:
         # Clickable-driven request: resolve and parse the trigger element.
         trigger = self.clickables.find_one({"_id": raw_data['trigger']})
         trigger = self._parse_clickable_from_db_to_model(trigger)
         tmp = AjaxRequest(raw_data['method'], url, trigger, parameters=raw_data['parameters'])
         tmp.request_structure = structure
     return tmp
Beispiel #13
0
 def put(self, id):
     """Upsert the Url record with the given id from the parsed request args."""
     data = ApiUrl.parser.parse_args()
     existing = Url.find_by_id(id)
     if existing is not None:
         # Existing record: overwrite both fields in place.
         existing.login_url = data['login_url']
         existing.visit_url = data['visit_url']
         url = existing
     else:
         # No record yet: create one from the request payload.
         url = Url(data['login_url'], data['visit_url'])
     url.save_to_db()
     return url.json()
Beispiel #14
0
    def get(self):
        """Return all URLs matching the requested keywords, serialized."""
        args = UrlController.__getParser.parse_args()
        matches = Url.find_url_by_keywords(args['keywords'])
        return {'urls': [entry.json() for entry in matches]}
Beispiel #15
0
def bfs(starting_url):
    """Breadth-first crawl starting from *starting_url*.

    Each unvisited URL is stored, its domain entry is created, and the
    URLs it links to are appended to the work queue. The crawl is
    best-effort: a URL that raises is skipped rather than aborting the
    whole traversal.
    """
    from collections import deque  # O(1) popleft vs list.pop(0)'s O(n)
    queue = deque([starting_url])

    while queue:
        url_string = queue.popleft()
        try:
            if not is_url_visited(url_string):
                add_entry_to_db(Url(url_string=url_string))

            domain = create_domain(url_string=url_string)
            add_entry_to_db(domain)

            queue.extend(get_urls_and_add_server(url_string))
        except Exception:
            # Deliberate best-effort: skip failing URLs and keep crawling.
            # (The original bound the exception to an unused name.)
            continue
Beispiel #16
0
def parse_row(row):
    """Parse one CSV row of 'lat,lon,[url1|url2|...]' into a Url.

    Returns None for an empty row.
    """
    if len(row) == 0:
        return None

    # NOTE(review): assumes no field itself contains a comma — confirm format.
    fields = row.split(',')
    # fields[0] = latitude, fields[1] = longitude,
    # fields[2] = '|'-separated url list wrapped in one bracket char per side.
    url_field = fields[2][1:-1]
    return Url(fields[0], fields[1], url_field.split('|'))
Beispiel #17
0
 def add_webpage_to_cluster(self, webpage):
     """Insert *webpage* into the cluster set keyed by its URL hash.

     First page for a hash starts a fresh singleton cluster; otherwise
     all existing cluster members are flattened, the new page is added,
     and the clusters are recomputed and written back.
     """
     url = Url(webpage.url)
     clusters = self._persistence_manager.get_clusters(url.url_hash)
     if clusters is None:
         # First page seen for this hash: store a fresh singleton cluster.
         self._persistence_manager.write_clusters(url.url_hash,
                                                  [webpage.id])
         return
     # Flatten stored clusters (mixed ints and lists) into one member list.
     members = []
     for c in clusters:
         if isinstance(c, list):
             members.extend(c)
         else:
             members.append(c)
     members.append(webpage.id)
     new_clusters = self.hierarchical_clustering(members, CLUSTER_THRESHOLD)
     # BUGFIX: the original called remove()/insert() on new_clusters while
     # iterating it, which skips elements. Rebuild the list instead,
     # wrapping bare ints in lists so Mongo stores each singleton cluster
     # as its own list.
     new_clusters = [[c] if isinstance(c, int) else c
                     for c in new_clusters]
     self._persistence_manager.write_clusters(url.url_hash, new_clusters)
 def attack_single_url(self, url, replacement=False):
     """Run XSS attacks against *url*.

     With replacement=False the URL is attacked once as-is using a static
     marker. With replacement=True every query parameter is attacked in
     turn: its value is replaced by each attack vector (carrying a random
     marker) while all other parameters keep their original first value.
     """
     if not replacement:
         attack_url = url
         result, _ = self._xss.attack(attack_url, "123")
         logging.debug("Result: {}".format(result))
         return
     url = Url(url)
     for parameter_to_attack in url.parameters:
         for vector in self._xss_vector.attack_vectors:
             attack_url = url.scheme + "://" + url.domain + url.path + "?"
             # Random marker so the detector can recognize the reflected
             # payload. (Removed a dead, misspelled `ramdom_val = "123"`
             # assignment that never took effect in the original.)
             random_val = self._xss_vector.random_number_generator(12)
             for other_parameters in url.parameters:
                 if parameter_to_attack == other_parameters:
                     attack_url += other_parameters + "=" + vector.replace(
                         "XSS", random_val) + "&"
                 else:
                     attack_url += other_parameters + "=" + url.parameters[
                         other_parameters][0] + "&"
             attack_url = attack_url[:-1]  # Drop the trailing "&"
             logging.debug("Attack with: {}".format(attack_url))
             result, _ = self._xss.attack(
                 attack_url, random_val)
             logging.debug("Result: {}".format(result))
Beispiel #19
0
 def createUrl(self, url):
     """Persist a new Url row and queue the post-commit callback."""
     created = Url(url)
     self.session.add(created)
     self.session.commit()
     # Defer the notification until the action list is drained.
     self._actions.append(lambda: self._urlCreated(url))
     return created
Beispiel #20
0
    def delete(self):
        """Delete the Url identified by the parsed 'id' argument."""
        args = UrlController.__deleteParser.parse_args()
        # NOTE(review): no None guard — assumes the id always resolves.
        target = Url.find_url_by_id(args['id'])
        target.deleteUrl()

        return {"message": "Posting deleted Successfully."}, 201
Beispiel #21
0
 def getUrl(self):
     """Return the Url record referenced by this object's url_id.

     NOTE(review): Url is passed explicitly as the first argument, which
     suggests find_by_id is a plain function expecting the class/instance
     positionally — confirm against the Url model's definition.
     """
     return Url.find_by_id(Url, self.url_id)
Beispiel #22
0
 def get(self, id=None):
     """Return the Url with the given id as JSON, or a 404 payload.

     NOTE(review): this resource's `self` is passed as find_by_id's first
     positional argument — looks suspicious; confirm the Url.find_by_id
     signature before relying on it.
     """
     url = Url.find_by_id(self, id)
     if url:
         return url.json()
     return {'message' : 'Logindata not found'}, 404
Beispiel #23
0
 def delete(self, id):
     """Delete the Url record with the given id, if it exists."""
     # BUGFIX: the id must be forwarded to the lookup; the original
     # called Url.find_by_id() with no argument.
     url = Url.find_by_id(id)
     if url:
         url.delete_from_db()
Beispiel #24
0
 def test_url_set_and_get(self):
     """A stored URL round-trips through the DB with its depth intact."""
     stored = Url(TEST_URL1, depth_of_finding=3)
     self.database.insert_url_into_db(SESSION, stored)
     fetched = self.database.get_next_url_for_crawling(SESSION)
     self.assertEqual(stored, fetched)
     self.assertEqual(fetched.depth_of_finding, 3)
Beispiel #25
0
 def get(self):
     """List every stored Url, serialized to JSON."""
     all_urls = Url.find_all(Url)
     return {'urls' : [Url.json(entry) for entry in all_urls]}
Beispiel #26
0
 def _parse_url_from_db(self, url):
     """Rebuild a Url model from its stored DB document."""
     parsed = Url(url['url'], url['depth_of_finding'])
     # Parameters were escaped before storage; undo that on the way out.
     parsed.parameters = self.unescape_unloved_signs(parsed.parameters)
     parsed.abstract_url = url["abstract_url"]
     return parsed
Beispiel #27
0
 def _parse_link_from_db(self, link):
     """Rebuild a Link model (and its embedded Url) from a DB document."""
     url = Url(link['url']['url'])
     url.abstract_url = link['url']['abstract_url']
     url.depth_of_finding = link['url']['depth_of_finding']
     result = Link(url, link['dom_address'], link['html_id'], link['html_class'])
     return result
Beispiel #28
0
 def _parse_url_from_db(self, url):
     """Rebuild a Url model from its stored DB document.

     Parameters were escaped before storage; they are unescaped here.
     """
     result = Url(url['url'], url['depth_of_finding'])
     result.parameters = self.unescape_unloved_signs(result.parameters)
     result.abstract_url = url["abstract_url"]
     return result
Beispiel #29
0
from unittest import TestCase
import mock
from models.url import Url

# Module-level fixture shared by the tests below; the Url constructor is
# expected to sanitize the bare host (see test_set_url_with_sanitize).
url = Url('www.google.com.br')


class MockResponse:
    """Minimal stand-in for an HTTP response used by the tests.

    Note that unlike requests.Response, ``text`` here is a method, not a
    property, and returns the same canned payload as ``json()``.
    """

    def __init__(self, json_data, status_code):
        # Canned payload and HTTP status the mock should report.
        self.json_data, self.status_code = json_data, status_code

    def json(self):
        """Return the canned payload."""
        return self.json_data

    def text(self):
        """Return the canned payload (same object as json())."""
        return self.json_data


class UrlTests(TestCase):
    @staticmethod
    def test_set_url_with_sanitize():
        assert url.url == 'http://www.google.com.br'

    @mock.patch('requests.get')
    def test_get_urls_from_entry_url_mock_requests(self, mock_get):
        mock_resp = self._mock_response(
            text='<html><a href="http://www.google.com">Google</a>'
            '<a href="https://gist.github.com">github</a></html>')
        mock_get.return_value = mock_resp