Example 1
def check(request):
    """ Check if the passed parameter named 'url' exists in the DB;
    if not, start the spider, otherwise display the result
    for the URL. """

    url = request.POST.get('url', request.COOKIES.get('url', None))

    if not url:
        logger.debug('check called without a URL.')
        messages.error(request, 'URL is missing!')
        return redirect('start')

    domain_name = get_domain_from_url(url)

    if Domains.objects.filter(domain__icontains=domain_name).exists():
        obj = Domains.objects.filter(domain__icontains=domain_name).first()
        msg = 'Domain: {} exists in DB.'.format(obj.domain)
        messages.info(request, msg)
        return redirect('display', domain=obj.domain)

    obj = Domains.objects.create(domain=domain_name, url=url)
    msg = 'Domain: {} was created.'.format(obj.domain)
    messages.success(request, msg)

    return redirect('display', domain=obj.domain)
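
Every example on this page calls a shared helper, get_domain_from_url, whose definition is not part of these snippets. The following is only a minimal, hypothetical sketch of such a helper, built on urllib.parse, so the examples can be read in isolation.

from urllib.parse import urlparse


def get_domain_from_url(url):
    """Hypothetical helper: return the bare host of a URL,
    e.g. 'example.com' for 'https://www.example.com/path'."""
    if '//' not in url:
        url = '//' + url  # urlparse only finds a netloc after a scheme separator
    host = urlparse(url).netloc.lower()
    if host.startswith('www.'):
        host = host[4:]
    return host.split(':')[0]  # drop an explicit port, if present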
Example 2
def add_domain_to_network(request, network_name):
    url = request.POST.get('url', None)
    if not url:
        logger.debug('add_domain_to_network called without a URL.')
        messages.error(request, 'URL is missing!')
        return redirect('start')

    domain_name = get_domain_from_url(url)

    # reuse an existing domain entry, or create it and start the external spider
    if Domains.objects.filter(domain__icontains=domain_name).exists():
        obj = Domains.objects.filter(domain__icontains=domain_name).first()
    else:
        obj = Domains.objects.create(domain=domain_name, url=url)
        localhost = 'http://localhost:6800'
        scrapyd = ScrapydAPI(localhost)
        job_id = scrapyd.schedule('default', 'externalspider',
                                  started_by_domain=obj.domain,
                                  keywords=[])
        ExternalSpider.objects.create(domain=obj,
                                      job_id=job_id)
        obj.status = 'external_started'
        obj.save()

    if not Network.objects.filter(name=network_name).exists():
        msg = 'Network: {} not found!'.format(network_name)
        messages.warning(request, msg)
        return redirect('start')

    nw = Network.objects.filter(name=network_name).first()

    nw.domains.add(obj)
    return redirect('network', network_name=nw.name)
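
Example 2 assumes several Django models (Domains, Network, ExternalSpider) that are not included in the listing. The sketch below is only a hypothetical reconstruction that mirrors the field and filter names used in the snippets; the real project's models will differ.

from django.db import models


class Domains(models.Model):
    domain = models.CharField(max_length=255, unique=True)
    url = models.URLField()
    status = models.CharField(max_length=64, blank=True, default='')


class Network(models.Model):
    name = models.CharField(max_length=255, unique=True)
    keywords = models.TextField(blank=True, null=True)
    domains = models.ManyToManyField(Domains, blank=True)


class ExternalSpider(models.Model):
    domain = models.ForeignKey(Domains, on_delete=models.CASCADE)
    job_id = models.CharField(max_length=64)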
Example 3
    def _is_suspicious(self):
        domain = Domains.objects.get(domain=self.domain)
        my_domain = get_domain_from_url(self.url)
        if my_domain in domain.to_info_scan:
            return True

        info = self._info()
        if info and info.is_suspicious:
            return True

        return False
Example 4
    def _recommend_name(self, item):
        domain = get_domain_from_url(item['url'])
        words = re.split(r'\W+', domain)
        common_top_level_domain = [
            'com',
            'org',
            'de',
            'eu',
            'net',
            'fr',
        ]
        # filtering with a comprehension avoids mutating the list while
        # iterating over it, which would skip elements
        words = [w for w in words if w not in common_top_level_domain]
        match = {}

        tags = [
            'title', 'meta_og_title', 'meta_title', 'name', 'alternative_name'
        ]
        for tag in tags:
            if tag not in item or not item[tag]:
                continue
            compare = item[tag].split(' ')

            for w in words:
                for c in compare:
                    ratio = similar(w, c.lower())
                    if ratio > 0.7:
                        match[tag] = {
                            'ratio': ratio,
                            'w_domain': w,
                            'w_tag': c
                        }

        tmp = 0
        recommend = ''
        for key, values in match.items():
            if float(values['ratio']) > tmp:
                tmp = values['ratio']
                recommend = key
            elif values['ratio'] == tmp and tmp != 0.0:
                recommend = key if len(item[key]) < len(item[recommend]) \
                    else recommend

        return recommend
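
The similar() function used above is not shown on this page. A plausible stand-in, assuming it is a thin wrapper around difflib, would be:

from difflib import SequenceMatcher


def similar(a, b):
    """Hypothetical helper: similarity ratio between 0.0 and 1.0."""
    return SequenceMatcher(None, a, b).ratio()

With such an implementation, similar('example', 'Example'.lower()) returns 1.0, which is why the caller lowercases the tag words before comparing.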
Example 5
    def errback_urls(self, failure):
        # https://stackoverflow.com/questions/13724730/how-to-get-the-scrapy-failure-urls
        # log all failures
        # self.logger.error(repr(failure))
        if failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            # self.logger.error('DNSLookupError on %s', request.url)
            url = 'http://www.' + \
                get_domain_from_url(request.url)
            request = scrapy.Request(url,
                                     callback=self.parse_urls,
                                     meta={
                                         'dont_retry': True,
                                         'domain': url
                                     })
            return request
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.info('HttpError on %s', response.url)
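
An errback only runs if it is attached to the outgoing requests. The spider's real entry point is not shown here; the method below is a hypothetical start_requests illustrating how errback_urls could be wired up, assuming the class defines start_urls and imports scrapy.

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url,
                                 callback=self.parse_urls,
                                 errback=self.errback_urls,
                                 meta={'domain': get_domain_from_url(url)})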
Example 6
    def parse_urls(self, response):
        item = {}
        # get title
        item['title'] = response.xpath('//title/text()').get()
        item['meta_title'] = response.xpath(
            "//meta[@name='title']/@content").get()
        item['meta_og_title'] = response.xpath(
            "//meta[@property='og:title']/@content").get()

        # clean title from startseite, home etc.
        for k, v in item.items():
            if not v:
                continue
            item[k] = self._clean_title(v)
        req_url = 'http://' + response.request.meta['download_slot']
        item['domain'] = get_domain_from_url(req_url)
        item['url'] = response.url

        # get description
        item['meta_description'] = response.xpath(
            "//meta[@name='description']/@content").get()
        item['meta_og_description'] = response.xpath(
            "//meta[@property='og:description']/@content").get()

        # get keywords
        item['meta_keywords'] = response.xpath(
            "//meta[@name='keywords']/@content").get()
        item['meta_og_keywords'] = response.xpath(
            "//meta[@property='og:keywords']/@content").get()

        # get imprint for finding potential zip and name of company
        imprint_keywords = [
            'impressum', 'imprint', 'legalnotices', 'privacy',
            'policy', 'legaldisclosure', 'corporate-info', 'terms-of-service',
            'contact', 'kontakt', 'about'
        ]
        # the (?i) inline flag must sit at the start of the pattern
        imprint_link_extractor = LinkExtractor(
            allow=(r'(?i)/(' + '|'.join(imprint_keywords) + ')'),
            unique=True).extract_links(response)
        urls_with_imprint_keyword = [i.url for i in imprint_link_extractor]

        if len(urls_with_imprint_keyword) > 1:
            for key in imprint_keywords:
                for url in urls_with_imprint_keyword:
                    if key in url.lower():
                        item['imprint'] = url
                        request = scrapy.Request(item['imprint'],
                                                 callback=self.parse_impressum,
                                                 dont_filter=True,
                                                 errback=self.errback_urls)
                        request.meta['item'] = item
                        return request
        elif len(urls_with_imprint_keyword) == 1:
            item['imprint'] = urls_with_imprint_keyword[0]
            request = scrapy.Request(item['imprint'],
                                     callback=self.parse_impressum,
                                     dont_filter=True,
                                     errback=self.errback_urls)
            request.meta['item'] = item
            return request
        else:
            domain = item['domain']
            item['tip'] = self._recommend_name(item)
            self.domains[domain] = clean_dict(item)
            return self.domains[domain]
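
clean_dict() is not defined in these snippets either; presumably it strips empty values from the item before it is cached per domain. A minimal sketch under that assumption:

def clean_dict(d):
    """Hypothetical helper: drop keys whose values are None or empty."""
    return {k: v for k, v in d.items() if v}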
Example 7
def add(request):
    logger.debug('received post: {}'.format(request.POST))
    url = request.POST.get('url', request.COOKIES.get('url', None))
    name = request.POST.get('name', None)
    keywords = request.POST.get('keywords', None)

    if not url or not name:
        logger.debug('received post: {}'.format(request.POST))
        messages.error(request, 'Name/URL of network is missing! Retry.')
        return redirect('start')

    # getting or creating the related domain for url
    domain_name = get_domain_from_url(url)
    obj_domain = None
    if Domains.objects.filter(domain__icontains=domain_name).exists():
        obj_domain = Domains.objects.filter(domain__icontains=domain_name) \
            .first()
        msg = 'Domain: {} exists in DB.'.format(obj_domain.domain)
        # messages.info(request, msg)
        logger.info(msg)

    if not obj_domain:
        obj_domain = Domains.objects.create(domain=domain_name,
                                            url=url)
        msg = 'Domain: {} was created.'.format(obj_domain.domain)
        messages.success(request, msg)

        # TODO: decide whether to start the spider right here or to wait
        # until the display page of the created domain is visited. For now
        # the network is primarily for displaying DB results in a graph and
        # not for starting specific scans; for that the user is advised to
        # visit the given actor.

    # getting / creating related network
    obj_network = None
    if Network.objects.filter(name=name).exists():
        obj_network = Network.objects.filter(name=name).first()
        msg = 'Network: {} exists in DB.'.format(obj_network)
        # messages.info(request, msg)
        logger.info(msg)
    else:
        obj_network = Network.objects.create(name=name,
                                             keywords=keywords)
        msg = 'Network: {} successfully created.'.format(obj_network)
        messages.success(request, msg)

    # actually: network.domains.add(create, related)
    # obj_network.domains.add(obj_domain, through_defaults={'related': True})
    obj_network.domains.add(obj_domain)

    # beatles.members.set([john, paul, ringo, george],
    #    through_defaults={'date_joined': date(1960, 8, 1)})
    # https://docs.djangoproject.com/en/dev/topics/db/models/#extra-fields-on-many-to-many-relationships

    # getting / setting relationship between network and url
    # rel, created = Relation.objects.get_or_create(domain=obj_domain,
    #                                               network=obj_network,
    #                                               related=True)
    # if created:
    #     msg = 'Domain {} is now related to Network {}'.format(obj_domain,
    #                                                           obj_network)
    #     messages.success(request, msg)

    # TODO redirect network/network_name
    return redirect('network', network_name=obj_network.name)
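
The commented-out through_defaults / Relation code above hints that Network.domains may be (or once was) a many-to-many relation with an explicit through model. A hypothetical sketch of that variant, following the Django documentation linked in the comments:

from django.db import models


class Relation(models.Model):
    domain = models.ForeignKey('Domains', on_delete=models.CASCADE)
    network = models.ForeignKey('Network', on_delete=models.CASCADE)
    related = models.BooleanField(default=False)

With Network.domains declared as ManyToManyField('Domains', through=Relation), the commented-out call obj_network.domains.add(obj_domain, through_defaults={'related': True}) would populate the extra field (supported on through relations since Django 2.2).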
Example 8
    def _status(self):
        domain = get_domain_from_url(self.url)
        if not Domains.objects.filter(domain=domain).exists():
            return 'not created yet'
        obj = Domains.objects.get(domain=domain)
        return obj.status
Example 9
    def _is_being_crawled(self):
        domain = get_domain_from_url(self.url)
        if not Domains.objects.filter(domain=domain).exists():
            return False
        obj = Domains.objects.get(domain=domain)
        return obj.is_being_crawled
Example 10
    def _fullscan(self):
        domain = get_domain_from_url(self.url)
        if Domains.objects.filter(domain=domain).exists():
            return Domains.objects.get(domain=domain).fullscan
        return False
Example 11
    def _info(self):
        domain = get_domain_from_url(self.url)
        if Domains.objects.filter(domain=domain).exists():
            return Domains.objects.get(domain=domain).info
        return None
Example 12
    def _get_domain(self):
        return get_domain_from_url(self.url)