Example No. 1
def main():
    key = sys.argv[1]
    db = "../gsb_v4.db"
    platforms = ["WINDOWS"]
    sbl = SafeBrowsingList(key, db_path=db, platforms=platforms)
    #sbl.update_hash_prefix_cache()
    print(sbl.storage.get_threat_lists())

    url = sys.argv[2]
    u = URL(url)
    print(u.url)
    print(u.canonical)
    for i in u.url_permutations(u.canonical):
        print(i)
        print(u.digest(i))
    url_hashes = u.hashes
    print(url_hashes)

    full_hashes = list(url_hashes)
    print(full_hashes)

    cues = [to_hex(fh[0:4]) for fh in full_hashes]
    print(cues)

    print(sbl.storage.lookup_hash_prefix(cues))
    bl = sbl.lookup_url(url)
    print(bl)
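This snippet omits its imports; a minimal sketch of what it assumes, based on the module paths used in Example No. 31 further down (the script name in the usage comment is illustrative):

import sys
from gglsbl import SafeBrowsingList
from gglsbl.protocol import URL
from gglsbl.utils import to_hex

if __name__ == '__main__':
    # usage (hypothetical script name): python gsb_debug.py <API_KEY> <URL>
    main()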
Example No. 2
def main():
    sbl = SafeBrowsingList(config.gsb_key, db_path=config.gsb_db_path)
    #result = sbl.lookup_url('http://www.amazon.esp.bravaidea.com/AWS/mobil/signin.php?https://www.amazon.com/gp/product/B00DBYBNEE/ref=nav_prime_try_btn')
    #print(result)
    dbo = db_operations.DBOperator()
    while True:
        slds = dbo.get_gsb_queryable_slds()
        urls = dbo.get_gsb_queryable_urls()
        domain_urls = slds+urls
        print(len(domain_urls))
        query_time = datetime.now()
        print ("GSB Update time:", str(query_time))
        run_sync(sbl)
        print ("Got updated GSB list. Now looking up %s domains: %s" % (
                    len(domain_urls), str(datetime.now())))
        for d in domain_urls:
            print(d)
            try:
                result = sbl.lookup_url(d)
                print(result)
                result = "%s" % (result,)
                dbo.update_gsb_table(d, result, query_time)
            except Exception as e:
                print ("Exception. Skipping this domain: ", d, e)
            #print result
        print ("Done inserting into DB. Will update GSB list again", str(datetime.now()))
        time.sleep(3600)
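run_sync() is not shown in this example. A hedged sketch of what it might do, assuming it simply wraps the hash-prefix cache update with error handling (the real helper may differ):

import logging

def run_sync(sbl):
    # assumed behaviour: refresh the local hash prefix cache, logging failures
    try:
        sbl.update_hash_prefix_cache()
    except Exception:
        logging.exception('failed to sync GSB hash prefix cache')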
Example No. 3
def main():
    total_attacks = 0
    sbl = SafeBrowsingList(gsb_key, db_path=gsb_db_path)
    all_home_domains = set()
    counted_hashes = set()
    for name, labels in se_categories.iteritems():
        camp_domains = set()
        camp_gsb_clusters = 0
        camp_total_count = 0
        home_domains_campaign_set = set()
        for label in labels:
            cluster_gsb = False
            cluster_domains = set()
            ad_objs = get_ad_objects(str(label))
            # camp_total_count += len(ad_objs)
            for ad_obj in ad_objs:
                domain = extractor(ad_obj.ad_url).registered_domain
                e = extractor(ad_obj.ad_url)
                land_fqd =  '.'.join(part for part in e if part) 
                # home_domain = extractor(ad_obj.home_url).registered_domain
                e = extractor(seed_domain_data[ad_obj.log_id][0])
                home_domain = '.'.join(part for part in e if part) 
                # camp_domains.add(domain)
                if ad_obj.screenshot_hash in image_domain_dict:
                # if domain in all_land_tlds:
                    if ad_obj.screenshot_hash not in counted_hashes:
                        mal_ad_hashes.add(ad_obj.screenshot_hash)
                        camp_total_count += image_hash_count[ad_obj.screenshot_hash]
                        counted_hashes.add(ad_obj.screenshot_hash)
                        # home_domains_campaign_set = home_domains_campaign_set.union(image_home_domain_dict[ad_obj.screenshot_hash])
                        home_domains_campaign_set = home_domains_campaign_set.union(image_seed_domain_dict[ad_obj.screenshot_hash])
                        camp_domains = camp_domains.union(image_domain_dict[ad_obj.screenshot_hash])
                        cluster_domains = cluster_domains.union(image_domain_dict[ad_obj.screenshot_hash])
                        # all_home_domains = all_home_domains.union(image_home_domain_dict[ad_obj.screenshot_hash])
                        all_home_domains = all_home_domains.union(image_seed_domain_dict[ad_obj.screenshot_hash])
                else:
                    print "!!Not here!!"
                    camp_domains.add(land_fqd)
                    cluster_domains.add(land_fqd)
                    camp_total_count += 1
                    # import ipdb; ipdb.set_trace()
                    all_home_domains.add(home_domain)
            for domain in cluster_domains:
                if not domain:
                    continue
                # print "domain:", domain
                result = sbl.lookup_url(domain.strip())
                if result:
                    cluster_gsb = True
            if cluster_gsb:
                camp_gsb_clusters += 1
            # print result
        print name, '\t&',  camp_total_count, '\t&', len(camp_domains), '\t&', len(labels), '\t&', camp_gsb_clusters, '\\\\'
        print len(home_domains_campaign_set)
        total_attacks += camp_total_count
    print "# of mal ad hashes:", len(mal_ad_hashes)
    print "# of unique publisher domains associated with SE ads:", len(all_home_domains)
    print "# of total SE attacks:", total_attacks
    # get_category_stats(all_home_domains)
    get_popularity_stats(all_home_domains)
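As the loop above relies on, lookup_url() returns None when no threat list matches and the matching lists otherwise, so a truthy check is enough to flag a cluster. A minimal illustration using the same module-level names (the URL is Google's public Safe Browsing test page):

sbl = SafeBrowsingList(gsb_key, db_path=gsb_db_path)
match = sbl.lookup_url('http://testsafebrowsing.appspot.com/s/malware.html')
print('blacklisted' if match else 'clean', match)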
Example No. 4
def update_hash_prefix_cache():
    active = get_active()
    if active and active['ctime'] and active['mtime'] and min(
            active['ctime'], active['mtime']) >= (time.time() - (30 * 60)):
        # no need to update, active DB exists and is recent
        logger.info('active database is fresh')
        inactive = get_inactive()
        # remove inactivate database if it exists to free up disk space
        remove_inactive(inactive)
    else:
        # we need to update the inactive DB, so get its info and delete it
        inactive = get_inactive()
        remove_inactive(inactive)

        # download to temporary file name
        tmp_file = inactive['name'] + '.tmp'
        logger.info('downloading database to ' + tmp_file)
        sbl = SafeBrowsingList(gsb_api_key, tmp_file, True)
        sbl.update_hash_prefix_cache()
        logger.info("finished creating " + tmp_file)

        # rename to inactive file name
        if path.isfile(tmp_file + JOURNAL):
            rename(tmp_file + JOURNAL, inactive['name'] + JOURNAL)
            logger.info("renamed " + tmp_file + JOURNAL + ' to ' +
                        inactive['name'] + JOURNAL)
        rename(tmp_file, inactive['name'])
        logger.info("renamed " + tmp_file + ' to ' + inactive['name'])
Example No. 5
def app_lookup(url):
    # input validation
    if not isinstance(url, (str, unicode)):
        abort(400)

    # resolve entries
    active = get_active()
    if not active or not active['mtime']:
        abort(503)
    try:
        sbl = SafeBrowsingList(gsb_api_key, active['name'], True)
        resp = sbl.lookup_url(url)
    except:
        app.logger.exception("exception handling [" + url + "]")
        abort(500)
    else:
        if resp:
            matches = [{
                'threat': x.threat_type,
                'platform': x.platform_type,
                'threat_entry': x.threat_entry_type
            } for x in resp]
            return jsonify({'url': url, 'matches': matches})
        else:
            abort(404)
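For context, app_lookup() is a Flask view; a hedged sketch of how it might be wired up and the shape of a successful response (the route is an assumption, the response keys come straight from the code above):

from flask import Flask

app = Flask(__name__)

@app.route('/lookup/<path:url>')   # assumed route
def lookup(url):
    return app_lookup(url)

# A hit returns JSON shaped like:
# {"url": "...", "matches": [{"threat": "MALWARE",
#                             "platform": "ANY_PLATFORM",
#                             "threat_entry": "URL"}]}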
Example No. 6
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.moderators = self.plugin_config['MODERATORS']

        # Initialize Safe Browsing API
        if self.plugin_config['GOOGLE_SAFE_BROWSING']:
            self.sbl = SafeBrowsingList(
                self.plugin_config['GOOGLE_SAFE_BROWSING_API_KEY'])
            self.sbl.update_hash_prefix_cache()

        # Populate Blacklist from URLS
        for url in self.plugin_config['BLACKLISTS']:
            url = url.strip()
            if url.endswith('.json'):
                r = requests.get(url)
                # Assuming MEW List format
                for item in r.json():
                    self.blacklist.append(item['id'])

            elif url.endswith('.csv'):
                print('csv not implemented')  # TODO
            else:
                print('txt not implemented')  # TODO

        print(self.__class__.__name__, 'initialized')
Example No. 7
def safebrowsingcheck(url):
    from gglsbl import SafeBrowsingList

    sbl = SafeBrowsingList('')

    if sbl.lookup_url(url) is None:
        return ':D Not in blacklist'
    else:
        return '@@ In the blacklist'
Example No. 8
def update_hash_prefix_cache():
    logger.info('opening database at ' + dbfile)
    sbl = SafeBrowsingList(gsb_api_key, dbfile, True)

    logger.info('updating database at ' + dbfile)
    sbl.update_hash_prefix_cache()

    logger.info('checkpointing database at ' + dbfile)
    with sbl.storage.get_cursor() as dbc:
        dbc.execute('PRAGMA wal_checkpoint(FULL)')
    sbl.storage.db.commit()

    logger.info("all done!")
Example No. 9
def google_sb_check(url):
    import ConfigParser
    config_file = "conf/parser.cfg"
    config = ConfigParser.RawConfigParser()
    config.read(config_file)
    apikey = config.get('url', 'gsb_api_key')

    from gglsbl import SafeBrowsingList
    sbl = SafeBrowsingList(apikey)

    if sbl.lookup_url(url) is None:
        return 'URL is not in the blacklist.'
    else:
        return '@@ URL is in the blacklist.'
Example No. 10
class SafeBrowsing(object):
    TYPE = "GoogleSBCheck"
    CP_FMT = '{scheme}://{netloc}/{path}'

    def __init__(self,
                 name=None,
                 api_key=None,
                 db_path='/tmp/gsb_4.db',
                 update_hash_prefix_cache=False):
        self.api_key = api_key
        self.db_path = db_path

        self.sbl = SafeBrowsingList(api_key, db_path=db_path)
        self.update_hash_prefix_cache = update_hash_prefix_cache
        try:
            os.stat(db_path)
        except:
            self.update_hash_prefix_cache = True

        if self.update_hash_prefix_cache:
            # this may take a while so be patient (over 1600MB of data)
            self.sbl.update_hash_prefix_cache()

    def is_blacklisted(self, url):
        return SafeBrowsing.thread_safe_lookup(url) is not None

    def lookup_url(self, url):
        up = urlparse(url)
        cp = self.CP_FMT.format(**{
            'scheme': up.scheme,
            'netloc': up.netloc,
            'path': up.path
        }).strip('/') + '/'
        return self.sbl.lookup_url(cp)

    def handle_domain(self, domain):
        return self.handle_domains([
            domain,
        ])

    def handle_domains(self, domains):
        results = {}
        for domain in domains:
            t = "https://" + domain
            u = "http://" + domain
            results[domain] = False
            if self.lookup_url(t) or self.lookup_url(u):
                results[domain] = True
                continue
        return results
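A hedged usage sketch for the wrapper above. Note that is_blacklisted() calls SafeBrowsing.thread_safe_lookup(), which this snippet never defines (a version appears in Example No. 14), so the sketch sticks to handle_domains(); the API key is a placeholder:

checker = SafeBrowsing(name='gsb', api_key='YOUR_GSB_API_KEY', db_path='/tmp/gsb_4.db')
print(checker.handle_domains(['example.com', 'testsafebrowsing.appspot.com']))
# e.g. {'example.com': False, 'testsafebrowsing.appspot.com': True}  (illustrative output)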
Example No. 11
def _lookup(url, api_key, retry=1):
    # perform lookup
    global sbl, last_api_key
    try:
        if api_key != last_api_key:
            app.logger.info('re-opening database')
            sbl = SafeBrowsingList(api_key, dbfile, True)
            last_api_key = api_key
        return sbl.lookup_url(url)
    except:
        app.logger.exception("exception handling [" + url + "]")
        if retry >= max_retries:
            sbl = None
            last_api_key = None
            abort(500)
        else:
            return _lookup(url, api_key, retry + 1)
Example No. 12
    def __init__(self,
                 api_key,
                 db_path=LINUX_DFT_PATH,
                 update_hash_prefix_cache=False):
        global API_KEY, DB_PATH
        API_KEY = api_key
        DB_PATH = db_path

        self.sbl = SafeBrowsingList(api_key, db_path=db_path)
        self.update_hash_prefix_cache = update_hash_prefix_cache
        try:
            os.stat(db_path)
        except:
            self.update_hash_prefix_cache = True

        if self.update_hash_prefix_cache:
            # this may take a while so be patient (over 1600MB of data)
            self.sbl.update_hash_prefix_cache()
Example No. 13
    def __init__(self,
                 name=None,
                 api_key=None,
                 db_path='/tmp/gsb_4.db',
                 update_hash_prefix_cache=False):
        self.api_key = api_key
        self.db_path = db_path

        self.sbl = SafeBrowsingList(api_key, db_path=db_path)
        self.update_hash_prefix_cache = update_hash_prefix_cache
        try:
            os.stat(db_path)
        except:
            self.update_hash_prefix_cache = True

        if self.update_hash_prefix_cache:
            # this may take a while so be patient (over 1600MB of data)
            self.sbl.update_hash_prefix_cache()
Example No. 14
class SafeBrowsing(object):
    def __init__(self,
                 api_key,
                 db_path=LINUX_DFT_PATH,
                 update_hash_prefix_cache=False):
        global API_KEY, DB_PATH
        API_KEY = api_key
        DB_PATH = db_path

        self.sbl = SafeBrowsingList(api_key, db_path=db_path)
        self.update_hash_prefix_cache = update_hash_prefix_cache
        try:
            os.stat(db_path)
        except:
            self.update_hash_prefix_cache = True

        if self.update_hash_prefix_cache:
            # this may take a while so be patient (over 1600MB of data)
            self.sbl.update_hash_prefix_cache()

    def is_blacklisted(self, url):
        return SafeBrowsing.thread_safe_lookup(url) is not None

    def lookup_url(self, url):
        # cp_fmt = '{scheme}://{netloc}/{path}'
        # up = URLPARSE(url)
        # cp = cp_fmt.format(**{'scheme':up.scheme, 'netloc':up.netloc, 'path':up.path}).strip('/')+'/'
        return self.sbl.lookup_url(url)

    @classmethod
    def init(cls, api_key):
        return SafeBrowsing(api_key)

    @staticmethod
    def set_global(api_key, db_path='/tmp/gsb_4.db'):
        global SB_CHECK, API_KEY, DB_PATH
        API_KEY, DB_PATH = api_key, db_path
        SB_CHECK = SafeBrowsing(api_key, db_path=db_path)

    @staticmethod
    def thread_safe_lookup(url):
        global SB_CHECK
        sbl = SafeBrowsing(API_KEY, db_path=DB_PATH)
        return sbl.lookup_url(url)
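A hedged usage sketch of the module-level globals pattern above: call set_global() once at startup, then thread_safe_lookup() from workers, each of which opens its own SafeBrowsingList handle (the API key is a placeholder):

SafeBrowsing.set_global('YOUR_GSB_API_KEY', db_path='/tmp/gsb_4.db')
print(SafeBrowsing.thread_safe_lookup('http://testsafebrowsing.appspot.com/s/phishing.html'))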
Example No. 15
def main():
    args_parser = setupArgsParser()
    args = args_parser.parse_args()
    setupLogger(args.log, args.debug)
    if args.check_url:
        sbl = SafeBrowsingList(args.api_key, db_path=args.db_path)
        bl = sbl.lookup_url(args.check_url)
        if bl is None:
            print('{} is not blacklisted'.format(args.check_url))
        else:
            print('{} is blacklisted in {}'.format(args.check_url, bl))
        sys.exit(0)
    if args.onetime:
        sbl = SafeBrowsingList(args.api_key, db_path=args.db_path, discard_fair_use_policy=True)
        run_sync(sbl)
    else:
        sbl = SafeBrowsingList(args.api_key, db_path=args.db_path)
        while True:
            run_sync(sbl)
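setupArgsParser() and setupLogger() are not shown. A minimal argparse sketch consistent with the attributes main() reads; the flag names are guesses inferred from args.api_key, args.db_path and friends:

import argparse

def setupArgsParser():
    p = argparse.ArgumentParser(description='gglsbl sync/lookup client')
    p.add_argument('--api-key', dest='api_key', required=True)
    p.add_argument('--db-path', dest='db_path', default='/tmp/gsb_v4.db')
    p.add_argument('--check-url', dest='check_url')
    p.add_argument('--onetime', action='store_true')
    p.add_argument('--log')
    p.add_argument('--debug', action='store_true')
    return p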
Example No. 16
def gsb_init():
    """

    Returns:

    """
    # noinspection PyBroadException
    try:
        return SafeBrowsingList(GSB_API_KEY, db_path=os.getcwd() + GSB_DB_NAME)
    except:
        return None
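A hedged usage sketch for the factory above (GSB_API_KEY and GSB_DB_NAME are module-level constants assumed by the original snippet):

sbl = gsb_init()
if sbl is not None:
    print(sbl.lookup_url('http://testsafebrowsing.appspot.com/s/phishing.html'))
else:
    print('Safe Browsing client unavailable')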
Example No. 17
class Security:
    def __init__(self):
        self.sbl = SafeBrowsingList(GoogleConfig.SAFEBROWSINGAPIKEY)
        self.sbl.update_hash_prefix_cache()

    def validate_referer(self, url):
        threat_list = self.sbl.lookup_url(url)
        if threat_list is None:
            return None
        return threat_list

    def get_referer(self):
        referer = request.referrer
        if not referer:
            return None
        return referer

    @staticmethod
    def is_safe_url(url):
        ref_url = urlparse(request.host_url)
        test_url = urlparse(urljoin(request.host_url, url))
        return test_url.scheme in ('http', 'https') and \
               ref_url.netloc == test_url.netloc
Example No. 18
def find_blacklisted_domains(urls):
    total_count=0
    count=0
    start_time = time.time()

    sbl = SafeBrowsingList('AIzaSyBHPCVVk-tbM0iC93uvulfEFTyBfmKecVA')
    #sbl = SafeBrowsingList('AIzaSyCj6PXcG8IuHW3cpVB5dZHVWHb2QnALWSU')
    for url in urls:
        threat_list = sbl.lookup_url(url)
        if threat_list:
            count+=1
            with open(r'blacklist.txt', 'a') as f:
                f.write(url + '\n')  # one URL per line
        
        total_count+=1

    elapsed_time = time.time() - start_time
    print("Number of URLS:"+str(total_count))
    print("Phishing URLS:"+str(count))
    print(time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
    fields=[str(total_count),str(count)]
    with open(r'results.csv', 'a') as f:
       writer = csv.writer(f)
       writer.writerow(fields)
Example No. 19
def main():
    args_parser = setupArgsParser()
    args = args_parser.parse_args()
    setupLogger(args.log, args.debug)
    storage_backend = None
    storage_config = None
    if args.mysql_db:
        storage_config = {
            'user': args.mysql_user,
            'password': args.mysql_password,
            'host': args.mysql_host,
            'database': args.mysql_db
        }
        storage_backend = SafeBrowsingList.STORAGE_BACKEND_MYSQL

    if args.check_url:
        sbl = SafeBrowsingList(args.api_key,
                               db_path=args.db_path,
                               timeout=args.timeout,
                               storage_backend=storage_backend,
                               storage_config=storage_config)
        bl = sbl.lookup_url(args.check_url)
        if bl is None:
            print('{} is not blacklisted'.format(args.check_url))
        else:
            print('{} is blacklisted in {}'.format(args.check_url, bl))
            sys.exit(args.blacklisted_return_code)
        sys.exit(0)
    if args.onetime:
        sbl = SafeBrowsingList(args.api_key,
                               db_path=args.db_path,
                               discard_fair_use_policy=True,
                               timeout=args.timeout,
                               storage_backend=storage_backend,
                               storage_config=storage_config)
        run_sync(sbl)
    else:
        sbl = SafeBrowsingList(args.api_key,
                               db_path=args.db_path,
                               timeout=args.timeout,
                               storage_backend=storage_backend,
                               storage_config=storage_config)
        run_sync(sbl)
Example No. 20
def main():
    args_parser = setupArgsParser()
    args = args_parser.parse_args()
    setupLogger(args.log, args.debug)
    if args.check_url:
        sbl = SafeBrowsingList(args.api_key, db_path=args.db_path, timeout=args.timeout)
        bl = sbl.lookup_url(args.check_url)
        if bl is None:
            print('{} is not blacklisted'.format(args.check_url))
        else:
            print('{} is blacklisted in {}'.format(args.check_url, bl))
        sys.exit(0)
    if args.onetime:
        sbl = SafeBrowsingList(args.api_key, db_path=args.db_path, discard_fair_use_policy=True, timeout=args.timeout)
        run_sync(sbl)
    else:
        sbl = SafeBrowsingList(args.api_key, db_path=args.db_path, timeout=args.timeout)
        while True:
            run_sync(sbl)
Example No. 21
    def worker(self, id, queue):

        with open(r'config\gglsbl.auth', 'r') as auth_file:
            gglsbl_key = auth_file.read().strip()

        sbl = SafeBrowsingList(gglsbl_key,
                               db_path=r"dataset\google_safe_browisng_db")
        # sbl.update_hash_prefix_cache()

        turn = True
        while True:

            # Update Google SBL database every 12 hours at time X (e.g. 3 AM and 3 PM)
            hour = datetime.datetime.today().hour
            if hour % 12 == 3 and turn:
                sbl.update_hash_prefix_cache()
                turn = False
            elif hour % 12 != 3:
                turn = True

            today = get_date()
            with open(os.path.join('results', today + '.ioc.csv'),
                      'a+',
                      encoding='utf_8') as output_file:
                tweet = queue.get()
                try:
                    if hasattr(tweet, 'retweeted_status') and hasattr(
                            tweet.retweeted_status, 'extended_tweet'
                    ) and 'full_text' in tweet.retweeted_status.extended_tweet:
                        text = tweet.retweeted_status.extended_tweet[
                            'full_text']
                    elif hasattr(tweet, 'extended_tweet'
                                 ) and 'full_text' in tweet.extended_tweet:
                        text = tweet.extended_tweet['full_text']
                    elif not hasattr(tweet, 'text'):
                        text = tweet['text']
                    else:
                        text = tweet.text

                    if hasattr(tweet, 'retweeted_status'):
                        if hasattr(tweet.retweeted_status, 'extended_tweet'):
                            final_urls = tweet.retweeted_status.extended_tweet[
                                'entities']['urls']
                        else:
                            final_urls = tweet.retweeted_status.entities[
                                'urls']
                    else:
                        if hasattr(tweet, 'extended_tweet'):
                            final_urls = tweet.extended_tweet['entities'][
                                'urls']
                        else:
                            final_urls = tweet.entities['urls']

                    for final_url in final_urls:
                        # If a pastebin URL, get the raw content and append it to the tweet content
                        if final_url['expanded_url'].startswith(
                                'https://pastebin.com/'):
                            pastebin = final_url['expanded_url']
                            if 'raw' not in pastebin:
                                pastebin = pastebin.replace(
                                    'https://pastebin.com/',
                                    'https://pastebin.com/raw/')

                            req = requests.get(pastebin)
                            text += '\n' + req.text  # use .text (str), not .content (bytes)

                    user_type = 'top'
                    if tweet.user.id_str in self.rand_users:
                        user_type = 'rand'

                    print(
                        "###########################$$$$$$$$$$$$$$$$$$$$$$$$$$$"
                    )
                    print(text)

                    # classifier must be retrained with new data
                    # vector = vectorize(text, self.wordlist)
                    # vector.append(len(tweet.entities['hashtags']))
                    # vector.append(len(tweet.entities['user_mentions']))
                    # vector = numpy.array(vector).reshape(1, -1)
                    # estimates = []
                    # for i in range(number_of_classifiers):
                    #     y_estimate = self.classifiers[i].predict(vector)
                    #     estimates.append(y_estimate)
                    # vote = statistics.mode([x[0] for x in estimates])
                    # print("Prediction: "+vote)

                    ips = list(iocextract.extract_ips(text, refang=True))
                    for ip in ips:
                        if ip not in text:
                            output_file.write('{},{},{},{},{},ip,{}\n'.format(
                                tweet.id, tweet.created_at, user_type,
                                tweet.user.id_str, tweet.user.screen_name, ip))

                    urls = list(iocextract.extract_urls(text, refang=True))
                    for url in urls:
                        if url not in text:
                            result = sbl.lookup_url(url.rstrip('.'))
                            if result is not None:
                                output_file.write(
                                    '{},{},{},{},{},url,{},{}\n'.format(
                                        tweet.id, tweet.created_at, user_type,
                                        tweet.user.id_str,
                                        tweet.user.screen_name,
                                        url.rstrip('.'), result))
                            else:
                                output_file.write(
                                    '{},{},{},{},{},url,{},benign\n'.format(
                                        tweet.id, tweet.created_at, user_type,
                                        tweet.user.id_str,
                                        tweet.user.screen_name,
                                        url.rstrip('.')))

                    emails = list(iocextract.extract_emails(text, refang=True))
                    for email in emails:
                        if email not in text:
                            output_file.write(
                                '{},{},{},{},{},email,{}\n'.format(
                                    tweet.id, tweet.created_at, user_type,
                                    tweet.user.id_str, tweet.user.screen_name,
                                    email))
                    hashes = list(iocextract.extract_hashes(text))
                    for hash in hashes:
                        output_file.write('{},{},{},{},{},hash,{}\n'.format(
                            tweet.id, tweet.created_at, user_type,
                            tweet.user.id_str, tweet.user.screen_name, hash))
                except Exception as exp:
                    print(exp)

                queue.task_done()
Example No. 22
def main():
    # set up argument parsing
    p = argparse.ArgumentParser()
    p.add_argument("domain_name")
    p.add_argument('-g', '--http', action="store_true", help="Get http response by each candidate domains")
    p.add_argument('--safe_site', default="", help="Get google safe sites tool information. must be followed by api key ")
    p.add_argument('--virustotal', default="", help="Get VirusTotal tool information. must be followed by api key. VERY SLOW ")
    p.add_argument('--ip', action="store_true", help="Get IP address for each candidate domains")
    p.add_argument('--debug', action="store_true", help="For debug. It restlicts the length of domain list.")
    # used like: `$ dscan google.com --genlist qr typo`
    p.add_argument('--genlist', nargs='+', help="Specify using generators as list.")
    p.add_argument('--in_use', action="store_true", help="It shows only domains in use.")
     
    args = p.parse_args()

    # collect candidate URLs
    generator_dict = {}
    template_generator_names =  ["qr", "suffix", "bit", "typo", "h**o", "combo"]
    generator_names = []
    # if specific generators were requested
    if not args.genlist is None:
        for generator_name in args.genlist:
            if generator_name in template_generator_names:
                generator_names.append(generator_name)
            else:
                print("error: \""+ generator_name +"\" is not generator name.", file=sys.stderr)
    else:
        generator_names = template_generator_names


    for generator_name in generator_names:
        print_progress("generating "+ generator_name  +" ...")
        list_slice = ""
        if args.debug:
           # in debug mode, length of domain list is restricted
           list_slice = "[:1]"        
        generator_dict[generator_name]     = eval(generator_name +".near_urls(args.domain_name)" + list_slice)
        print_progress("generated: " + str(len(generator_dict[generator_name])))

    print_progress("fetching domain info ...")

    # keep per-domain information in a dict
    domains_dict = {}
    for generate_type_name, domain_list in generator_dict.items():
        for domain_name in domain_list:
            if domain_name not in domains_dict:
                domains_dict[domain_name] = {}
                # redundant, but needed later when converting to JSON
                domains_dict[domain_name]["domain_name"] = domain_name
            
            if "generate_type" not in domains_dict[domain_name] :
                domains_dict[domain_name]["generate_type"] = []
            
            domains_dict[domain_name]["generate_type"].append(generate_type_name)

    # look up and record information about each domain
    for domain_name, domain_info_dict in tqdm( domains_dict.items() ):            
            # attach HTTP response info
            if args.http:
                http_status_code = 0
                try:
                    # fetch a 2xx status code
                    http_status_code = urllib.request.urlopen("http://" + domain_name,
                                                              timeout=0.5).status
                except urllib.error.HTTPError as e:
                    # fetch a non-2xx status code
                    http_status_code = e.code
                # e.g. connection refused; could be handled more cleanly later
                except urllib.error.URLError as e:
                    http_status_code = -1
                except socket.timeout:
                    http_status_code = -1
                except ConnectionResetError:
                    http_status_code = -1
                domain_info_dict["http_status_code"] = http_status_code

            # fetch Google Safe Browsing info
            if len(args.safe_site)>0:
                api_key_gsb = args.safe_site
                sbl = SafeBrowsingList(api_key_gsb)
                threat_list = sbl.lookup_url(domain_name)
                if threat_list is None:
                    domain_info_dict["site_threat"] = []
                else: 
                    domain_info_dict["site_threat"] = threat_list

            # fetch VirusTotal info
            if len(args.virustotal)>0:
                api_key_vt = args.virustotal

                # TODO: factor this out into a function later
                interval_seconds_virustotal = 60/4
                retry_max_time = 2
                retry_sleep_seconds_virustotal = 1
                for _ in range(retry_max_time):
                    try:
                        info_virustotal = fetch_pdns_domain_info(domain_name, api_key_vt)
                    except:
                        # if the VirusTotal request fails (e.g. rate limited), wait a bit
                        time.sleep(retry_sleep_seconds_virustotal)
                    else:
                        try:
                            domain_info_dict["virus_total"] = info_virustotal["Webutation domain info"]
                        except KeyError:
                            domain_info_dict["virus_total"] = {}
                        # sleep 60/4 = 15 seconds to stay under the VirusTotal rate limit
                        # the limit is 4 queries per minute, so waiting a full minute per 4 queries would be more accurate, but this is simpler
                        time.sleep(interval_seconds_virustotal)
                        break

            if args.ip:
                try:
                    # resolve the IP address of the generated domain
                    ip = socket.gethostbyname(domain_name)
                except socket.gaierror:
                    ip = ''
                finally:
                    domain_info_dict["ip"] = ip

            # example extension:
            # attach GeoIP info
            # if args.geoip:
            #     domain_info_dict["geoip"] = country
    
    if args.in_use:
        domains_dict = domain_filter_only_in_use(domains_dict)    
   
    print_list = []
    for domain_info_dict in domains_dict.values():
        print_list.append(domain_info_dict)

    print(json.dumps(print_list, indent=4, separators=(',', ': ')) )
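print_progress() is used throughout but never defined; a plausible one-liner, assuming progress messages go to stderr so they do not mix with the JSON the script prints to stdout at the end:

import sys

def print_progress(message):
    # keep stdout clean for the final JSON dump
    print(message, file=sys.stderr)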
Example No. 23
def updateCache():
    sbl = SafeBrowsingList(GOOGLE_SAFEBROWSE_API_KEY, db_path="/opt/crawler/gsb_v3.db")
    sbl.update_hash_prefix_cache()
Example No. 24
def testLongURL(long_url):
    sbl = SafeBrowsingList(GOOGLE_SAFEBROWSE_API_KEY, db_path="/opt/crawler/gsb_v3.db")
    return sbl.lookup_url(long_url)
Example No. 25
# AIzaSyDWcNFNEsDWagvfgjZGHw0Y9LNvtgz3LhE Throwaway key (doesn't matter)

from gglsbl import SafeBrowsingList
 
sbl = SafeBrowsingList('AIzaSyDWcNFNEsDWagvfgjZGHw0Y9LNvtgz3LhE')

threat_list = sbl.lookup_url('http://github.com/')
if threat_list is None:
    print('None.')
else:
    print('Threats: ' + str(threat_list))
Example No. 26
        logfile.write(str(getTS()) + "\n" + text + "\n")


# begin main while loop
while True:
    try:
        con = mdb.connect(config['mysql']['host'],
                          config['mysql']['username'],
                          config['mysql']['password'],
                          config['mysql']['database_5'],
                          charset='utf8',
                          cursorclass=mdb.cursors.SSCursor)
        #cursor = con.cursor()
        con.autocommit(True)

        sbl = SafeBrowsingList(config['gsb-api']['key'])

        # start timer
        mainStart = time.time()
        num_urls_to_import = 3000000

        # count total number of twitter urls + print
        #print "Counting total number of Twitter URLs in db..."
        #num_twitter_urls = countTwitterURLs()
        #num_twitter_urls = 2000000
        #print "Twitter URLs: {:,}".format(num_twitter_urls)

        total_phishing = 0
        total_malware = 0

        # import twitter urls
Example No. 27
def scrapper(url):
    url = url.strip()
    url = "https://www.stuffnice.com/"
    url5 = url
    if "www" in url:
        url = url.replace("www.", "")
        print(url)
    else:
        pass

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36',
        "Upgrade-Insecure-Requests": "1",
        "DNT": "1",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate"
    }

    final_report = []
    final_score = 0

    from .result_dict import result_dict

    domain = tldextract.extract(url).domain
    suffix = tldextract.extract(url).suffix
    subdomain = tldextract.extract(url).subdomain
    pattern = '<a [^>]*href=[\'|"](.*?)[\'"].*?>'
    # row 15 HTTPS test

    result = {'name': 'https_test', 'message': '', 'marks': ''}

    if "https" in url or "http" in url:
        print("if worked")

        a = url.split(":")
        a[0] = "https:"
        web = "".join(a)

        print("This is web  ", web)

        try:
            print("try of if worked")
            r = requests.get(web, headers=headers)
            url = web
            result[
                'message'] = 'Félicitations. Votre site les données transitants par votre site sont sécurisées avec un certificat SSL'
            result['marks'] = 4
        except:

            a = url.split(":")
            a[0] = "http:"
            url3 = "".join(a)

            print("try of except worked")
            r = requests.get(url3, headers=headers, verify=False)
            url = url3
            # req = urllib.request.Request(url, headers=headers)
            # r = urllib.request.urlopen(req)
            result['message'] = '''
                    Votre site ne dispose pas de certificat SSL. Les données qui y transitent peuvent donc être récupérés par des parties malveillantes. Google donne une grande importance à la sécurité des visiteurs.
                    '''
            result['marks'] = 0
            print("HTTPS didn't worked")
    else:
        print("else worked")
        try:
            url2 = 'https://' + url
            r = requests.get(url2, headers=headers)
            url = url2
            # req = urllib.request.Request(url, headers=headers)
            # r = urllib.request.urlopen(req)
            result[
                'message'] = 'Félicitations. Votre site les données transitants par votre site sont sécurisées avec un certificat SSL'
            result['marks'] = 4

        except:
            url1 = 'http://' + url
            print("from else except ", url1)
            r = requests.get(url1, headers=headers, verify=False)
            url = url1
            # req = urllib.request.Request(url, headers=headers)
            # r = urllib.request.urlopen(req)
            result['message'] = '''
                    Votre site ne dispose pas de certificat SSL. Les données qui y transitent peuvent donc être récupérés par des parties malveillantes. Google donne une grande importance à la sécurité des visiteurs.
                    '''
            result['marks'] = 0
    print(result)
    result_dict['https_test'] = result
    final_score = final_score + result['marks']

    soup = BeautifulSoup(r.text, "lxml")

    # This is for row 1 (title)
    try:
        title_content = soup.find('title').text
        title_ln = len(title_content)

        if title_ln < 70:
            result = {
                'name': 'title',
                'message':
                'Félicitations votre site dispose d’un titre avec un nombre de caractères optimale soit moins de 70 caractères',
                'title_length': title_ln,
                'title_content': title_content,
                'marks': 5
            }
            final_score = final_score + 5
            result_dict['title'] = result
        elif title_ln > 70:
            result = {
                'name': 'title',
                'message':
                'Votre titre est trop long, le nombre de caractères optimal est de 70 caractères, essayez de le raccourcir',
                'title_length': title_ln,
                'title_content': title_content,
                'marks': 2
            }
            final_score = final_score + 2
            result_dict['title'] = result
    except:
        result = {
            'name': 'title',
            'message':
            'Votre site ne dispose pas de balise meta title. La balise <title> correspond au titre de votre page web. Il s’agit d’un champ essentiel à ne pas négliger dans le cadre d’une bonne stratégie d’optimisation du référencement naturel puisqu’elle est l’un des critères les plus importants pour les moteurs de recherche (Google, Bing...)',
            'title_length': 0,
            'marks': 0
        }
        final_score = final_score + 0
        result_dict['title'] = result

        # This is for row 2 (meta @description)
        name = 'meta_description'
        length_var_name = 'meta_desc_len'
        try:
            meta_tag = soup.find("meta", {"name": "description"})
            desc_content = meta_tag['content']
            desc_text_ln = len(desc_content)

            if desc_text_ln < 150:
                result = {
                    'name': name,
                    'message':
                    'Votre méta-description est trop courte, le nombre de caractère optimale doit être entre 150 et 250 caractères.',
                    length_var_name: desc_text_ln,
                    'desc_content': desc_content,
                    'marks': 1
                }
                final_score = final_score + result['marks']
                result_dict['meta_description'] = result
                print('try worked1')

            elif desc_text_ln > 150 and desc_text_ln < 250:
                result = {
                    'name': name,
                    'message':
                    'Félicitations votre site dispose d’une méta-description avec un nombre de caractère optimal entre 150 et 250 caractères',
                    length_var_name: desc_text_ln,
                    'desc_content': desc_content,
                    'marks': 5
                }
                final_score = final_score + result['marks']
                result_dict['meta_description'] = result
                print('try worked2')

            elif desc_text_ln > 250:
                result = {
                    'name': name,
                    'message':
                    ' Votre méta-description est trop longue, essayez de la raccourcir, le nombre optimal est entre 150 et 250 caractères, le reste risque d’être tronqué sur l’affichage du résultat sur les moteurs de recherche.',
                    length_var_name: desc_text_ln,
                    'desc_content': desc_content,
                    'marks': 2
                }
                final_score = final_score + result['marks']
                result_dict['meta_description'] = result
                print('try worked3')
        except:
            result1 = {
                'name': name,
                'message':
                'Votre site ne dispose pas de méta-description, La balise meta description manque sur votre page. Vous devez inclure cette balise afin de fournir une brève description de votre page pouvant être utilisée par les moteurs de recherche. Des méta-descriptions bien écrites et attrayantes peuvent également aider les taux de clics sur votre site dans les résultats de moteur de recherche.',
                length_var_name: 0,
                'marks': 0
            }
            final_score = final_score + result1['marks']
            result_dict['meta_description'] = result1
            print('except worked')
            # This is for row 3 (meta @keywords)
        name = 'meta_keywords'
        length_var_name = 'meta_key_len'
        try:
            meta_tag = soup.find("meta", {"name": "keywords"})
            meta_key_content_ln = len(meta_tag['content'])
            # title_ln = int(meta_key_content_ln)

            if meta_key_content_ln:
                result = {
                    'name': name,
                    'message':
                    'Bravo vous avez spécifié des meta keywords . Vos mots-clés principaux doivent apparaître dans vos méta-tags pour vous aider à identifier le sujet de votre page Web dans les moteurs de recherche.',
                    length_var_name: meta_key_content_ln,
                    'marks': 1
                }
                final_score = final_score + result['marks']
                result_dict['meta_keywords'] = result
                print('try worked1')
        except:
            result = {
                'name': name,
                'message':
                'Vos mots-clés principaux doivent apparaître dans vos méta-tags pour vous aider à identifier le sujet de votre page Web dans les moteurs de recherche.',
                length_var_name: 0,
                'marks': 0
            }
            final_score = final_score + result['marks']
            result_dict['meta_keywords'] = result
            print('except worked')
            # This is for row 4 (meta @robots)
        name = 'meta_robots'
        length_var_name = 'meta_robots_len'
        try:
            meta_tag = soup.find("meta", {"name": "robots"})
            meta_robots_content = len(meta_tag['content'])
            # title_ln = int(desc_text_ln)

            if meta_robots_content:
                result = {
                    'name': name,
                    'message': "Votre site dispose d'un fichier robots.txt",
                    length_var_name: meta_robots_content,
                    'marks': 4
                }
                final_score = final_score + result['marks']
                result_dict['meta_robots'] = result
                print('try worked1')
        except:
            result1 = {
                'name': name,
                'message': '''
                               Votre site n’a pas de robot.txt
                               Le robots.txt est un fichier texte utilisant un format précis qui permet à un Webmaster de contrôler quelles zones de son site un robot d'indexation est autorisé à analyser. Ce fichier texte sera disponible à une URL bien précise pour un site donné, par exemple http://www.monsite.com/robots.txt
                               Pour bien comprendre à quoi sert un robots.txt, il faut comprendre la manière dont fonctionnent les robots d'indexation des moteurs de recherche (appelés aussi Web spiders, Web crawlers ou Bots) tels que Google, Yahoo ou Bing. Voici leurs actions lorsqu'ils analysent un site tel que www.monsite.com : ils commencent par télécharger et analyser le fichier http://www.monsite.com/robots.txt.
                   ''',
                length_var_name: 0,
                'marks': 0
            }
            final_score = final_score + result1['marks']
            result_dict['meta_robots'] = result1
            print('except worked')
            # This is for row 5 (html lang)
        name = 'html_lang'
        length_var_name = 'html_lang'
        try:
            meta_tag = soup.find("html", {"lang": True})
            lang_text = meta_tag['lang']

            result = {
                'name': name,
                'message':
                "Félicitations. Vous avez spécifié une langue à votre page.",
                length_var_name: lang_text,
                'marks': 3
            }
            final_score = final_score + result['marks']
            result_dict['html_lang'] = result
            print('try worked1')
        except:
            result1 = {
                'name': name,
                'message': '''
                   Vous devriez spécifier une langue pour votre site, les moteurs de recherches ne comprennent pas quand un site dispose de plusieurs langues par exemple ayant des mots techniques en anglais et un contenu texte en français. Il faut donc bien spécifier la langue.
                   ''',
                length_var_name: 0,
                'marks': 0
            }
            final_score = final_score + result1['marks']
            result_dict['html_lang'] = result1
            print('except worked')

        # This is for row 6 (sitemap)
        url = url.strip()
        sitemap_url = url + '/sitemap.xml'
        print("Sitemap url ", sitemap_url)
        try:

            code = requests.get(sitemap_url, headers=headers).status_code

            name = 'sitemap'

            if code == 200:
                result = {
                    'name': name,
                    'message':
                    "Félicitations, votre site dispose d’un fichier sitemap",
                    'marks': 2
                }
                final_score = final_score + result['marks']
                result_dict['sitemap'] = result

            else:
                result = {
                    'name': name,
                    'message':
                    "Votre site Web ne dispose pas d'un fichier sitemap. Les sitemaps peuvent aider les robots à indexer votre contenu de manière plus complète et plus rapide. ",
                    'marks': 0
                }
                final_score = final_score + result['marks']
                result_dict['sitemap'] = result
        except:
            result = {
                'name': name,
                'message':
                "Votre site Web ne dispose pas d'un fichier sitemap. Les sitemaps peuvent aider les robots à indexer votre contenu de manière plus complète et plus rapide. ",
                'marks': 0
            }
            final_score = final_score + result['marks']
            result_dict['sitemap'] = result
            # This is for row 7 (google Analytics)
            searched_word = 'google-analytics'

            name = 'google_analytics'
            if searched_word in str(soup):
                print("Google analytics found")
                result = {
                    'name': name,
                    'message':
                    "Félicitations, votre site dispose de l'outil Google Analytics",
                    'marks': 2
                }
                final_score = final_score + result['marks']
                result_dict['google_analytics'] = result

            else:
                result = {
                    'name': name,
                    'message':
                    "Votre site ne dispose pas de l'outil Google Analytics.",
                    'marks': 0
                }
                final_score = final_score + result['marks']
                result_dict['google_analytics'] = result

            # This is for row 8 (page_cache)
            name = 'page_cache'
            length_var_name = 'page_cache_desc'
            try:
                meta_tag = soup.find("meta", {"http-equiv": "Cache-control"})
                lang_text = meta_tag['content']

                result = {
                    'name': name,
                    'message':
                    "Vous avez activé le cache sur votre page, c'est très bien.",
                    length_var_name: lang_text,
                    'marks': 3
                }
                final_score = final_score + result['marks']
                result_dict['page_cache'] = result
                print('try worked1')
            except:
                result1 = {
                    'name': name,
                    'message':
                    "Vous n'avez pas activé la mise en cache sur vos pages. La mise en cache permet un chargement plus rapide des pages.",
                    length_var_name: 0,
                    'marks': 0
                }
                final_score = final_score + result1['marks']
                result_dict['page_cache'] = result1
                print('except worked')
                # API_KEY = AIzaSyD_RLUOcTN1JAq8PL8zJ79X6-kmHIDy_uM
                # This is for row 9 (Google safe browsing api)

            api_key = 'AIzaSyCVylpWnsOwzUoeTGg7akZRod-4YbhXoPU'
            sbl = SafeBrowsingList(api_key)
            bl = sbl.lookup_url(url)

            name = 'google_safe_browsing'
            print("google_safe_browsing ", url)
            if bl is None:
                print("Website is safe")
                result = {
                    'name': name,
                    'message': "Votre site est considéré comme sécurisé.",
                    'marks': 2
                }
                final_score = final_score + result['marks']
                result_dict['google_safe_browsing'] = result

            else:
                result = {
                    'name': name,
                    'message':
                    "Votre site n'est pas considéré comme sécurisé. Google et les autres moteurs de recherche prennent en compte le niveau de sécurité de votre site pour garantir la sécurité des visiteurs.",
                    'marks': 0,
                    'threats': bl
                }
                final_score = final_score + result['marks']
                result_dict['google_safe_browsing'] = result

            # This is for row 10 (responsive website test)
            #name = 'responsive_test'
            #length_var_name = 'responsive_test_desc'
            try:
                meta_tag = soup.find("meta", {"name": "viewport"})
                lang_text = meta_tag['content']

                result = {
                    'name': name,
                    'message': "Félicitations. Votre site est responsive.",
                    length_var_name: lang_text,
                    'marks': 4
                }
                final_score = final_score + result['marks']
                result_dict['responsive_test'] = result
                print('try worked1')
            except:
                result1 = {
                    'name': name,
                    'message': '''
                       Nous n'avons pas détécté que votre site internet était responsive, soit adapté au mobile. Google prend énormément en compte ce critère pour un bon référencement.
                       ''',
                    length_var_name: 0,
                    'marks': 0
                }
                final_score = final_score + result1['marks']
                result_dict['responsive_test'] = result1
                print('except worked')
Example No. 28
def main():
    # set up argument parsing
    p = argparse.ArgumentParser()
    p.add_argument("domain_name")
    p.add_argument('-g',
                   '--http',
                   action="store_true",
                   help="Get http response by each candidate domains")
    p.add_argument(
        '--safe_site',
        default="",
        help=
        "Get google safe sites tool information. must be followed by api key ")
    p.add_argument(
        '--virustotal',
        default="",
        help=
        "Get google safe sites tool information. must be followed by api key. VERY SLOW "
    )
    args = p.parse_args()

    # collect candidate URLs
    generator_dict = {}
    # TODO: the lists are truncated to length 1 for practice; remove the limit in production
    # TODO: the same pattern is repeated; consolidate it (hard to fold into a simple loop since different functions are called)
    print_progress("generating qr ...")
    generator_dict["qr"] = qr.near_urls(args.domain_name)[:1]
    print_progress("generated: " + str(len(generator_dict["qr"])))
    print_progress("generating suffix ...")
    generator_dict["suffix"] = suffix.generate_domain(args.domain_name)[:1]
    print_progress("generated: " + str(len(generator_dict["suffix"])))
    print_progress("generating bit ...")
    generator_dict["bit"] = bit.near_urls(args.domain_name)[:1]
    print_progress("generated: " + str(len(generator_dict["bit"])))
    print_progress("generating typo ...")
    generator_dict["typo"] = typo.near_urls(args.domain_name)[:1]
    print_progress("generated: " + str(len(generator_dict["typo"])))
    #domains_dict["h**o"]   = h**o.near_urls(domain)
    #domains_dict["combo"]  = combo.near_urls(domain)

    print_progress("fetching domain info ...")

    # keep per-domain information in a dict
    domains_dict = {}
    for generate_type_name, domain_list in generator_dict.items():
        for domain_name in domain_list:
            if domain_name not in domains_dict:
                domains_dict[domain_name] = {}
                # redundant, but needed later when converting to JSON
                domains_dict[domain_name]["domain_name"] = domain_name

            if "generate_type" not in domains_dict[domain_name]:
                domains_dict[domain_name]["generate_type"] = []

            domains_dict[domain_name]["generate_type"].append(
                generate_type_name)

    # look up and record information about each domain
    for domain_name, domain_info_dict in tqdm(domains_dict.items()):
        # attach HTTP response info
        if args.http:
            # TODO: improve how the HTTP status code is obtained
            # https://stackoverflow.com/questions/1726402/in-python-how-do-i-use-urllib-to-see-if-a-website-is-404-or-200
            http_status_code = 0
            try:
                urllib.request.urlopen("http://" + domain_name, timeout=0.5)
            except urllib.error.HTTPError as e:
                http_status_code = e.code
            # e.g. connection refused; could be handled more cleanly later
            except urllib.error.URLError as e:
                http_status_code = -1
            except socket.timeout:
                http_status_code = -1
            except ConnectionResetError:
                http_status_code = -1
            else:
                # is 200 really the only non-error case? 301 etc. may occur, but treat it as 200 for now
                http_status_code = 200
            domain_info_dict["http_status_code"] = http_status_code

        # fetch Google Safe Browsing info
        if len(args.safe_site) > 0:
            api_key_gsb = args.safe_site
            sbl = SafeBrowsingList(api_key_gsb)
            threat_list = sbl.lookup_url(domain_name)
            if threat_list is None:
                domain_info_dict["site_threat"] = []
            else:
                domain_info_dict["site_threat"] = threat_list

        # fetch VirusTotal info
        if len(args.virustotal) > 0:
            api_key_vt = args.virustotal

            # TODO: factor this out into a function later
            interval_seconds_virustotal = 60 / 4
            retry_max_time = 2
            retry_sleep_seconds_virustotal = 1
            for _ in range(retry_max_time):
                try:
                    info_virustotal = fetch_pdns_domain_info(
                        domain_name, api_key_vt)
                except:
                    # if the VirusTotal request fails (e.g. rate limited), wait a bit
                    time.sleep(retry_sleep_seconds_virustotal)
                else:
                    domain_info_dict["virus_total"] = info_virustotal[
                        "Webutation domain info"]
                    # sleep 60/4 = 15 seconds to stay under the VirusTotal rate limit
                    # the limit is 4 queries per minute, so waiting a full minute per 4 queries would be more accurate, but this is simpler
                    time.sleep(interval_seconds_virustotal)
                    break

        # example extension:
        # attach GeoIP info
        # if args.geoip:
        #     domain_info_dict["geoip"] = country

    print_list = []
    for domain_info_dict in domains_dict.values():
        print_list.append(domain_info_dict)

    print(json.dumps(print_list, indent=4, separators=(',', ': ')))
Example No. 29
        length_var_name: 0,
        'marks': 0
    }
    final_score = final_score + result1['marks']
    final_report.append(result1)
    print('except worked')

# In[63]:

# API_KEY = AIzaSyD_RLUOcTN1JAq8PL8zJ79X6-kmHIDy_uM
# This is for row 9 (Google safe browsing api)

from gglsbl import SafeBrowsingList

api_key = 'AIzaSyCVylpWnsOwzUoeTGg7akZRod-4YbhXoPU'
sbl = SafeBrowsingList(api_key)
bl = sbl.lookup_url(url)

name = 'google_safe_browsing'
if bl is None:
    print("Website is safe")
    result = {
        'name': name,
        'message': "Votre site est considéré comme sécurisé.",
        'marks': 2
    }
    final_score = final_score + result['marks']
    final_report.append(result)

else:
    result = {
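The cell above awards 2 marks when Google Safe Browsing reports no threat match; the unsafe branch is truncated here. A hedged, self-contained sketch of the same check (the function name gsb_score is illustrative, and giving 0 marks on a match is an assumption since that branch is not shown):

from gglsbl import SafeBrowsingList

def gsb_score(url, api_key):
    # Returns (marks, threats): 2 marks when GSB reports nothing, else 0 with the threat list.
    sbl = SafeBrowsingList(api_key)
    threats = sbl.lookup_url(url)
    return (2, []) if threats is None else (0, threats)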
Ejemplo n.º 30
0
except:
  print("You need to 'pip install -r requirements.txt'")
  sys.exit(1)

safebrowsing_token = 'AIzaSyBKlevd7lUJpEq0XGnvaojrmS9OJqWY6YA'
isc_url = 'https://isc.sans.edu/feeds/suspiciousdomains_Low.txt'
topmillion_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'

safebrowsing_db = os.environ['HOME'] + '/Downloads/safebrowsing.db'
suspect_file = os.environ['HOME'] + '/Downloads/suspiciousdomains_Low.txt'
topthousand_file = os.environ['HOME'] + '/Downloads/alexa_1000.csv'

safebrowsing_bootstrap = not os.path.exists(safebrowsing_db) or (os.path.getsize(safebrowsing_db) < (1024*1024))

# Be sure to occasionally run sbl.update_hash_prefix_cache()
sbl = SafeBrowsingList(safebrowsing_token, db_path=safebrowsing_db)

ISC_LIST=[]
ALEXA_LIST=[]

def loadLists(writer=sys.stdout):
  if isStale(suspect_file):
    print >> writer, "Updating ISC Suspicious Domains..."
    new_file = requests.get(isc_url)
    with open(suspect_file, 'w') as sf_buffer:
      sf_buffer.write(new_file.content)

  if safebrowsing_bootstrap:
      print("Initial download of SafeBrowsing DB... this will take a few minutes.")
      updateSafebrowsing()
  elif isStale(safebrowsing_db, maxTime=259200):
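The isStale() and updateSafebrowsing() helpers are defined outside this excerpt; a hypothetical sketch consistent with how they are called above (maxTime compared in seconds against the file's modification time, plus the cache refresh the comment above recommends):

import os
import time

def isStale(path, maxTime=86400):
    # Hypothetical: treat a missing file, or one older than maxTime seconds, as stale.
    return not os.path.exists(path) or (time.time() - os.path.getmtime(path)) > maxTime

def updateSafebrowsing():
    # Hypothetical: refresh the local hash-prefix cache, as the comment above advises.
    sbl.update_hash_prefix_cache()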
Ejemplo n.º 31
0
def lookup_db():
    from gglsbl import SafeBrowsingList
    from gglsbl.protocol import URL
    from gglsbl.utils import to_hex

    res = {}
    rdict = {
        'status':'',
        'message':'',
    }

    key = ''
    db = '../gsb_v4.db'
    platforms = ['WINDOWS']
    if os.path.isfile(config):
        cp = ConfigParser()
        cp.read(config)
        if 'api' in cp:
            if 'key' in cp['api']:
                key = cp['api']['key']
        if 'database' in cp:
            if 'localdb' in cp['database']:
                db = cp['database']['localdb']
    if not key:
        logging.error('API key not found.')
        rdict['status'] = 500
        rdict['message'] = 'Internal Server Error'

    url = ''
    update = False
    if request.method == 'GET':
        url = request.args.get('url')
        update = request.args.get('update')
    if not url:
        rdict['status'] = 400
        rdict['message'] = "The parameter 'url' is missing"

    if not rdict['status']:
        sbl = SafeBrowsingList(key, db_path=db, platforms=platforms)
        logging.debug(sbl.storage.get_threat_lists())
        #if update:
        #    sbl.update_hash_prefix_cache()
        u = URL(url)
        #res['url'] = {
        res = {
            'query': u.url,
            'canonical': u.canonical,
            'permutations': [],
        }
        for i in u.url_permutations(u.canonical):
            p = {
                'pattern': i,
                'sha256': to_hex(u.digest(i))
            }
            #res['url']['permutations'].append(p)
            res['permutations'].append(p)
            
        url_hashes = u.hashes
        full_hashes = list(url_hashes)
        cues = [to_hex(fh[0:4]) for fh in full_hashes]
        #res['cues'] = cues
        res['results'] = []
        matched = sbl.storage.lookup_hash_prefix(cues)
        for m in matched:
            prefix = to_hex(m[1])
            for p in res['permutations']:
                if re.match(prefix, p['sha256']):
                    result = {
                        'pattern': p['pattern'],
                        #'prefix': to_hex(m[1]),
                        'prefix': prefix,
                        'matched': str(m[0]),
                    }
                    res['results'].append(result)
        #bl = sbl.lookup_url(url)
        #res['matched'] = bl
        logging.info(res)
        res = jsonify(res)

    if not res:
        if not rdict["status"]:
            rdict["status"] = 400
            rdict["message"] = "Invalid request."
        res = jsonify(rdict)
        res.status_code = rdict["status"]

    return res 
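lookup_db() above relies on Flask's request and jsonify plus a module-level config path; a hypothetical wiring of the handler as an HTTP endpoint (the route path, app name, and port are assumptions, not shown in this excerpt):

from flask import Flask

app = Flask(__name__)
# Expose the handler above as a GET endpoint, e.g.:
#   curl 'http://localhost:5000/lookup?url=http://example.com/'
app.add_url_rule('/lookup', view_func=lookup_db, methods=['GET'])

if __name__ == '__main__':
    app.run(port=5000)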
Ejemplo n.º 32
0
with open('config.json') as data_file:    
	config = json.load(data_file)

# begin main while loop
while True:

	mainStart = time.time()

	
	#update GSB dataset
	start = time.time()
	print "Updating local GSB dataset..."
	print datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	try:

		sbl = SafeBrowsingList(config['gsb-api']['key'])	

		con = mdb.connect(config['mysql']['host'], config['mysql']['username'], config['mysql']['password'], config['mysql']['database_5'], charset='utf8')
		con.autocommit(True)
		cur = con.cursor()

		sbl.update_hash_prefix_cache()
		#hash_prefixes = sbl.get_all_hash_prefixes() #from my modified version
		
		gglsbl_db = "/tmp/gsb_v4.db"
		sql_db = sqlite3.connect(gglsbl_db)
		cursor = sql_db.cursor()
		cursor.execute('''SELECT HEX(value) from hash_prefix''') #get all hash prefixes
		#cursor.execute('''SELECT value from full_hash''') #get all full hashes
		all_rows = cursor.fetchall()
		
Ejemplo n.º 33
0
class URLMonitor(Plugin):

    blacklist = []
    moderators = []
    sbl = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.moderators = self.plugin_config['MODERATORS']

        # Initialize Safe Browsing API
        if self.plugin_config['GOOGLE_SAFE_BROWSING']:
            self.sbl = SafeBrowsingList(
                self.plugin_config['GOOGLE_SAFE_BROWSING_API_KEY'])
            self.sbl.update_hash_prefix_cache()

        # Populate Blacklist from URLS
        for url in self.plugin_config['BLACKLISTS']:
            url = url.strip()
            if url.endswith('.json'):
                r = requests.get(url)
                # Assuming MEW List format
                for item in r.json():
                    self.blacklist.append(item['id'])

            elif url.endswith('.csv'):
                print('csv not implemented')  # TODO
            else:
                print('txt not implemented')  # TODO

        print(self.__class__.__name__, 'initialized')

    def process_message(self, data):
        # print(data)
        # Private (Groups) or Public Channels
        chan = data['channel']
        if chan.startswith('C') or chan.startswith('G'):
            text = data['text']

            # Find all URLS in message text, extract host and compare against blacklist and Google Safebrowsing
            urls = re.findall(
                'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                text)

            def alert(url):
                # TODO flag user
                # TODO early warning system
                self.slack_client.api_call(
                    'chat.postMessage',
                    channel=self.plugin_config['MODERATE_CHAN'],
                    text=' '.join(self.moderators) + ' ' +
                    text)  # TODO can probably use outputs for this
                if len(self.plugin_config['WARNING_MESSAGE']):
                    self.outputs.append(
                        [data['channel'], self.plugin_config['WARNING_MESSAGE']])

            for u in urls:
                o = urlparse(u)
                host = re.split(":\d{,4}", o.netloc)[0]

                # Check Blacklist
                if host in self.blacklist:
                    alert(u)
                    break
                # Check Google Safebrowsing
                elif self.sbl and self.sbl.lookup_url(u):
                    alert(u)
                    break
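A standalone check of the host extraction used in process_message() above (no Slack connection needed); the sample URLs are illustrative only:

import re
from urllib.parse import urlparse

for u in ['https://example.com:8080/login', 'http://sub.example.org/x?y=1']:
    o = urlparse(u)
    # Strip an optional :port suffix, exactly as in the plugin
    host = re.split(r":\d{,4}", o.netloc)[0]
    print(u, '->', host)  # example.com, sub.example.org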