Example #1
0
                    asn = getattr(l, 'asn')

                    # EU is not an official ISO 3166 code (but used by RIPE
                    # IP allocation)
                    if cc is not None and cc != "EU":
                        print hostl, asn, cc, \
                            pycountry.countries.get(alpha2=cc).name
                        if cc == cc_critical:
                            publisher.warning(
                                'Url;{};{};{};Detected {} {}'.format(
                                    PST.p_source, PST.p_date, PST.p_name,
                                    hostl, cc))
                    else:
                        print hostl, asn, cc

                A_values = lib_refine.checking_A_record(r_serv2,
                                                        domains_list)
                if A_values[0] >= 1:
                    PST.__setattr__(channel, A_values)
                    PST.save_attribute_redis(channel, (A_values[0],
                                             list(A_values[1])))

                    pprint.pprint(A_values)
                    publisher.info('Url;{};{};{};Checked {} URL'.format(
                        PST.p_source, PST.p_date, PST.p_name, A_values[0]))
            prec_filename = filename

        else:
            publisher.debug("Script url is Idling 10s")
            print 'Sleeping'
            time.sleep(10)
Example #2
0
def main():
    """Main Function

    Consume messages for the ``web_categ`` channel, extract URLs from each
    announced paste, publish every match and store the A-record check
    result on the paste.

    Each message is expected to split into exactly four whitespace-separated
    fields: ``channel filename word score`` (a different field count raises
    ``ValueError``). A paste is only processed when its filename differs
    from the filename of the previously handled message.

    The loop ends when the ``Urls`` member is present in the
    ``SHUTDOWN_FLAGS`` redis set; the flag is removed before exiting.
    While no message is available the loop sleeps for 10 seconds.
    """

    # CONFIG #
    cfg = ConfigParser.ConfigParser()
    cfg.read(configfile)

    # REDIS #
    r_serv = redis.StrictRedis(
        host=cfg.get("Redis_Queues", "host"),
        port=cfg.getint("Redis_Queues", "port"),
        db=cfg.getint("Redis_Queues", "db"))

    r_serv1 = redis.StrictRedis(
        host=cfg.get("Redis_Data_Merging", "host"),
        port=cfg.getint("Redis_Data_Merging", "port"),
        db=cfg.getint("Redis_Data_Merging", "db"))

    r_serv2 = redis.StrictRedis(
        host=cfg.get("Redis_Cache", "host"),
        port=cfg.getint("Redis_Cache", "port"),
        db=cfg.getint("Redis_Cache", "db"))

    # LOGGING #
    publisher.channel = "Script"

    # ZMQ #
    # Subscriber
    subscriber_name = "urls"
    subscriber_config_section = "PubSub_Categ"

    # Publisher
    publisher_config_section = "PubSub_Url"
    publisher_name = "adress"
    pubchannel = cfg.get("PubSub_Url", "channel")

    Sub = ZMQ_PubSub.ZMQSub(configfile, subscriber_config_section,
                            "web_categ", subscriber_name)
    Pub = ZMQ_PubSub.ZMQPub(configfile, publisher_config_section,
                            publisher_name)

    # FUNCTIONS #
    publisher.info("Script URL subscribed to channel web_categ")

    message = Sub.get_msg_from_queue(r_serv)
    prec_filename = None

    url_regex = "(http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*"

    while True:
        try:
            if message is not None:
                channel, filename, word, score = message.split()

                if prec_filename is None or filename != prec_filename:
                    domains_list = []
                    PST = P.Paste(filename)

                    for x in PST.get_regex(url_regex):
                        # Every match must carry exactly 13 fields.
                        # NOTE(review): the names below do not obviously line
                        # up with the capture groups of url_regex -- confirm
                        # against what get_regex returns.
                        scheme, credential, subdomain, domain, host, tld, port, resource_path, query_string, f1, f2, f3, f4 = x
                        domains_list.append(domain)
                        msg = pubchannel + " " + str(x)
                        Pub.send_message(msg)
                        publisher.debug('{0} Published'.format(x))

                        # NOTE(review): if get_regex returns re.findall-style
                        # group tuples, f1 is the TLD alternation of
                        # url_regex, which has no "onion" alternative, so
                        # this branch may never fire -- confirm.
                        if f1 == "onion":
                            print(domain)

                    A_values = lib_refine.checking_A_record(r_serv2,
                                                            domains_list)

                    if A_values[0] >= 1:
                        PST.__setattr__(channel, A_values)
                        PST.save_attribute_redis(
                            r_serv1, channel,
                            (A_values[0], list(A_values[1])))

                        pprint.pprint(A_values)
                        publisher.info(
                            'Url;{0};{1};{2};Checked {3} URL'.format(
                                PST.p_source, PST.p_date, PST.p_name,
                                A_values[0]))
                prec_filename = filename

            else:
                if r_serv.sismember("SHUTDOWN_FLAGS", "Urls"):
                    r_serv.srem("SHUTDOWN_FLAGS", "Urls")
                    print("Shutdown Flag Up: Terminating")
                    publisher.warning("Shutdown Flag Up: Terminating.")
                    break
                publisher.debug("Script url is Idling 10s")
                time.sleep(10)

        except dns.exception.Timeout:
            # Give up on this message instead of retrying it: fetching the
            # next message below (outside the try) guarantees progress.
            # Previously a timeout left `message` untouched, so the same
            # paste was re-parsed and all of its URLs re-published in a
            # tight loop for as long as the lookups kept timing out.
            print("dns.exception.Timeout")

        message = Sub.get_msg_from_queue(r_serv)
Example #3
0
                    # EU is not an official ISO 3166 code (but used by RIPE
                    # IP allocation)
                    if cc is not None and cc != "EU":
                        print hostl, asn, cc, \
                            pycountry.countries.get(alpha2=cc).name
                        if cc == cc_critical:
                            to_print = 'Url;{};{};{};Detected {} {}'.format(
                                    PST.p_source, PST.p_date, PST.p_name,
                                    hostl, cc)
                            #publisher.warning(to_print)
                            print to_print
                    else:
                        print hostl, asn, cc

                A_values = lib_refine.checking_A_record(r_serv2,
                                                        domains_list)
                if A_values[0] >= 1:
                    PST.__setattr__(channel, A_values)
                    PST.save_attribute_redis(channel, (A_values[0],
                                             list(A_values[1])))

                    pprint.pprint(A_values)
                    publisher.info('Url;{};{};{};Checked {} URL'.format(
                        PST.p_source, PST.p_date, PST.p_name, A_values[0]))
            prec_filename = filename

        else:
            publisher.debug("Script url is Idling 10s")
            print 'Sleeping'
            time.sleep(10)
Example #4
0
    def compute(self, message):
        """
        Search for Web links from given message

        ``message`` must split into exactly two whitespace-separated fields,
        ``filename`` and ``score`` (``score`` is not used here); any other
        shape raises ``ValueError``.

        When ``filename`` equals ``self.prec_filename`` nothing is done.
        Otherwise every match of ``self.url_regex`` in the item is:

        * sent to the ``'Url'`` output as ``"<url> <date> <filename>"``;
        * decoded with ``self.faup`` to obtain its domain and subdomain;
        * resolved with ``socket.gethostbyname`` and looked up through the
          ``ip2asn`` client; URLs whose resolution or lookup fails are
          skipped after a debug log entry;
        * logged with its ASN and country code, with an additional info
          entry when the country code equals ``self.cc_critical``.

        Finally the collected domains are passed to
        ``lib_refine.checking_A_record`` and ``self.prec_filename`` is
        updated to ``filename``.
        """
        # Extract item
        filename, score = message.split()

        # Consecutive messages about the same item are handled only once.
        if self.prec_filename is not None and filename == self.prec_filename:
            return

        domains_list = set()
        PST = Paste.Paste(filename)
        client = ip2asn()

        detected_urls = PST.get_regex(self.url_regex)
        if len(detected_urls) > 0:
            to_print = 'Web;{};{};{};'.format(
                PST.p_source, PST.p_date, PST.p_name)
            self.redis_logger.info('{}Detected {} URL;{}'.format(
                to_print, len(detected_urls), PST.p_rel_path))

        for url in detected_urls:
            self.redis_logger.debug("match regex: %s" % (url))

            to_send = "{} {} {}".format(url, PST._get_p_date(), filename)
            self.process.populate_set_out(to_send, 'Url')
            self.redis_logger.debug("url_parsed: %s" % (to_send))

            self.faup.decode(url)
            domain = self.faup.get_domain()
            subdomain = self.faup.get_subdomain()

            self.redis_logger.debug('{} Published'.format(url))

            # TODO: # FIXME: remove me
            # Best-effort bytes -> str conversion: a value without a
            # .decode() method (e.g. str) or with undecodable content is
            # left untouched.
            if subdomain is not None:
                try:
                    subdomain = subdomain.decode()
                except (AttributeError, UnicodeDecodeError):
                    pass

            if domain is not None:
                # TODO: # FIXME: remove me
                try:
                    domain = domain.decode()
                except (AttributeError, UnicodeDecodeError):
                    pass
                domains_list.add(domain)

            # NOTE(review): subdomain and domain are concatenated without a
            # separator here -- confirm that avoidNone() (not visible from
            # this block) yields a resolvable host name.
            hostl = self.avoidNone(subdomain) + self.avoidNone(domain)

            # Applies to every socket created afterwards in this process.
            socket.setdefaulttimeout(1)
            try:
                ip = socket.gethostbyname(hostl)
                # If the resolver is not giving any IPv4 address,
                # the ASN/CC lookup is skipped.
                lookup = client.lookup(ip, qType='IP')
            except ipaddress.AddressValueError:
                self.redis_logger.debug(
                    f'ASN/CC lookup failed for IP {ip}')
                continue
            except Exception:
                # Deliberately broad (resolver and lookup errors alike), but
                # no longer swallows KeyboardInterrupt / SystemExit.
                self.redis_logger.debug(
                    f'Resolver IPv4 address failed for host {hostl}')
                continue

            cc = lookup.cc
            raw_asn = lookup.asn
            asn = ''
            if raw_asn is not None:
                asn = raw_asn[2:]  # remove b'

            # EU is not an official ISO 3166 code (but used by RIPE
            # IP allocation)
            if cc is not None and cc != "EU":
                # Other non-ISO codes can show up as well; depending on the
                # pycountry version an unknown code yields None or a lookup
                # error, neither of which should abort the whole item.
                try:
                    country = pycountry.countries.get(alpha_2=cc)
                except LookupError:
                    country = None

                if country is not None:
                    self.redis_logger.debug('{};{};{};{}'.format(
                        hostl, asn, cc, country.name))
                else:
                    self.redis_logger.debug(
                        '{};{};{}'.format(hostl, asn, cc))

                if cc == self.cc_critical:
                    to_print = 'Url;{};{};{};Detected {} {}'.format(
                        PST.p_source, PST.p_date, PST.p_name,
                        hostl, cc)
                    self.redis_logger.info(to_print)
            else:
                self.redis_logger.debug('{};{};{}'.format(hostl, asn, cc))

        A_values = lib_refine.checking_A_record(self.r_serv2,
                                                domains_list)

        if A_values[0] >= 1:
            pprint.pprint(A_values)

        self.prec_filename = filename