Code example #1
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('sync_sqlite')
        self.name = name
        self.time = time

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def process(self):
        pass

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info('同步成功')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def run(self):
        self.start()
        self.process()
        self.finish()
Code example #2
class BaseReq:
    def __init__(self, is_crawl=True):
        self.ses = db.session
        self.is_crawl = is_crawl
        self.logger = ContextLogger('crawl')

    def _request(self, url, method='post', timeout=20, retry=5, **kwargs):
        # Keep headers/cookies inside kwargs (with empty defaults) instead of
        # passing them as separate keyword arguments; otherwise a caller that
        # supplies 'headers' or 'cookies' would also send them via **kwargs
        # and requests.request() would receive the keyword argument twice.
        kwargs.setdefault('headers', {})
        kwargs.setdefault('cookies', {})

        try:
            resp = requests.request(method,
                                    url,
                                    timeout=timeout,
                                    **kwargs)
        except Exception as e:
            self.logger.warning(e)
            if retry > 0:
                return self._request(url,
                                     method,
                                     timeout,
                                     retry=retry - 1,
                                     **kwargs)
            else:
                return None
        if resp.status_code != 200 and retry > 0:
            return self._request(url,
                                 method,
                                 timeout,
                                 retry=retry - 1,
                                 **kwargs)
        if self.is_crawl:
            return resp.text
        else:
            try:
                data = resp.json()
            except Exception as e:
                self.logger.warning(e)
                data = None
            return data

    def get(self, url, **kwargs):
        return self._request(url, method='get', **kwargs)

    def post(self, url, **kwargs):
        return self._request(url, method='post', **kwargs)
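A minimal usage sketch for the BaseReq client above, not part of the original project (it assumes the project's db and ContextLogger setup is importable, and the URLs are placeholders):

# Usage sketch only: get()/post() return the response body as text when
# is_crawl=True, or the parsed JSON (None after failed retries) when
# is_crawl=False.
client = BaseReq(is_crawl=True)
page_text = client.get('http://example.com/feed.txt',
                       headers={'User-Agent': 'Mozilla/5.0'})
if page_text is None:
    print('request failed after all retries')

api_client = BaseReq(is_crawl=False)
data = api_client.post('http://example.com/api', data={'page': 1})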
Code example #3
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.headers = {
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://www.baidu.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
        }
        self.cookies = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.title + '提交成功')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()
Code example #4
class CrawlBambenekconsulting(Task):
    def __init__(self):
        super().__init__('bambenekconsulting 数据爬取')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []

        url = 'http://osint.bambenekconsulting.com/feeds/c2-dommasterlist-high.txt'
        source = 'osint.bambenekconsulting.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        info = _info.split('\n')
        for _domain in info:
            if _domain.startswith('#'): continue
            if _domain == '': continue
            domain = _domain.split(',')[0]
            description = re.search('Domain used by (.*)', _domain.split(',')[1]).group(1)
            updatetime = _domain.split(',')[2].split(' ')[0]
            block = [domain, updatetime, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0

        if len(domains) > 0:
            try:
                for domain, updatetime, description, source in domains:
                    flag = db.session.query(Domain).filter(Domain.domain == domain, Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("bambenekconsulting 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("bambenekconsulting 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
Code example #5
class CrawlBadips(Task):
    def __init__(self):
        super().__init__('badips 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://www.badips.com/get/list/any/2?age=7d'
        source = 'www.badips.com'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = _ip.strip()
            block = [ip, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("badips 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("badips 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
Code example #6
class CrawlMaxmind(Task):
    def __init__(self):
        super().__init__('maxmind 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://www.maxmind.com/en/high-risk-ip-sample-list'
        source = 'www.maxmind.com'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        _ips = re.findall(r'[0-9]+(?:\.[0-9]+){3}', _info)
        description = 'Higher Risk IP'
        for ip in _ips:
            block = [ip, description, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("maxmind 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("maxmind  抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
Code example #7
File: base_req.py, Project: huang-zp/Analyse_Job
class BaseReq:
    def __init__(self, is_crawl=True):
        self.ses = db.session
        self.is_crawl = is_crawl
        self.logger = ContextLogger('crawl')
        self.headers = {
            'Host': 'www.lagou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'https://www.lagou.com/jobs/list_Python?px=new&city=%E5%85%A8%E5%9B%BD',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Content-Type': 'multipart/form-data'
        }

    def _request(self, url, method='post', timeout=20, retry=5, **kwargs):
        # Keep cookies (and headers, defaulting to self.headers) inside kwargs
        # so they reach requests.request() without colliding with **kwargs and
        # are preserved by the recursive retries below.
        kwargs.setdefault('cookies', {})
        kwargs.setdefault('headers', self.headers)
        try:
            resp = requests.request(method, url, timeout=timeout, **kwargs)
        except Exception as e:
            if retry > 0:
                return self._request(url, method, timeout, retry=retry-1, **kwargs)
            else:
                self.logger.warning('请求失败 ' + str(e))
                return None
        if resp.status_code != 200 and retry > 0:
            return self._request(url, method, timeout, retry=retry-1, **kwargs)
        if self.is_crawl:
            return resp.text
        else:
            try:
                data = resp.json()
                if (data['success'] is False or data is None) and retry > 0:
                    time.sleep(3)
                    return self._request(url, method, timeout, retry=retry - 1, **kwargs)
            except Exception as e:
                if retry > 0:
                    return self._request(url, method, timeout, retry=retry - 1, **kwargs)
                else:
                    self.logger.warning('解析失败 ' + str(e))
                    self.logger.warning(url)
                    data = None
            return data

    def get(self, url, **kwargs):
        return self._request(url, method='get', **kwargs)

    def post(self, url, **kwargs):
        return self._request(url, method='post', **kwargs)
Code example #8
class CrawlMalwaredomains(Task):
    def __init__(self):
        super().__init__('malwaredomains 数据爬取')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://mirror4.malwaredomains.com/files/justdomains'
        source = 'mirror4.malwaredomains.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for domain in info:
            # Skip blank lines so empty Domain records are not stored.
            if domain.strip() == '':
                continue
            block = [domain, source]
            domains.append(block)

        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("malwaredomains 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("malwaredomains 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
Code example #9
class CrawlGithubusercontent(Task):
    def __init__(self):
        super().__init__('githubusercontent 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        urls = ['https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/botscout_1d.ipset',
                'https://raw.githubusercontent.com/firehol/blocklist-ipsets/master/cruzit_web_attacks.ipset'
                ]
        source = 'raw.githubusercontent.com'
        for url in urls:
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None   " + source)
                return None
            info = _info.split('\n')
            for _ip in info:
                if _ip.startswith('#'): continue
                if _ip == '': continue
                ip = _ip.strip()
                block = [ip, source]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, source in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("githubusercontent 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("githubusercontent 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
Code example #10
class Crawlemergingthreats(Task):
    def __init__(self):
        super().__init__('emergingthreats 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'http://rules.emergingthreats.net/open/suricata/rules/botcc.rules'
        source = 'rules.emergingthreats.net'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        blocks = re.findall(r'(\[.*\))', _info)
        for block in blocks:
            _ips = re.findall(r'[0-9]+(?:\.[0-9]+){3}', block)
            description = re.search('classtype:(.*?);', block).group(1)
            for ip in _ips:
                block = [ip, description, source]
                ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, source in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("emergingthreats 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("emergingthreats 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
Code example #11
class CrawlCybersweat(Task):
    def __init__(self):
        super().__init__('cybersweat 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://cybersweat.shop/iprep/iprep_ramnode.txt' # ERROR
        source = 'cybersweat.shop'
        proxies = {'http': 'socks5://127.0.0.1:1080', 'https': 'socks5://127.0.0.1:1080'}
        _info = self.get(url=url, proxies=proxies)

        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = _ip.split(';')[0].strip()
            updatetime = _ip.split(';')[1].strip().split()[0].strip()
            block = [ip, updatetime, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("cybersweat 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("cybersweat 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
Code example #12
class CrawlRansomwaretracker(Task):
    def __init__(self):
        super().__init__('ransomwaretracker 数据爬取')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'https://ransomwaretracker.abuse.ch/downloads/RW_DOMBL.txt'
        source = 'ransomwaretracker.abuse.ch'
        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   "+source)
            return None
        info = _info.split('\n')
        description = 'Ransomware'
        for domain in info:
            if domain.startswith('#'): continue
            if domain == '': continue
            block = [domain, description, source]
            domains.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):
        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, description, source in domains:
                    flag = db.session.query(Domain).filter(Domain.domain == domain, Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) + source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("ransomwaretracker 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("ransomwaretracker 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
Code example #13
class CrawlRulez(Task):
    def __init__(self):
        super().__init__('rulez 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'http://danger.rulez.sk/projects/bruteforceblocker/blist.php'
        source = 'danger.rulez.sk'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = _ip.split('#')[0].strip()
            updatetime = _ip.split('#')[1].strip().split()[0].strip()
            block = [ip, updatetime, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("rulez 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("rulez  抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
Code example #14
class CrawlCnnvd(Task):
    def __init__(self):
        super().__init__('douban数据爬取')
        self.logger = ContextLogger('douban')

    def run_crawl(self):
        begin_base_url = 'https://book.douban.com/tag/'
        middle_base_url = '?start='
        end_base_url = '&type=S'

        for tag in tag_list:
            start = 0
            while True:
                page_url = begin_base_url + tag + middle_base_url + str(
                    start) + end_base_url
                print(page_url)
                html = self.get(page_url)

                start += 20
                if html:
                    result = self.handle_list_html(html, tag)
                    if result:
                        break

    def handle_list_html(self, html, tag):
        soup = BeautifulSoup(html, 'lxml')
        items = soup.find_all(class_='subject-item')
        if not items:
            return True
        href_list = []
        for item in items:
            href = item.div.a['href']
            href_list.append(href)
        for href in href_list:

            html = self.get(href)

            if html:
                result = self.handle_info_html(html, tag)
                if result:
                    continue
        return False

    def handle_info_html(self, html, type_tag):

        soup = BeautifulSoup(html, 'lxml')
        book = Book()

        # type_id = db.session.query(Type).filter_by(title=tag).first().id
        try:
            title = soup.h1.span.get_text()
            info = soup.find(class_='article').find(class_='indent').find(
                class_='subjectwrap clearfix').find(
                    class_='subject clearfix').find(id='info')
            string = info.get_text().strip()
            string = string.replace(' ', '')
            # Likely intended to strip the non-breaking spaces ('\xa0') that
            # appear in the scraped text.
            string = string.replace('\xa0', '')
            string = string.replace('\n', '')
            tag_list = [
                '出版社:', '出品方:', '副标题:', '原作名:', '译者:', '出版年:', '页数:', '定价:',
                '装帧:', '丛书:', 'ISBN:'
            ]
            value_list = []
            if '作者:' in string:
                string = string.replace('作者:', '')

            flag = 0
            for tag in tag_list:
                if tag in string:
                    value = string.split(tag)[0]
                    value_list.append(value)
                    if flag != 0:
                        for i in range(flag):
                            value_list.append('')
                        flag = 0
                else:
                    flag += 1
                    continue
                string = string.split(tag)[1]
                if tag == 'ISBN:':
                    value_list.append(string)

            author = value_list[0]
            publisher = value_list[1]
            producer = value_list[2]
            subtitle = value_list[3]
            original_title = value_list[4]
            translator = value_list[5]
            year_of_publisher = value_list[6]
            pages = value_list[7]
            price = value_list[8]
            binding = value_list[9]
            series = value_list[10]
            isbn = value_list[11]

            pic_href = soup.find(class_='article').find(class_='indent').find(
                class_='subjectwrap clearfix').find(
                    class_='subject clearfix').find(id='mainpic').a['href']

            score = soup.find(
                class_='rating_self clearfix').strong.get_text().strip()

            score_people = soup.find(class_='rating_people').get_text()

            related_info = soup.find(class_='related_info')

            infos = related_info.find_all(class_='indent')[:2]

            content_info = str(infos[0].find(class_='intro')).replace(
                '<div class="intro">', '')
            author_info = str(infos[1].find(class_='intro')).replace(
                '<div class="intro">', '')

            book.title = title
            book.author = author
            book.publisher = publisher
            book.producer = producer
            book.translator = translator
            book.subtitle = subtitle
            book.original_title = original_title
            book.year_of_publisher = year_of_publisher
            book.pages = pages
            book.price = price
            book.binding = binding
            book.series = series
            book.isbn = isbn
            book.score = score
            book.score_people = score_people
            book.type = type_tag
            book.content_info = content_info
            book.author_info = author_info
            book.pic_href = pic_href

            self.safe_commit(book)
        except Exception as e:
            self.logger.warning('爬取失败 ' + str(e))
            return True
        return False
Code example #15
class CrawlCnnvd(Task):
    def __init__(self):
        super().__init__('cnnvd数据爬取')
        self.logger = ContextLogger('task_cnnvd')

    def get_api_id(self, cnnvd_str):
        str_1, str_2, str_3 = cnnvd_str.split('-')
        if len(str_3) == 3:
            return str_2 + '0' + str_3
        return str_2 + str_3

    def api_affect_product(self, id, name_str):
        data = {'cvCveid': id, 'counts': 5}
        json_datas = self.post('http://www.cnnvd.org.cn/web/xxk/getEntity.tag',
                               data=data)
        if json_datas == '' or json_datas is None:
            return name_str
        else:
            json_datas = json.loads(json_datas)
            for json_data in json_datas:
                name_str = name_str + json_data['cpr_product_name'] + ';'
            return name_str

    def api_patchs(self, id, patch_dict):
        data = {'cvCveid': id, 'counts': 5}
        json_datas = self.post('http://www.cnnvd.org.cn/web/xxk/getEntity.tag',
                               data=data)
        if json_datas == '' or json_datas is None:
            return patch_dict
        else:
            json_datas = json.loads(json_datas)
            for json_data in json_datas:
                patch_name = json_data['cp_cname']
                patch_url = 'http://www.cnnvd.org.cn' + '/web/xxk/bdxqById.tag?id=' + str(
                    json_data['cp_id'])
                patch_dict[patch_name] = patch_url
            return patch_dict

    def run_crawl(self):
        begin_base_url = 'http://www.cnnvd.org.cn' + '/web/vulnerability/querylist.tag?pageno='
        end_base_url = '&repairLd='
        index = 1
        while index < 10597:
            next_page = begin_base_url + str(index) + end_base_url
            print(next_page)
            html = self.get_one_page(next_page)
            if html:
                result = self.handle_list_html(html)
                if result:
                    break
            index += 1

    def handle_list_html(self, html):
        soup = BeautifulSoup(html, 'lxml')
        for url in soup.find(class_='list_list').ul.find_all('li'):
            vul_url = 'http://www.cnnvd.org.cn' + url.div.a['href']

            html = self.get_one_page(vul_url)
            if html:
                result = self.handle_info_html(html, vul_url)
                if result:
                    return result
        return False

    def handle_info_html(self, html, url):
        soup = BeautifulSoup(html, 'lxml')
        try:
            simple_list = []
            title = soup.find(
                class_="detail_xq w770").find('h2').get_text().strip()
            cnnvd_id = soup.find(class_="detail_xq w770").find('ul').find_all(
                'li')[0].span.get_text().strip().split(':')[1]
            for li in soup.find(
                    class_="detail_xq w770").find('ul').find_all('li')[1:]:
                if li.a:
                    simple_list.append(li.a.get_text().strip())

            # If the update time is more than a month old, return True and stop crawling
            if simple_list[5] < end_time:
                return True

            vulner_source = soup.find(class_="detail_xq w770").find(
                'ul').find_all('li')[-1].span.get_text().strip().split(':')[1]
            list = []
            for div in soup.find(class_="fl w770").find_all(class_='d_ldjj'):
                str_value = ''
                for p in div.find_all('p'):
                    str_value += p.get_text().strip()
                list.append(str_value)
            if list[3] != '暂无':
                str_value = ''
                affect_products = soup.find(class_="fl w770").find_all(
                    class_='d_ldjj')[3].find_all(class_='a_title2')
                for i in affect_products:
                    str_value += i.get_text().strip() + ';'
                if len(affect_products) == 5:
                    str_value = self.api_affect_product(
                        self.get_api_id(cnnvd_id), str_value)
                list[3] = str_value
            if list[4] != '暂无':
                patch_dict = {}
                patchs = soup.find(class_="fl w770").find_all(
                    class_='d_ldjj')[4].find_all(class_='a_title2')
                for i in patchs:
                    name = i.get_text().strip()
                    patch_url = 'http://www.cnnvd.org.cn' + i['href']
                    patch_dict[name] = patch_url
                if len(patchs) == 5:
                    patch_dict = self.api_patchs(self.get_api_id(cnnvd_id),
                                                 patch_dict)
                list[4] = patch_dict
        except Exception as e:
            self.logger.warning(url + '--' + str(e))
            return False

        vul = db.session.query(Vulner).filter_by(cnnvd_id=cnnvd_id).first()
        if vul:
            print("查询成功")
        else:
            print("新数据")
            vul = Vulner()
        vul.title = title
        vul.vulner_source = vulner_source
        vul.cnnvd_id = cnnvd_id
        vul.url = url
        vul.level = simple_list[0]
        vul.cve_id = simple_list[1]
        vul.vulner_type = simple_list[2]
        vul.posted_time = simple_list[3]
        vul.threats_type = simple_list[4]
        vul.update_time = simple_list[5]
        vul.describe = list[0]
        vul.source = 'cnnvd'
        vul.solve_way = list[1]
        vul.refer_link = list[2]
        vul.affect_product = list[3]
        vul.patch = list[4]

        print(vul.title, '\n', vul.update_time, '\n', vul.cve_id, '\n',
              vul.url, '\n\n')

        self.safe_commit(vul)

        return False
Code example #16
class CrawlAlienvault(Task):
    def __init__(self):
        super().__init__('alienvault 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        url = 'https://reputation.alienvault.com/reputation.generic'
        source = 'reputation.alienvault.com'

        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
        for _ip in info:
            if _ip.startswith('#'): continue
            if _ip == '': continue
            ip = re.findall(r'[0-9]+(?:\.[0-9]+){3}', _ip)[0]
            des = re.findall(r'# (.*?)(([A-Z]{2})|(\s)?),', _ip)
            description = des[0][0]
            area = des[0][1] + ',' + re.findall(r',(.*?),', _ip)[0]
            block = [ip, description, area, source]
            ips.append(block)
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(ips) > 0:
            try:
                for ip, description, area, source in ips:
                    flag = db.session.query(IP).filter(
                        IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.description = description
                        new_ip.area = area
                        new_ip.source = source
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database" + str(e) +
                                    source)
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("alienvault 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("alienvault 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
Code example #17
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.cookies = {}
        self.headers = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.job_type + value.createTime + '提交成功')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def get_one_page(self, url):
        """
        Crawl a url and return the HTML code for this url.

        Args:
            url: The url you need to crawl.

        Returns:
            the HTML code for this url
        """
        # try:
        #     response = requests.get(url, headers=self.headers)
        # except Exception as e:
        #     self.logger.warning(url + ' ' + str(e))
        #     return False
        # return response.text
        response = self.get(url)
        return response

    def run_crawl(self):
        """
        Traverse all the pages you need to crawl.
        """
        pass

    def handle_list_html(self, html, tag):
        """
        The analysis page gets all the urls you need to crawl.

        Args:
            html: The page HTML that needs to be analyzed
        """
        pass

    def handle_info_html(self, html, url, tag):
        """
        The analysis page extracts all vulnerability information and commit to database.

        Args:
            html: The page HTML that needs to be analyzed
            url: The page url that needs to be analyzed
        """
        pass
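A hedged skeleton of a new crawler built on this Task template, following the hook methods documented above (the class name, URL and tag are illustrative, not from the project):

# Illustrative subclass, not from the original project.
class CrawlExample(Task):
    def __init__(self):
        super().__init__('example 数据爬取')
        self.logger = ContextLogger('task_example')

    def run_crawl(self):
        # Traverse the listing pages that need crawling.
        html = self.get_one_page('http://example.com/list')
        if html:
            self.handle_list_html(html, 'example-tag')

    def handle_list_html(self, html, tag):
        # Parse the listing page, collect detail URLs and call
        # handle_info_html(detail_html, detail_url, tag) for each one.
        pass

    def handle_info_html(self, html, url, tag):
        # Parse one detail page, build a model object and self.safe_commit() it.
        pass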
Code example #18
class CrawlHackernews(Task):
    def __init__(self):
        super().__init__('hackernews数据爬取')
        self.logger = ContextLogger('task_hackernews')

    def run_crawl(self):
        base_url = 'http://hackernews.cc/page/'
        index = 1
        while index < 275:
            next_page = base_url + str(index)
            html = self.get_one_page(next_page)
            if html:
                result = self.handle_list_html(html)
                if result:
                    break
            index += 1

    def handle_list_html(self, html):
        soup = BeautifulSoup(html, 'lxml')
        for article in soup.find(class_="classic-lists clearfix").find_all(
                id='article'):
            article_url = article.find(class_="classic-list-left").a['href']
            html = self.get_one_page(article_url)
            if html:
                result = self.handle_info_html(html, article_url)
                if result:
                    return result
        return False

    def handle_info_html(self, html, url):
        soup = BeautifulSoup(html, 'lxml')
        try:
            info_url = db.session.query(Infomation).filter_by(url=url).first()
            if info_url:
                return True
            title = soup.find(class_="post-details-right").find(
                class_="single-title").get_text().strip()
            posted_time = soup.find(class_="post-details-right").find(
                class_="light-post-meta").find_all('a')[1].get_text().strip()
            author = soup.find(class_="post-details-right").find(
                class_="light-post-meta").find_all('a')[0].get_text().strip()
            summary = soup.find(
                class_="post-body clearfix").find('p').get_text().strip()
            source = 'HackerNews'
            keys = ''
            for category in soup.find(class_="post-details-right").find(
                    class_="light-post-meta").find(
                        class_="post-category").find_all('a'):
                keys = keys + category.get_text().strip() + ';'
        except Exception as e:
            self.logger.warning(url + ' ' + str(e))
            return False
        info = Infomation()
        info.source = source
        info.summary = summary
        info.keys = keys
        info.author = author
        info.posted_time = posted_time
        info.title = title
        info.url = url
        print(info.title, '\n', info.author, '\n', info.posted_time, '\n',
              info.url, '\n', info.keys, '\n', info.source, '\n', info.summary,
              '\n')
        self.safe_commit(info)
        return False
Code example #19
class Task(BaseReq):
    def __init__(self, name, time=None):
        super().__init__()
        self.logger = ContextLogger('task')
        self.name = name
        self.time = time
        self.headers = {
            'Accept-Language':
            'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:52.0) Gecko/20100101'
        }
        self.cookies = {}

    def start(self):
        self.logger.info("Task Started: {}".format(self.name))

    def exit(self):
        sys.exit()

    def finish(self):
        self.logger.info("Task Finished: {}".format(self.name))

    def run(self):
        self.start()
        self.run_crawl()
        self.finish()

    def safe_commit(self, value):
        try:
            db.session.add(value)
            db.session.commit()
            self.logger.info(value.title + '提交成功')
        except Exception as e:
            self.logger.warning(e)
            db.session.rollback()

    def get_one_page(self, url):
        """
        Crawl a url and return the HTML code for this url.

        Args:
            url: The url you need to crawl.

        Returns:
            the HTML code for this url
        """
        # try:
        #     response = requests.get(url, headers=self.headers)
        # except Exception as e:
        #     self.logger.warning(url + ' ' + str(e))
        #     return False
        # return response.text
        response = self.get(url)
        return response

    def run_crawl(self):
        """
        Traverse all the pages you need to crawl.
        """
        pass

    def handle_list_html(self, html):
        """
        The analysis page gets all the urls you need to crawl.

        Args:
            html: The page HTML that needs to be analyzed
        """
        pass

    def handle_info_html(self, html, url):
        """
        The analysis page extracts all vulnerability information and commit to database.

        Args:
            html: The page HTML that needs to be analyzed
            url: The page url that needs to be analyzed
        """
        pass

    def is_ip(self, _str):
        p = re.compile(
            r'^((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)$'
        )
        if p.match(_str):
            return True
        else:
            return False
Code example #20
File: crawl_lg_all.py, Project: huang-zp/Analyse_Job
class CrawlFreebuf(Task):
    def __init__(self):
        super().__init__('LaGou数据爬取')
        self.logger = ContextLogger('task_lagou')
        self.is_crawl = False
        self.headers = {
            'Host': 'www.lagou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Referer':
            'https://www.lagou.com/jobs/list_Python?px=new&city=%E5%85%A8%E5%9B%BD',
            'Connection': 'keep-alive',
            'Cookie':
            'user_trace_token=20171103191801-9206e24f-9ca2-40ab-95a3-23947c0b972a; _ga=GA1.2.545192972.1509707889; LGUID=20171103191805-a9838dac-c088-11e7-9704-5254005c3644; JSESSIONID=ABAAABAACDBABJB2EE720304E451B2CEFA1723CE83F19CC; _gat=1; LGSID=20171228225143-9edb51dd-ebde-11e7-b670-525400f775ce; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DKkJPgBHAnny1nUKaLpx2oDfUXv9ItIF3kBAWM2-fDNu%26ck%3D3065.1.126.376.140.374.139.129%26shh%3Dwww.baidu.com%26sht%3Dmonline_3_dg%26wd%3D%26eqid%3Db0ec59d100013c7f000000055a4504f6; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; LGRID=20171228225224-b6cc7abd-ebde-11e7-9f67-5254005c3644; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_search; SEARCH_ID=3ec21cea985a4a5fa2ab279d868560c8',
            'X-Requested-With': 'XMLHttpRequest',
            'X-Anit-Forge-Token': 'None',
            'X-Anit-Forge-Code': '0',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        }

    def process(self, page, tag):

        base_url = 'https://www.lagou.com/jobs/positionAjax.json?'
        params = {
            'px': 'new',
            'needAddtionalResult': 'false',
            'isSchoolJob': '0'
        }
        data = {'first': 'false', 'pn': str(page), 'kd': tag}
        html = self.post(base_url, data=data, params=params)

        if html:
            try:
                count = int(html['content']['positionResult']['resultSize'])
                job_list = html['content']['positionResult']['result']
                if count == 0:
                    return True
                for job in job_list:
                    self.storage(job, tag)
            except Exception as e:
                self.logger.warning('取值失败 ' + str(e))
                self.logger.warning(base_url)

    def storage(self, job_dict, tag):

        for key in job_dict:
            job_dict[key] = self.switch_str(job_dict[key])
        job = Job()
        try:
            job.companyId = job_dict['companyId']
            job.positionName = job_dict['positionName']
            job.workYear = job_dict['workYear']
            job.education = job_dict['education']
            job.jobNature = job_dict['jobNature']
            job.companyLogo = job_dict['companyLogo']
            job.salary = job_dict['salary']
            job.city = job_dict['city']
            job.financeStage = job_dict['financeStage']
            job.industryField = job_dict['industryField']
            job.positionId = job_dict['positionId']
            job.approve = job_dict['approve']
            job.createTime = job_dict['createTime']
            job.positionAdvantage = job_dict['positionAdvantage']
            job.companySize = job_dict['companySize']
            job.companyLabelList = job_dict['companyLabelList']
            job.publisherId = job_dict['publisherId']
            job.score = job_dict['score']
            job.district = job_dict['district']
            job.companyShortName = job_dict['companyShortName']
            job.positionLables = job_dict['positionLables']
            job.industryLables = job_dict['industryLables']
            job.businessZones = job_dict['businessZones']
            job.longitude = job_dict['longitude']
            job.latitude = job_dict['latitude']
            job.adWord = job_dict['adWord']
            job.formatCreateTime = job_dict['formatCreateTime']
            job.hitags = job_dict['hitags']
            job.resumeProcessRate = job_dict['resumeProcessRate']
            job.resumeProcessDay = job_dict['resumeProcessDay']
            job.companyFullName = job_dict['companyFullName']
            job.imState = job_dict['imState']
            job.lastLogin = job_dict['lastLogin']
            job.explain = job_dict['explain']
            job.plus = job_dict['plus']
            job.pcShow = job_dict['pcShow']
            job.appShow = job_dict['appShow']
            job.deliver = job_dict['deliver']
            job.gradeDescription = job_dict['gradeDescription']
            job.promotionScoreExplain = job_dict['promotionScoreExplain']
            job.firstType = job_dict['firstType']
            job.secondType = job_dict['secondType']
            job.isSchoolJob = job_dict['isSchoolJob']
            job.subwayline = job_dict['subwayline']
            job.stationname = job_dict['stationname']
            job.linestaion = job_dict['linestaion']
            job.job_type = tag
        except Exception as e:
            self.logger.warning('存储失败 ' + str(e))
            self.logger.warning(job_dict['publisherId'])
        self.safe_commit(job)

    def switch_str(self, value_list):
        value_str = ''
        try:
            if not isinstance(value_list, list):
                return str(value_list)

            for value in value_list:
                value_str = value_str + value + ';'
        except Exception as e:
            self.logger.warning('转化失败 ' + str(e))
        return value_str

    def control(self):
        tag_list = []
        with open(FILE_PATH, 'r') as f:
            for line in f:
                tag_list.append(line.strip())
        for tag in tag_list:
            page = 1
            while True:
                result = self.process(page, tag)
                if result:
                    break
                print("当前页数{}".format(page))
                page += 1
            self.logger.warning(tag + '共发起了' + str(page) + '请求')
Code example #21
File: crawl_zl.py, Project: huang-zp/Analyse_Job
class CrawlZJ(Task):
    def __init__(self):
        super().__init__('智联招聘数据爬取')
        self.logger = ContextLogger('task_zhilian')

    def run_crawl(self):
        tag_list = []
        with open(FILE_PATH, 'r') as f:
            for line in f:
                tag_list.append(line.strip().lower())
        for tag in tag_list:
            base_url1 = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E5%85%A8%E5%9B%BD&kw="'
            base_url2 = '&sb=1&sm=0&isfilter=0&fl=489&isadv=0&sg=0e48987408d24799b14e9053dd9a00c7&p='
            index = 1
            while index < 91:
                next_page = base_url1 + tag + base_url2 + str(index)
                html = self.get_one_page(next_page)
                if html:
                    result = self.handle_list_html(html, tag)
                    if result:
                        break
                index += 1

    def handle_list_html(self, html, tag):
        soup = BeautifulSoup(html, 'lxml')
        try:
            list_content = soup.find(
                class_="newlist_list_content").find_all("table")
        except Exception as e:
            self.logger.warning(e)
            return False
        for i, table in enumerate(list_content):
            if i == 0:
                continue
            try:
                position_url = table.tr.td.div.a['href']
                position_name = table.tr.td.div.a.get_text().strip()
            except Exception as e:
                self.logger.warning(e)
                return False
            if tag not in position_name.lower():
                return True
            html = self.get_one_page(position_url)
            if html:
                result = self.handle_info_html(html, position_url, tag)
                if result:
                    return result
        return False

    def handle_info_html(self, html, url, tag):
        soup = BeautifulSoup(html, 'lxml')
        try:
            url = url
            positionName = soup.find(
                class_="inner-left fl").h1.get_text().strip()
            companyFullName = soup.find(
                class_="inner-left fl").h2.get_text().strip()

            companyLabelList = ''
            for i in soup.find(class_="inner-left fl").div.find_all('span'):
                companyLabelList = companyLabelList + i.get_text().strip(
                ) + ';'

            list_attr = []
            for i in soup.find(class_="terminalpage-left").find(
                    class_='terminal-ul clearfix').find_all('li'):
                list_attr.append(i.strong.get_text().strip())
        except Exception as e:
            self.logger.warning(url + ' ' + str(e))
            return False
        job = ZlJob()
        job.url = url
        job.positionName = positionName
        job.companyFullName = companyFullName
        job.companyLabelList = companyLabelList
        job.salary = list_attr[0]
        job.city = list_attr[1]
        job.createTime = list_attr[2]
        job.jobNature = list_attr[3]
        job.workYear = list_attr[4]
        job.education = list_attr[5]
        job.positionCount = list_attr[6]
        job.firstType = list_attr[7]
        job.job_type = tag

        self.safe_commit(job)
        return False
Code example #22
class CrawlDataplane(Task):
    def __init__(self):
        super().__init__('dataplane 数据爬取')
        self.logger = ContextLogger('threat_ip')

    def run_crawl(self):
        start = time.time()
        ips = []
        urls = ['https://dataplane.org/dnsrd.txt',
                'https://dataplane.org/dnsrdany.txt',
                'https://dataplane.org/dnsversion.txt',
                'https://dataplane.org/sipinvitation.txt',
                'https://dataplane.org/sipquery.txt',
                'https://dataplane.org/sipregistration.txt',
                'https://dataplane.org/sshclient.txt',
                'https://dataplane.org/sshpwauth.txt',
                'https://dataplane.org/vncrfb.txt'
                ]
        source = 'dataplane.org'
        for url in urls:
            _info = self.get(url=url)

            if _info is None:
                self.logger.warning("request returned None   " + source)
                return None
            info = _info.split('\n')
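            # Each non-comment line of the dataplane feeds appears to be pipe-delimited:
            #   ASN | AS name | IP address | last seen (date time) | category
            # (inferred from the field positions used below).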
            for line in info:
                if line.startswith('#') or line == '':
                    continue
                fields = [field.strip() for field in line.split('|')]
                asn = fields[0]
                asname = fields[1]
                ip = fields[2]
                updatetime = fields[3].split()[0]
                category = fields[4]
                ips.append([ip, updatetime, source, asname, asn, category])
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(ips, source, crawl_time)

    def save_info(self, ips, source, crawl_time):
        start = time.time()
        all_count = len(ips)
        avail_count = 0
        if len(ips) > 0:
            try:
                for ip, updatetime, source, asname, asn, category in ips:
                    flag = db.session.query(IP).filter(IP.ip == ip, IP.source == source).first()

                    if flag is None:
                        new_ip = IP()
                        new_ip.ip = ip
                        new_ip.updatetime = updatetime
                        new_ip.source = source
                        new_ip.asname = asname
                        new_ip.asn = asn
                        new_ip.category = category
                        db.session.add(new_ip)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: {} {}".format(e, source))
                db.session.rollback()
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("dataplane 共收集{0}条数据, 新数据{1}条".format(all_count, avail_count))
        self.logger.info("dataplane 抓取时间{0},数据遍历时间{1}".format(crawl_time, storage_time))
Code example #23
class CrawlMalwaredomainlist(Task):
    def __init__(self):
        super().__init__('dshield 数据爬取')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        urls = [
            'https://secure.dshield.org/feeds/suspiciousdomains_High.txt',
            'https://secure.dshield.org/feeds/suspiciousdomains_Medium.txt',
            'https://secure.dshield.org/feeds/suspiciousdomains_Low.txt'
        ]
        source = 'secure.dshield.org'
        for url in urls:

            obj = urlparse(url)
            description = obj.path.split('/')[-1].split('.')[0]
            _info = self.get(url=url)
            if _info is None:
                self.logger.warning("request returned None   " + source)
                return None
            info = _info.split('\n')
            for domain in info:
                domain = domain.strip()
                # Skip comment lines, the "Site" header line and blank lines.
                if domain.startswith('#') or domain == 'Site' or domain == '':
                    continue
                domains.append([domain, source, description])
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0
        _time = datetime.now().strftime("%Y-%m-%d")
        if len(domains) > 0:
            try:
                for domain, source, description in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.source = source
                        new_domain.description = description
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = _time
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: {} {}".format(e, source))
                db.session.rollback()
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("dshield 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("dshield 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
Code example #24
class CrawlCybercrime(Task):
    def __init__(self):
        super().__init__('cybercrime 数据爬取')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://cybercrime-tracker.net/ccam.php'
        source = 'cybercrime-tracker.net'
        _info = self.get(url=url)

        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        soup = BeautifulSoup(_info, 'lxml')
        table = soup.findChildren('tbody')[2]
        rows = table.findChildren('tr', attrs={'class': 'monitoring'})
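        # Column layout assumed from the parsing below: cells[1] = submission date,
        # cells[2] = C&C domain/host, cells[3] = sample hash (kept but not stored).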
        for row in rows:
            cells = row.findChildren('td')
            date_str = cells[1].string
            time_obj = time.strptime(date_str, "%d/%m/%Y %H:%M:%S")
            updatetime = time.strftime("%Y-%m-%d", time_obj)
            domain = cells[2].string
            hashstr = cells[3].string
            if self.is_ip(domain):
                continue
            domains.append([domain, updatetime, source])
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0

        if len(domains) > 0:
            try:
                for domain, updatetime, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: {} {}".format(e, source))
                db.session.rollback()
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("cybercrime 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("cybercrime 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
Code example #25
class CrawlFreebuf(Task):
    def __init__(self):
        super().__init__('freebuf数据爬取')
        self.logger = ContextLogger('task_freebuf')

    def run_crawl(self):
        base_url = 'http://www.freebuf.com/news/page/'
        for index in range(1, 290):
            next_page = base_url + str(index)
            html = self.get_one_page(next_page)
            if html:
                result = self.handle_list_html(html)
                if result:
                    break

    def handle_list_html(self, html):
        soup = BeautifulSoup(html, 'lxml')
        for tag_div in soup.find_all("div", class_="news-img"):
            article_url = tag_div.a['href']
            print(article_url)
            html = self.get_one_page(article_url)
            if html:
                result = self.handle_info_html(html, article_url)
                if result:
                    return result
        return False

    def handle_info_html(self, html, url):
        print(url)
        soup = BeautifulSoup(html, 'lxml')
        try:
            info_url = db.session.query(Infomation).filter_by(url=url).first()
            if info_url:
                print("已经存在")
                return True
            posted_time = soup.find(class_="property").find(
                class_="time").get_text().strip()
            title = soup.find(class_="articlecontent").find(
                class_="title").h2.get_text().strip()
            author = soup.find(class_="property").find(
                rel="author").get_text().strip()
            keys_list = soup.find(class_="property").find(
                class_="tags").find_all('a')
            key_str = ''
            for key in keys_list:
                key_str = key_str + key.get_text().strip() + ';'
            summary = ''
            for p_string in soup.find(id="contenttxt").find_all(
                    style=re.compile("color: rgb\(0, 176, 80\);*$")):
                if p_string.get_text() is None:
                    continue
                summary += p_string.get_text().strip()
            if summary == '':
                summary = soup.find(
                    id="contenttxt").find('p').get_text().strip()
        except Exception as e:
            self.logger.warning(url + ' ' + str(e))
            return False
        info = Infomation()
        info.title = title
        info.url = url
        info.posted_time = posted_time
        info.author = author
        info.summary = summary
        info.source = 'Freebuf'
        info.keys = key_str
        print(info.title, info.author, info.posted_time, info.url, key_str)
        self.safe_commit(info)
        return False
Code example #26
class CrawlNetlab(Task):
    def __init__(self):
        super().__init__('netlab 数据爬取')
        self.logger = ContextLogger('threat_domain')

    def run_crawl(self):
        start = time.time()
        domains = []
        url = 'http://data.netlab.360.com/feeds/dga/dga.txt'
        # url='http://67.209.191.170:81/1.txt'
        source = 'data.netlab.360.com'
        _info = self.get(url=url)
        if _info is None:
            self.logger.warning("request returned None   " + source)
            return None
        info = _info.split('\n')
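        # Each non-comment line of dga.txt appears to be whitespace-separated,
        # e.g. "<dga family> <domain> <timestamp> ..." (inferred from the indices used below).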
        for line in info:
            if line.startswith('#') or line == '':
                continue
            fields = line.split()
            description = fields[0]
            domain = fields[1]
            updatetime = fields[2]
            domains.append([domain, updatetime, description, source])
        stop = time.time()
        crawl_time = str(stop - start) + "秒"
        self.save_info(domains, source, crawl_time)

    def save_info(self, domains, source, crawl_time):

        start = time.time()
        all_count = len(domains)
        avail_count = 0
        if len(domains) > 0:
            try:
                for domain, updatetime, description, source in domains:
                    flag = db.session.query(Domain).filter(
                        Domain.domain == domain,
                        Domain.source == source).first()

                    if flag is None:
                        new_domain = Domain()
                        new_domain.domain = domain
                        new_domain.updatetime = updatetime
                        new_domain.description = description
                        new_domain.source = source
                        db.session.add(new_domain)
                        avail_count += 1
                    else:
                        flag.updatetime = updatetime
                        db.session.add(flag)
                db.session.commit()
            except Exception as e:
                self.logger.warning("Error writing to database: {} {}".format(e, source))
                db.session.rollback()
        else:
            self.logger.warning("NO record found from: %s" % source)
        stop = time.time()
        storage_time = str(stop - start) + "秒"

        self.logger.info("netlab 共收集{0}条数据, 新数据{1}条".format(
            all_count, avail_count))
        self.logger.info("netlab 抓取时间{0},数据遍历时间{1}".format(
            crawl_time, storage_time))
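
All of the save_info methods above repeat the same query-then-insert-or-update pattern against the SQLAlchemy session. A generic sketch of that pattern follows; the upsert helper is hypothetical and not part of the original code, and committing (plus rollback on failure) is still left to the caller, as in the methods above.

# Hypothetical helper illustrating the shared upsert pattern used by the
# save_info methods above. `model` is a Flask-SQLAlchemy model class,
# `filters` identifies an existing row, `values` are the columns to set.
def upsert(model, filters, values):
    row = db.session.query(model).filter_by(**filters).first()
    created = row is None
    if created:
        row = model()
        for attr, value in filters.items():
            setattr(row, attr, value)
    for attr, value in values.items():
        setattr(row, attr, value)
    db.session.add(row)
    return created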