Exemple #1
0
def start():
    """Program entry point: exercise each utility class once and log the results."""
    logger = LoggerUtils().loglog('../logs/test.log')

    # Exercise redis_utils: store a string and log what is read back.
    redis = RedisUtils(REDIS_HOST, REDIS_PORT, REDIS_PASSWORD)
    redis.set_str('hello', '11111')
    stored_value = redis.get_str('hello')
    logger.info(stored_value)
    # Disabled list example, kept for reference:
    # a = redis.get_list('runoobkey')
    # for i in a:
    #     print(i.decode())

    # Exercise mysql_utils: run a query and log its result.
    mysql = MysqlUtils(MYSQL_HOST, MYSQL_USERNAME, MYSQL_PASSWORD,
                       MYSQL_PORT, MYSQL_DATABASE)
    query_result = mysql.execute_query("select * from sys_user")
    logger.info(query_result)

    # Exercise the FTP helper; any failure is logged and does not stop
    # the remaining checks.
    try:
        ftp = FTPUtils(FTP_HOST, FTP_PORT, FTP_USERNAME, FTP_PASSWORD)
        os.chdir("d:/")
        ftp.upload_file('centos7_init.sh', FTP_DIR)
    except Exception as err:
        logger.info(err)

    # Exercise the CSV helper: export the query result from above.
    csv_exporter = CSV_Utils()
    # Repeated on purpose: the chdir inside the try block may not have run.
    os.chdir("d:/")
    csv_exporter.export_sql_result('result.csv', query_result)
Exemple #2
0
 def __init__(self, **kwargs):
     """
     Build a consumer from keyword configuration.

     The keys ``context``, ``cookie_file``, ``redis_db`` and ``tld`` are
     required and are popped from ``kwargs`` in that order; a missing key
     raises ``KeyError``. Any other keys are left unused.
     """
     self.context = kwargs.pop('context')
     self.__cookie_file = kwargs.pop('cookie_file')
     redis_db = kwargs.pop('redis_db')
     tld = kwargs.pop('tld')
     self.redis_handle = RedisUtils(db=redis_db, tld=tld)
Exemple #3
0
 def __init__(self, **kwargs):
     """
     Build a producer from keyword configuration.

     The keys ``context``, ``mongo_db``, ``redis_db`` and ``tld`` are
     required and are popped from ``kwargs`` in that order; a missing key
     raises ``KeyError``. Any other keys are left unused.
     """
     self.context = kwargs.pop('context')
     self.__mongo_db = kwargs.pop('mongo_db')
     # No mongo handle is created here; it starts out as None.
     self.mongo_handle = None
     redis_db = kwargs.pop('redis_db')
     tld = kwargs.pop('tld')
     self.redis_handle = RedisUtils(db=redis_db, tld=tld)
Exemple #4
0
class Consumer(object):
    """
    Worker that takes task URLs from redis and runs a spider on each one.

    Shared counters in ``context`` are only modified while holding
    ``context['lock']``.
    """

    def __init__(self, **kwargs):
        """
        Build a consumer from keyword configuration.

        The keys ``context``, ``cookie_file``, ``redis_db`` and ``tld`` are
        required; a missing key raises ``KeyError``.
        """
        self.context = kwargs.pop('context')
        self.__cookie_file = kwargs.pop('cookie_file')
        self.redis_handle = RedisUtils(db=kwargs.pop('redis_db'),
                                       tld=kwargs.pop('tld'))

    def consume(self):
        """
        Fetch tasks from redis and crawl them in an endless loop.

        Returns ``None`` right away (after logging an error) when the redis
        handle is not connected on entry. Ordinary exceptions inside an
        iteration are logged, a redis reconnect is attempted if needed, and
        the loop resumes after a 10 second pause. ``KeyboardInterrupt`` and
        ``SystemExit`` are not swallowed, so the worker can be stopped.
        """
        if not self.redis_handle.connected:
            logger.error('no redis connection found in consumer! exit.')
            return
        while True:
            # Tracks whether this iteration incremented live_spider_counts,
            # so the finally clause never decrements a count it did not add
            # (e.g. when fetch_one_task() itself fails).
            counted = False
            try:
                url = self.redis_handle.fetch_one_task()
                with self.context['lock']:
                    self.context['live_spider_counts'].value += 1
                    counted = True
                    self.context['task_counts'].value -= 1
                logger.info('get task url: %s', url)
                logger.info('%d tasks left',
                            self.context['task_counts'].value)
                if not self.redis_handle.is_blocked(URL(url)):
                    self.start_spider(url, self.__cookie_file)
            except Exception:
                logger.exception('consumer exception!')
                if not self.redis_handle.connected:
                    logger.error('redis disconnected! reconnecting...')
                    self.redis_handle.connect()
                # Back off before retrying.
                time.sleep(10)
            finally:
                if counted:
                    with self.context['lock']:
                        self.context['live_spider_counts'].value -= 1

    def start_spider(self, url, cookie_file=None):
        """
        Crawl ``url`` and queue every result in redis.

        :param url: task url handed to ``SpiderPage``
        :param cookie_file: forwarded to ``SpiderPage`` as ``cookie_file``
        """
        results = SpiderPage(url, cookie_file=cookie_file).spider()
        # Account for the new results before they are inserted.
        with self.context['lock']:
            self.context['result_counts'].value += len(results)
        for result in results:
            self.redis_handle.insert_result(result)
Exemple #5
0
Build the scanned-pattern cache in redis from mongodb

Copyright (c) 2016-2017 [email protected] (http://twi1ight.com/)
See the file 'doc/COPYING' for copying permission
"""
import sys

sys.path.append('../')
from core.utils.mongo_utils import MongoUtils
from core.utils.redis_utils import RedisUtils
from core.utils.url import URL

# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so sys is reloaded to get it back before UTF-8 is
# made the default codec. The order of these two lines matters.
reload(sys)
sys.setdefaultencoding('utf-8')
# Module-level handles used by build_saved_cache(); both are constructed
# without arguments.
m = MongoUtils()
r = RedisUtils()


def build_saved_cache():
    """
    Mark every url stored in mongodb as saved in redis.

    Runs the same query twice through the module-level ``m`` handle, first
    with its default arguments and then with ``is_target=False``, and calls
    ``r.set_url_saved(method, url)`` for each returned document.
    """
    for extra_args in ({}, {'is_target': False}):
        # Fresh filter/projection dicts for every query call.
        documents = m.query({}, {"_id": 0, "method": 1, "url": 1},
                            **extra_args)
        for record in documents:
            parsed_url = URL(record['url'])
            r.set_url_saved(record['method'], parsed_url)


# Rebuild the cache only when this file is executed as a script.
if __name__ == '__main__':
    build_saved_cache()
Exemple #6
0
                    help='Mongodb database name, default "tspider"')
    db.add_argument('--redis-db',
                    metavar='NUMBER',
                    dest='redis_db',
                    type=int,
                    default=RedisConf.db,
                    help='Redis db index, default 0')
    args = parser.parse_args()
    if not any([args.url, args.file, args.keepon]):
        parser.exit(parser.format_help())
    return args


if __name__ == '__main__':
    args = cmdparse()
    redis_handle = RedisUtils(db=args.redis_db)
    if args.keepon:
        redis_handle.restore_startup_params(args)
        logger.info(args)
    if os.path.exists(TMPDIR_PATH):
        for f in os.listdir(TMPDIR_PATH):
            os.remove(os.path.join(TMPDIR_PATH, f))
    tspider_context = {
        'live_spider_counts': Value('i', 0),
        'task_counts': Value('i', 0),
        'result_counts': Value('i', 0),
        'task_done': Event(),
        'lock': Lock()
    }
    kwargs = {
        'tld': args.tld,
Exemple #7
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Created on 2016/8/7 16:17
Add a blacklisted domain or subdomain at runtime

Copyright (c) 2016-2017 [email protected] (http://twi1ight.com/)
See the file 'doc/COPYING' for copying permission
"""
import sys

from core.utils.redis_utils import RedisUtils

if __name__ == '__main__':
    # Expected invocation: block_domain.py <redis db index> <domain>
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    usage = 'usage: block_domain.py db target.com'
    if len(sys.argv) != 3:
        print(usage)
        # A usage error must not report success to the calling shell.
        sys.exit(1)
    try:
        db = int(sys.argv[1])
    except ValueError:
        # Non-numeric db index: show the usage line instead of a traceback.
        print(usage)
        sys.exit(1)
    domain = sys.argv[2]
    r = RedisUtils(db=db)
    r.add_blocklist(domain)
    print('add success!')
Exemple #8
0
class Producer(object):
    """
    Producer side of the spider.

    Reads crawl results from redis, saves them to mongodb and creates new
    tasks for the consumers. Shared counters in ``context`` are only
    modified while holding ``context['lock']``.
    """

    def __init__(self, **kwargs):
        """
        Build a producer from keyword configuration.

        The keys ``context``, ``mongo_db``, ``redis_db`` and ``tld`` are
        required; a missing key raises ``KeyError``. The mongodb handle is
        left as ``None`` until :meth:`produce` runs.
        """
        self.context = kwargs.pop('context')
        self.__mongo_db = kwargs.pop('mongo_db')
        self.mongo_handle = None
        self.redis_handle = RedisUtils(db=kwargs.pop('redis_db'),
                                       tld=kwargs.pop('tld'))

    def produce(self):
        """
        Process results from redis in an endless loop.

        Returns ``None`` right away (after logging an error) when either the
        redis or the mongodb handle is not connected. Ordinary exceptions
        inside an iteration are logged, reconnects are attempted if needed,
        and the loop resumes after a 10 second pause. ``KeyboardInterrupt``
        and ``SystemExit`` are not swallowed, so the worker can be stopped.
        After every iteration ``context['task_done']`` is set once the
        result, live-spider and task counters are all zero.
        """
        # mongodb with multiprocessing must be initialised after fork
        self.mongo_handle = MongoUtils(db=self.__mongo_db)
        if not self.redis_handle.connected or not self.mongo_handle.connected:
            logger.error('no redis/mongodb connection found! exit.')
            return

        while True:
            try:
                # fetch_one_result() yields a pair; only the second item
                # (the serialized request) is used.
                _, req = self.redis_handle.fetch_one_result()
                with self.context['lock']:
                    self.context['result_counts'].value -= 1
                logger.debug('got req, %d results left',
                             self.context['result_counts'].value)
                self.proc_req(req)
            except Exception:
                logger.exception('produce exception!')
                if not self.redis_handle.connected:
                    logger.error('redis disconnected! reconnecting...')
                    self.redis_handle.connect()
                if not self.mongo_handle.connected:
                    logger.error('mongodb disconnected! reconnecting...')
                    self.mongo_handle.connect()
                # Back off before retrying.
                time.sleep(10)
            finally:
                with self.context['lock']:
                    # Nothing queued, nothing running and nothing pending:
                    # signal that the whole job is finished.
                    if (self.context['result_counts'].value == 0
                            and self.context['live_spider_counts'].value == 0
                            and self.context['task_counts'].value == 0):
                        self.context['task_done'].set()

    def proc_req(self, req):
        """
        Handle one serialized result.

        :param req: JSON text expected to decode to an object with at least
            ``url`` and ``method`` entries
        :return: None; malformed input is logged and dropped
        """
        try:
            data = json.loads(req)
        except Exception:
            logger.exception('json loads req error: %s', req)
            return
        if not isinstance(data, dict):
            # Valid JSON that is not an object (list, number, null...) has
            # no url/method to read; drop it instead of raising.
            logger.error('req is not a json object: %s', req)
            return
        urlstring = data.get('url', '')
        if not urlstring:
            logger.error('empty url found!')
            return
        url = URL(urlstring)

        method = data.get('method', '')
        if not method:
            logger.error('not method found!')
            return
        # save to mongodb
        data.update({
            'pattern': url.pattern,
            'hostname': url.hostname,
            'domain': url.domain
        })
        target = self.redis_handle.is_target(url)

        # Persist only when redis has not marked this method/url as saved.
        if not self.redis_handle.is_url_saved(method, url):
            logger.debug('redis saved pattern not found!')
            self.mongo_handle.save(data, is_target=target)
            self.redis_handle.set_url_saved(method, url)
        else:
            logger.debug('redis saved pattern found!')

        if not target:
            logger.debug('%s is not target', url.hostname)
            return

        # todo post req
        if method == 'POST':
            logger.debug('POST not support now')
        elif method == 'GET':
            # new host found, add index page to task queue
            if self.redis_handle.get_hostname_reqcount(url.hostname) == 0:
                self.create_task_from_url(URL(url.index_page),
                                          add_whitelist=False)
            # check url validation inside create_url_task
            self.create_task_from_url(url, add_whitelist=False)
        else:
            # not GET nor POST
            logger.error('HTTP Verb %s found!', method)
            logger.debug(data)

    def create_task_from_url(self, url, **kwargs):
        """
        Queue ``url`` as a task and count it when redis accepts it.

        :param url: URL object forwarded to the redis handle
        :param kwargs: forwarded unchanged to
            ``redis_handle.create_task_from_url``
        """
        with self.context['lock']:
            if self.redis_handle.create_task_from_url(url, **kwargs):
                self.context['task_counts'].value += 1

    def create_task_from_file(self, fileobj):
        """
        Create one task per non-blank line of a file.

        :param fileobj: open file-like object yielding one url per line; it
            is used as a context manager and therefore closed on return
        :return: None
        """
        with fileobj:
            for line in fileobj:
                line = line.strip()
                if not line:
                    continue
                self.create_task_from_url(URL(line))