Beispiel #1
0
    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Initialise the daemon thread, its beanstalk connections and tubes.

        :param beanstalk_conf: mapping providing ``host``, ``port``,
            ``input_tube`` and ``output_tube``.
        :param log: logger to use; a falsy value makes the thread create its
            own ``LogHandler("i_input_thread")``.
        :param process_pool: pool object, stored on the instance unchanged.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        # Two independent connections to the same host/port.
        host = beanstalk_conf['host']
        port = beanstalk_conf['port']
        self.beanstalk = PyBeanstalk(host, port)
        self.out_beanstalk = PyBeanstalk(host, port)

        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']

        self.log = log if log else LogHandler("i_input_thread")

        self.process_pool = process_pool
        self.wlock = threading.Lock()
Beispiel #2
0
class InputThread(threading.Thread):
    """Daemon thread holding two beanstalk connections and a task pool.

    Only construction and shutdown are defined in this part of the class.
    """

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Initialise connections, tube names, logger and pool reference.

        :param beanstalk_conf: mapping providing ``host``, ``port``,
            ``input_tube`` and ``output_tube``.
        :param log: logger to use; a falsy value makes the thread create its
            own ``LogHandler("i_input_thread")``.
        :param process_pool: pool object queried by :meth:`stop`.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        self.log = log
        if not self.log:
            self.log = LogHandler("i_input_thread")

        self.process_pool = process_pool
        self.wlock = threading.Lock()

    def stop(self):
        """Stop the thread once the pool has no outstanding tasks.

        Clears ``running``, polls ``process_pool.get_task_num()`` every five
        seconds until it reports no tasks, asks the registered processor (if
        any) to ``save_status()``, then closes the input connection. Any
        exception raised on the way is logged and not propagated.
        """
        self.log.warning("stop input_thread")
        self.running = False
        try:
            # Wait for the pool to drain.
            while self.process_pool.get_task_num() > 0:
                self.log.info("wait tasks be consumed over, wait 5s")
                time.sleep(5)

            # The processor is optional. Previously a missing 'processor'
            # entry made this method spin forever without sleeping, because
            # the loop only exited from inside the "has processor" branch.
            constructors = self.process_pool.thread_local_constructors
            if 'processor' in constructors:
                processor = constructors['processor'][1][1]
                self.log.warning("prepare call scheduler_processor to stop scheduler")
                processor.save_status()

            self.beanstalk.__del__()  # close the connection; accept no more data
        except Exception as e:
            # str(e) rather than e.message: the latter is deprecated and is
            # empty for exceptions built with several arguments.
            self.log.error("stop input_thread fail:%s" % e)
    def __init__(self, conf, processor, proc_name=None):
        """Initialise connections, logger, processor and the worker pool.

        :param conf: mapping with ``beanstalk_conf`` (``host``, ``port``,
            ``input_tube``, ``output_tube``), ``log`` and ``server``
            (optional ``process_thread_num``, default 1).
        :param processor: task processor; must not be None.
        :param proc_name: accepted but not used in this method.
        :raises Exception: if ``processor`` is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        tube_conf = conf['beanstalk_conf']
        self.beanstalk = PyBeanstalk(tube_conf['host'], tube_conf['port'])
        self.out_beanstalk = PyBeanstalk(tube_conf['host'], tube_conf['port'])
        self.input_tube = tube_conf['input_tube']
        self.output_tube = tube_conf['output_tube']

        # Fall back to a private logger when the configured one is falsy.
        self.log = conf['log'] or LogHandler("i_input_thread")

        self.processor = processor
        if processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")

        thread_num = conf['server'].get("process_thread_num", 1)
        self.processor_pool = ThreadPool(thread_num, {}, int(thread_num))
        self.wlock = threading.Lock()
Beispiel #4
0
    def init_log(self, conf, level=logging.DEBUG, console_out=False, name=None):
        """Create ``self.log`` as a ``LogHandler`` at the requested level.

        :param conf: configuration mapping; ``conf['server']['name']`` is
            used as the logger name when ``name`` is None.
        :param level: a ``logging`` integer level, or one of the names
            ``"debug"``, ``"info"``, ``"warning"``, ``"error"``
            (case-insensitive). Unknown names and other types fall back to
            ``logging.DEBUG``.
        :param console_out: forwarded to ``LogHandler`` unchanged.
        :param name: explicit logger name; overrides the configured one.
        """
        level_by_name = {
            "debug": logging.DEBUG,
            "info": logging.INFO,
            "warning": logging.WARNING,
            "error": logging.ERROR,
        }
        if isinstance(level, int):
            # Integer levels are honoured as given. Previously they fell
            # through the name comparison and were replaced by DEBUG.
            pass
        elif hasattr(level, "lower"):
            level = level_by_name.get(level.lower(), logging.DEBUG)
        else:
            level = logging.DEBUG

        if name is None:
            name = conf['server']['name']

        self.log = LogHandler(name, level, console_out=console_out)
Beispiel #5
0
    selector = CrawlSelector(conf['log'], conf['selector_conf'],
                             conf['beanstalk_conf'], None)
    selector.start()
    while True:
        time.sleep(20)
    selector.stop()


if __name__ == '__main__':
    # Script entry point: read the TOML config named by -f (default
    # ./scheduler.toml), attach a logger to it and pass it to main().
    file_path = './scheduler.toml'
    try:
        # 'h' must be declared in the short options; with only 'f:' getopt
        # rejected "-h" before the help branch below could run.
        opt, args = getopt.getopt(sys.argv[1:], 'hf:', ['help'])
    except getopt.GetoptError as err:
        # Report the bad option and exit non-zero instead of exiting
        # silently with status 0.
        sys.stderr.write("%s\n" % err)
        sys.exit(2)

    for name, value in opt:
        if name == "-f":
            file_path = value
        elif name in ("-h", "--help"):
            usage()
            sys.exit()
        else:
            assert False, "unhandled option"

    with open(file_path, 'rb') as config:
        conf = pytoml.load(config)
    # Logger name is the configured server name followed by its port.
    conf['log'] = LogHandler(conf['server']['name'] +
                             str(conf['server']['port']))

    main(conf)
Beispiel #6
0
            except Exception, e:
                self.log.error("report activate failed, because %s" % e.message)
            time.sleep(self.EXPIREDS)

    def stop(self):
        """Set ``self.running`` to False."""
        self.running = False


if __name__ == "__main__":
    # Manual run: build a throwaway configuration and call
    # HeartbeatThread.run() directly.
    import logging
    from i_util.logs import LogHandler

    # log = LogHandler(config.logname, logging.DEBUG)
    log = LogHandler('test', logging.DEBUG)

    conf = dict(
        backend=dict(host="101.201.102.37", port=6379, password="******"),
        server_name="test",
        server=dict(host="127.0.0.1", port=1001),
        log=log,
    )

    obj = HeartbeatThread("extractor", conf=conf)
    obj.run()
Beispiel #7
0
    'port': 6379,
    'database': 8,
    'proxy_name': 'proxies',
    'proxy_test_available': 24  #hours
}

# Connection and endpoint settings, grouped by the component they configure.
# The consumers of these mappings are outside this excerpt.
SERVER = dict(port=8571, host='0.0.0.0', debug=False)

MONGODB = dict(
    host='172.17.1.119',
    port=40042,
    database='task_collect',
    username='******',
    password='******',
)

final_data = dict(
    host='172.17.1.119',
    port=40042,
    name='app_data',
    username='******',
    password='******',
)

realtime_crawl = dict(
    api='http://172.18.180.225:9823/gsxt_info',
    query='company',
)

from i_util.logs import LogHandler
# Module-level logger named "manager", created with loglevel INFO.
log = LogHandler("manager", loglevel=logging.INFO)
Beispiel #8
0
def main():
    """Create the ``select_webpage`` logger.

    NOTE(review): the logger is bound to a local and never used in the
    visible body; the function appears to be truncated in this excerpt.
    """
    logger = LogHandler('select_webpage')
Beispiel #9
0
# Python 2 only: reloading ``sys`` makes ``setdefaultencoding`` available
# again, which is then used to set the process-wide default string encoding
# to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")
import os
import json
import traceback
import time
from thrift.protocol.TBinaryProtocol import TBinaryProtocol
from thrift.transport.TTransport import TMemoryBuffer

sys.path.append('..')
from i_util.pybeanstalk import PyBeanstalk
from bdp.i_crawler.i_downloader.ttypes import DownLoadReq
from i_util.logs import LogHandler
import config

# Module-level setup, executed at import time: a logger with console output
# enabled and a beanstalk client built from ``config.beanstalk_conf``.
log = LogHandler('re_crawler', console_out=True)
beanstalk_conf = config.beanstalk_conf
beanstalk_client = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])


def req_to_string(req):
    """Serialize ``req`` through a Thrift binary protocol into a string.

    :param req: object exposing ``write(protocol)``.
    :return: the contents of the in-memory transport after ``req.write``,
        or ``""`` if serialization raised.
    """
    str_req = ""
    try:
        memory_buffer = TMemoryBuffer()
        req.write(TBinaryProtocol(memory_buffer))
        str_req = memory_buffer.getvalue()
    except Exception:
        # Best effort: log the traceback and return the empty string.
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit still propagate.
        log.error('crawled_failt\terror:%s' % (traceback.format_exc()))
    return str_req
Beispiel #10
0
        log.warning("cann't write DownLoadRsp to string")
    return str_page_info


def put_beanstalked(beanstalk_conf, log, rsp):
    """Serialize ``rsp`` and put it on the configured beanstalk input tube.

    :param beanstalk_conf: mapping providing ``host``, ``port`` and
        ``input_tube``.
    :param log: logger exposing ``info`` and ``error``.
    :param rsp: response object; its ``url`` attribute is used in log lines.

    Connection setup and serialization errors propagate to the caller; a
    failure of the ``put`` itself is logged and swallowed.
    """
    beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
    tube = beanstalk_conf['input_tube']
    str_page_info = to_string(log, rsp)
    try:
        beanstalk.put(tube, str_page_info)
        log.info('beanstalk\turl:%s\ttube:%s' % (rsp.url, tube))
    except Exception as e:
        # %-formatting instead of '+' so a non-string url cannot raise a
        # TypeError inside the handler. The cause is now included and the
        # failure is reported at error level.
        log.error('beanstalk put error url:%s\ttube:%s\terror:%s'
                  % (rsp.url, tube, e))


if __name__ == "__main__":
    # Manual check: push one hand-built DownLoadRsp onto a test tube.
    url, redirect_url = 'dsfsdf', 'sdfdfsfsfs'
    status, http_code = 1, 200
    rsp = DownLoadRsp(url=url,
                      redirect_url=redirect_url,
                      status=status,
                      http_code=http_code)
    beanstalk_conf = {
        'host': '101.201.102.37',
        'port': 11300,
        'input_tube': 'download_rsp_test',
    }
    log = LogHandler('download_test')
    put_beanstalked(beanstalk_conf, log, rsp=rsp)
Beispiel #11
0
# -*- coding: utf-8 -*-
from conf import config

# The logging module is thread-safe.
import logging
# logging.basicConfig(level = logging.DEBUG)
from i_util.logs import LogHandler

# Module-level logger; its name is the configured server name followed by
# the server port, and it is created at INFO level.
log = LogHandler(config.server['name']+str(config.server['port']), logging.INFO)
import MySQLdb


def get_mysqldb():
    """Open and return a new MySQLdb connection built from ``config.MYSQL``.

    Reads ``host``, ``port``, ``username``, ``password`` and ``database``
    from the configuration and connects with the ``utf8`` charset.
    """
    settings = config.MYSQL
    connect_kwargs = {
        'host': settings['host'],
        'port': settings['port'],
        'user': settings['username'],
        'passwd': settings['password'],
        'db': settings['database'],
        'charset': 'utf8',
    }
    return MySQLdb.connect(**connect_kwargs)


def dbrecord_to_dict(descriptions, record):
    """Convert a MySQL tuple record into a dict.

    Each entry of ``descriptions`` is paired with the value at the same
    position in ``record``; the entry's first element becomes the key.
    Pairing stops at the shorter of the two sequences.
    """
    row = {}
    for column, field_value in zip(descriptions, record):
        row[column[0]] = field_value
    return row
class InputThread(threading.Thread):
    """Daemon thread that reserves jobs from a beanstalk tube and queues
    them on a thread pool for the configured processor.

    ``_output_msg`` is called by :meth:`_on_task_finished` but is not
    defined in this part of the class.
    """

    def __init__(self, conf, processor, proc_name= None):
        """Initialise connections, logger, processor and the worker pool.

        :param conf: mapping with ``beanstalk_conf`` (``host``, ``port``,
            ``input_tube``, ``output_tube``), ``log`` and ``server``
            (optional ``process_thread_num``, default 1).
        :param processor: object exposing ``do_task(task)``; must not be None.
        :param proc_name: accepted but not used in this method.
        :raises Exception: if ``processor`` is None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True
        self.beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'], conf['beanstalk_conf']['port'])
        self.out_beanstalk = PyBeanstalk(conf['beanstalk_conf']['host'], conf['beanstalk_conf']['port'])
        self.input_tube = conf['beanstalk_conf']['input_tube']
        self.output_tube = conf['beanstalk_conf']['output_tube']

        self.log = conf['log']
        if not self.log:
            self.log = LogHandler("i_input_thread")
        self.processor = processor
        if self.processor is None:
            self.log.error("Processor not given !")
            raise Exception("Processor not given !")

        self.processor_pool = ThreadPool(conf['server'].get("process_thread_num", 1),
                                         {},
                                         int(conf['server'].get("process_thread_num", 1))
                                         )
        self.wlock = threading.Lock()

    def stop(self):
        """Clear the running flag and join the worker pool."""
        self.log.warning("stop input_thread")
        self.running = False
        self.processor_pool.join_all()

    def run(self):
        """Reserve jobs from the input tube and queue them on the pool.

        Loops while ``running`` is set and an input tube is configured. On a
        ``SocketError`` it sleeps 30 seconds and tries to reconnect both
        connections; any other exception is logged and the loop continues.
        """
        self.running = True
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 3)
                if job is not None:
                    body = job.body
                    # The job is deleted before it is processed, so a task
                    # that later fails is not returned to the tube.
                    job.delete()
                    self.processor_pool.queue_task(self._on_task_start, (body,), self._on_task_finished)

            except SocketError:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception:
                    self.log.error('beanstalk\treconnect\tfail')
            except Exception:
                # ``Exception`` rather than a bare ``except`` so
                # KeyboardInterrupt and SystemExit are not swallowed.
                self.log.error("not msg from:%s\tresult:%s" % (self.input_tube, str(traceback.format_exc())))

    def _on_task_start(self, task, **thread_locals):
        """Run ``processor.do_task(task)``; return its result or None on error."""
        result = None
        try:
            result = self.processor.do_task(task)
        except Exception:
            # Log the full traceback: ``e.message`` is deprecated and is
            # empty for many exception types.
            self.log.error(traceback.format_exc())
        return result

    def _on_task_finished(self, task, **thread_locals):
        """Forward a task result to ``_output_msg`` under the write lock.

        A list is forwarded element by element; a non-empty string is
        forwarded as one message; anything else is ignored.
        """
        # ``with`` releases the lock even if ``_output_msg`` raises. The
        # previous acquire/release pair left the lock held forever in that
        # case and blocked every later result.
        with self.wlock:
            if isinstance(task, list):
                for message in task:
                    self._output_msg(message)
            elif task and isinstance(task, basestring):
                self._output_msg(task)
Beispiel #13
0
class InputThread(threading.Thread):
    """Daemon thread that reserves jobs from a beanstalk tube and queues
    them on an externally supplied process pool.

    ``__output_msg`` is called by ``__on_task_finished`` but is not defined
    in this part of the class.
    """

    def __init__(self, beanstalk_conf, log=None, process_pool=None):
        """Initialise connections, tube names, logger and pool reference.

        :param beanstalk_conf: mapping providing ``host``, ``port``,
            ``input_tube`` and ``output_tube``; must not be None.
        :param log: logger; must not be None.
        :param process_pool: pool exposing ``queue_task`` and
            ``get_task_num``; must not be None.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.running = True

        assert beanstalk_conf is not None
        assert log is not None
        assert process_pool is not None

        self.beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.out_beanstalk = PyBeanstalk(beanstalk_conf['host'], beanstalk_conf['port'])
        self.input_tube = beanstalk_conf['input_tube']
        self.output_tube = beanstalk_conf['output_tube']
        self.log = log
        if not self.log:
            self.log = LogHandler("i_input_thread")

        self.process_pool = process_pool
        self.t_lock = threading.Lock()

    def stop(self):
        """Stop the thread once the pool has no outstanding tasks.

        Clears ``running``, polls ``process_pool.get_task_num()`` every five
        seconds until it is zero or less, then closes the input connection.
        Exceptions are logged and not propagated.
        """
        self.log.warning("stop input_thread")
        self.running = False
        try:
            # Wait for the pool to drain. Saving the processor status at
            # this point was disabled in the original code and stays disabled.
            while self.process_pool.get_task_num() > 0:
                self.log.info("wait tasks be consumed over, wait 5s")
                time.sleep(5)

            self.beanstalk.__del__()  # close the connection; accept no more data
        except Exception as e:
            self.log.error("stop input_thread fail")
            self.log.exception(e)

    def run(self):
        """Reserve jobs from the input tube and queue them on the pool.

        Loops while ``running`` is set and an input tube is configured.
        After queueing, pauses two seconds when the pool reports 50 or more
        tasks. On a ``SocketError`` it sleeps 30 seconds and tries to
        reconnect both connections; other exceptions are logged and the loop
        continues.
        """
        while self.running and self.input_tube:
            try:
                job = self.beanstalk.reserve(self.input_tube, 30)
                if job is not None:
                    body = job.body
                    # The job is deleted before it is processed, so a task
                    # that later fails is not returned to the tube.
                    job.delete()

                    self.process_pool.queue_task(self.__on_task_start, (body,), self.__on_task_finished)
                    task_num = self.process_pool.get_task_num()
                    if task_num >= 50:
                        self.log.info("place_processor\ttasks:%d" % task_num)
                        time.sleep(2)
                else:
                    self.log.info("not msg from:%s" % self.input_tube)
            except SocketError as e:
                time.sleep(30)
                self.log.error('beanstalk\tconnect\tfail\tstart\treconnect')
                self.log.exception(e)
                try:
                    self.beanstalk.reconnect()
                    self.out_beanstalk.reconnect()
                    self.log.error('beanstalk\treconnect\tsuccess')
                except Exception as e:
                    self.log.error('beanstalk\treconnect\tfail')
                    self.log.exception(e)
            except Exception as e:
                self.log.error("not msg from:%s\tresult:" % self.input_tube)
                self.log.exception(e)

    @staticmethod
    def __on_task_start(task, **thread_locals):
        """Start the profiler (if given) and run the processor (if given).

        :return: ``processor.do_task(task)``, or None without a processor.
        """
        result = None
        if 'profiler' in thread_locals:
            thread_locals['profiler'].begin()
        if 'processor' in thread_locals:
            result = thread_locals['processor'].do_task(task)
        return result

    def __on_task_finished(self, result, **thread_locals):
        """Stop the profiler and forward the result under the lock.

        A list is forwarded element by element; a non-empty string is
        forwarded as one message; anything else is ignored.
        """
        # ``with`` releases the lock even if profiling or output raises. The
        # previous acquire/release pair left the lock held forever in that
        # case and blocked every later result.
        with self.t_lock:
            processor = thread_locals.get('processor')
            if 'profiler' in thread_locals:
                thread_locals['profiler'].end()
            if isinstance(result, list):
                for message in result:
                    self.__output_msg(message, processor)
            elif result and isinstance(result, basestring):
                self.__output_msg(result, processor)
Beispiel #14
0
# Server address and worker counts.
host, port = '127.0.0.1', 12400
server_thread_num = server_process_num = 1

process_thread_num = 1

# Alternative beanstalk endpoint, kept for reference (disabled):
# beanstalk_conf = {
#     'host': '101.201.102.37',  # production beanstalk intranet IP
#     'port': 11300,  # production beanstalk intranet port
#     'input_tube': 'extract_to_crawl',
#     'output_tube': 'scheduler_info',
# }
# Active beanstalk endpoint. The original inline comments called these the
# production intranet IP/port although the host here is the loopback address.
beanstalk_conf = dict(
    host='127.0.0.1',
    port=11300,
    input_tube='extract_to_crawl',
    output_tube='scheduler_info',
)

# NOTE(review): database credentials are hard-coded in this module.
mysql_host = '101.201.102.37'
mysql_user = '******'
mysql_passwd = 'haizhi@)'
mysql_db = 'cmb_crawl'

download_output_tube = 'czj_download_rsp'

from i_util.logs import LogHandler

# Module-level logger named "crawler_merge".
log = LogHandler("crawler_merge")
Beispiel #15
0
# -*- coding:utf-8    -*-
import threading
import traceback

import MySQLdb

from i_util.logs import LogHandler

# Module-level logger.
# NOTE(review): it is named 'redisdb' although this module imports MySQLdb
# and defines PyMySQL; confirm the name is intended.
log = LogHandler('redisdb')


class PyMySQL(object):
    """Holder for MySQL connection settings and SQL statement templates.

    The constructor stores the settings and calls ``_connect()``, which is
    not defined in this part of the class.
    """

    # Fetch-mode selectors.
    FETCH_ONE = 0
    FETCH_MANY = 1
    FETCH_ALL = 2

    # %-style statement templates; table names are wrapped in backticks.
    SELECT_SQL_FORMAT = "SELECT %s FROM `%s` WHERE %s"
    INSERT_SQL_FORMAT = "INSERT INTO `%s` (%s) VALUES (%s)"
    UPDATE_SQL_FORMAT = "UPDATE `%s` SET %s WHERE %s"
    DETELE_SQL_FORMAT = "DELETE FROM `%s` WHERE %s"
    # Correctly spelled alias; the misspelled name stays for existing callers.
    DELETE_SQL_FORMAT = DETELE_SQL_FORMAT

    def __init__(self, host, post, database, username, password):
        """Store the connection settings and connect.

        :param host: server host.
        :param post: stored as ``self.post``. NOTE(review): presumably the
            server port (misspelled); the name is kept because callers may
            pass it by keyword.
        :param database: database name.
        :param username: login user.
        :param password: login password.
        """
        self.conn = None
        self.host = host
        self.post = post
        self.database = database
        self.username = username
        self.password = password
        self._connect()
Beispiel #16
0
        # signal.signal(signal.SIGINT, signal_process)
        # signal.signal(signal.SIGQUIT, signal_process)
        # signal.signal(signal.SIGUSR1, lambda a, b: profiling_signal_handler("downloader", a, b))

        file_path = 'downloader_smart_test.toml'
        opt, args = getopt.getopt(sys.argv[1:], 'f:', ['help'])
        for name, value in opt:
            if name == "-f":
                file_path = value
            elif name in ("-h", "--help"):
                usaget()
                sys.exit()
            else:
                assert False, "unhandled option"

        if file_path is None or file_path.strip() == '':
            raise Exception('配置文件路径错误..')

        with open(file_path, 'rb') as fp:
            config = pytoml.load(fp)
        logger_name = config["local_server"].get('name') + str(
            config["local_server"].get('port'))
        logger = LogHandler(logger_name)
        config['log'] = logger
        logger.info(logger_name)
        logger.info('开始启动服务....')
        main(config)
    except getopt.GetoptError:
        raise Exception(
            '参数读取错误...argv = {argv}'.format(argv=" ".join(sys.argv)))