import socket
import Queue

from crawlstuff import CommonHelper, TimeBomb
from commonstuff import Config

# Built-in default settings for the queue server. They are handed to Config
# together with './' and then 'server.json' is loaded through from_json
# (presumably overriding these defaults -- TODO confirm against Config).
default_config = {
    "HOST": "127.0.0.1",
    "PORT": 5000,
    # Original note: "transfer close: beyond limit" (presumably the maximum
    # number of bytes accepted per receive -- TODO confirm).
    "RECV_BUFFER": 4096,
    "TMP_DIR": "./tmp/",
    "QUEUE_FILE": "queue.pkl",
    # Original note: optional; meant as the timeout parameter for select.
    "TIME_OUT": 60
}
CONFIG = Config('./', default_config)
CONFIG.from_json('server.json')

# Request tags (presumably distinguish "get" from "put" requests --
# TODO confirm against the request handling code).
TAG_GET = 0
TAG_PUT = 1


class SQueue(object):
    """Queue server that listens for requests.

    On construction a non-blocking TCP/IPv4 socket is created, bound to
    ``CONFIG['HOST']``/``CONFIG['PORT']`` and put into listening mode; it
    is stored in ``self.server``.
    """

    def __init__(self):
        """Create, configure, bind and start listening on the server socket.

        Raises:
            Exception: whatever the socket setup or the CONFIG lookup
                raises (e.g. ``socket.error`` when the address is already
                in use). The socket is closed before the exception is
                propagated so a failed start does not leak a descriptor.
        """
        super(SQueue, self).__init__()
        # Create the TCP/IPv4 socket.
        self.server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            self.server.setblocking(0)
            # Allow the address to be reused; must be set before bind().
            self.server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            self.server.bind((CONFIG['HOST'], CONFIG['PORT']))
            self.server.listen(10)
        except Exception:
            # Setup failed half-way: release the descriptor, then let the
            # original error reach the caller unchanged.
            self.server.close()
            raise
"MONGO_PORT": "32773", "RECV_BUFFER": 1024, "SERVER_BUFFER": 4096, "MAX_LINK_COUNT": 256, "PAGE_CACHE_SIZE": 5, "TMP_DIR": "./tmp/", "BLOOM_FILE": "bloom.pkl", "CRAWL_MAX_DEPTH": 999, "REQUEST_TIME": 5, "CRAWL_SCALE": ["m.byr.cn"], "CRAWL_SEEDS": ["http://m.byr.cn/section"], "THREADING_NUM": 3, "BLOOM_CAPACITY": 1000000 } CONFIG = Config('./', default_config) CONFIG.from_json('client.json') helper = CommonHelper() mongo_helper = MongoHelper(CONFIG['MONGO_HOST'], CONFIG['MONGO_PORT']) class UrlBloom: '''BloomFilter: check elements repetition''' def __init__(self, _capacity=1000000, _error_rate=0.00001): self.is_full = False # determine if open backup bloom data by time if CONFIG.get('BACKUP', 0) == 1: self.bomb = TimeBomb(CONFIG['TMP_DIR'] + CONFIG['BLOOM_FILE']) self.filter = self.bomb.load() if self.filter is None: self.filter = BloomFilter(capacity=_capacity, error_rate=_error_rate)