Esempio n. 1
0
import socket
import Queue
from crawlstuff import CommonHelper, TimeBomb
from commonstuff import Config

# Default server settings; handed to Config below as the baseline values.
default_config = {
    # HOST/PORT form the address the SQueue listening socket binds to.
    "HOST":         "127.0.0.1",
    "PORT":         5000,
    "RECV_BUFFER":  4096,     # NOTE(review): presumably max bytes per recv(); larger transfers may get closed -- confirm
    "TMP_DIR":      "./tmp/",
    "QUEUE_FILE":   "queue.pkl",
    "TIME_OUT":      60           # optional; presumably the timeout argument given to select() -- confirm
}

# Create the config object from the defaults, then call from_json with
# 'server.json' (presumably to override defaults from that file -- confirm).
CONFIG = Config('./', default_config)
CONFIG.from_json('server.json')
# Integer tags; presumably distinguish "get" from "put" requests -- confirm.
TAG_GET = 0
TAG_PUT = 1

class SQueue(object):
    """Queue server: owns the listening socket that accepts requests."""

    def __init__(self):
        """Create, configure, bind and start the listening TCP socket.

        The socket is stored on ``self.server``. The bind address is read
        from ``CONFIG['HOST']`` and ``CONFIG['PORT']``.
        """
        super(SQueue, self).__init__()

        # IPv4 stream socket, switched to non-blocking mode.
        self.server = listener = socket.socket(socket.AF_INET,
                                               socket.SOCK_STREAM)
        listener.setblocking(0)

        # Enable SO_REUSEADDR before binding.
        listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

        address = (CONFIG['HOST'], CONFIG['PORT'])
        listener.bind(address)

        # Accept up to 10 pending connections in the backlog.
        listener.listen(10)
Esempio n. 2
0
    "MONGO_PORT":         "32773",
    "RECV_BUFFER":        1024,
    "SERVER_BUFFER":      4096,
    "MAX_LINK_COUNT":     256,
    "PAGE_CACHE_SIZE":    5,
    "TMP_DIR":            "./tmp/",
    "BLOOM_FILE":         "bloom.pkl",
    "CRAWL_MAX_DEPTH":    999,
    "REQUEST_TIME":       5,
    "CRAWL_SCALE":        ["m.byr.cn"],
    "CRAWL_SEEDS":        ["http://m.byr.cn/section"],
    "THREADING_NUM":      3,
    "BLOOM_CAPACITY":     1000000
}

# Create the client config from default_config, then call from_json with
# 'client.json' (presumably to override defaults from that file -- confirm).
CONFIG = Config('./', default_config)
CONFIG.from_json('client.json')
# Module-level helper instances.
helper = CommonHelper()
# Mongo helper built from the configured MONGO_HOST / MONGO_PORT values.
mongo_helper = MongoHelper(CONFIG['MONGO_HOST'], CONFIG['MONGO_PORT'])


class UrlBloom:
    '''BloomFilter: check elements repetition'''
    def __init__(self, _capacity=1000000, _error_rate=0.00001):
        self.is_full = False
        # determine if open backup bloom data by time
        if CONFIG.get('BACKUP', 0) == 1:
            self.bomb = TimeBomb(CONFIG['TMP_DIR'] + CONFIG['BLOOM_FILE'])
            self.filter = self.bomb.load()
            if self.filter is None:
                self.filter = BloomFilter(capacity=_capacity, error_rate=_error_rate)