def __init__(self, *args, **kwargs):
    """Initialize the Lianjia second-hand-housing spider.

    Sets up per-district crawl state, the listing URL template, and the
    redis connection.  ``kwargs`` may carry ``page`` (max pages to crawl);
    an unparseable value falls back to 100.
    """
    # Per-district crawl bookkeeping: ``page`` = total pages discovered,
    # ``now`` = page currently being fetched.
    district_names = [
        "heping", "nankai", "hexi", "hebei", "hedong", "hongqiao",
        "xiqing", "beichen", "dongli", "jinnan", "tanggu", "kaifaqu",
        "diyidajie", "dierdajie", "disandajie", "disidajie", "diwudajie",
    ]
    self.areas = [{"name": name, "page": 0, "now": 1} for name in district_names]
    self.area_now = 0  # index into self.areas of the district being crawled
    self.util = util()
    self.list_url = 'https://tj.lianjia.com/ershoufang/{area}/pg{page}/'
    self.r = redis.Redis(host=settings.REDIS['host'], port=settings.REDIS['port'])
    super(CrawlLianjiaSpider, self).__init__(*args, **kwargs)
    if kwargs and "page" in kwargs:
        try:
            self.page_all = int(kwargs['page'])
        except (TypeError, ValueError):
            # int() raises ValueError for non-numeric strings and TypeError
            # for None; the old code caught TypeError only, and used the
            # Python-2-only ``except X, e`` syntax.
            self.page_all = 100
def __init__(self):
    """Set up the helper, the redis connection, and the label->field map."""
    self.r = redis.Redis(host=settings.REDIS['host'], port=settings.REDIS['port'])
    self.util = util()
    # Maps the Chinese labels scraped from a Lianjia detail page onto the
    # item field names used downstream.
    self.property_map = {
        "房屋户型": 'layout',
        "所在楼层": 'flood',
        "建筑面积": 'area',
        "户型结构": 'apartment_structure',
        "建筑类型": 'building_type',
        "装修情况": 'renovation',
        "梯户比例": 'ladder',
        "供暖方式": 'heating',
        "产权年限": 'property_term',
        "交易权属": 'ownership',
        "挂牌时间": 'list_time',
        "上次交易": 'last_trade',
        "房屋朝向": 'direction',
        "房屋用途": 'purpose',
        "房屋年限": 'hold_years',
        "抵押信息": 'mortgage',
        "房本备件": 'house_register',
        "核心卖点": 'core_point',
        "周边配套": 'periphery',
        "交通出行": 'traffic',
        "小区介绍": 'residential_desc',
        "户型介绍": 'layout_desc',
        "配备电梯": 'elevator',
    }
def __init__(self):
    """Prepare login/verify endpoints, per-site paging state, and the report form."""
    # Helpers and the local redis connection.
    self.util = util()
    self.verify_tool = Verify()
    self.r = redis.Redis(host='127.0.0.1')
    # Baidu CAS login URL plus the verification-image endpoint
    # ({rand} is substituted per request); the image is saved locally.
    self.login_url = 'https://cas.baidu.com/?action=login'
    self.verify_url = 'http://cas.baidu.com/?action=image&key={rand}'
    self.verify_save_name = 'verify.jpg'
    # Paging window.
    self.page_now = 1
    self.max_page = 51
    # Per-site crawl state, keyed by Baidu site id.
    self.sites_map = {
        '8918649': {'name': 'm.91pme.com', 'page_now': 1, 'time': None},
        '7802984': {'name': '91pme.com', 'page_now': 1, 'time': None},
        '8918810': {'name': 'mm.91pme.com', 'page_now': 1, 'time': None},
    }
    # POST body for the "latest visit" report.
    self.formdata = {
        "siteId": "7802984",
        "order": "start_time,desc",
        "offset": "0",
        "pageSize": "100",
        "tab": "visit",
        "timeSpan": "14",
        "indicators": "start_time,area,source,access_page,searchword,visitorId,ip,visit_time,visit_pages",
        "reportId": "4",
        "method": "trend/latest/a",
        "queryId": "",
    }
    # Filled in during the crawl — presumably latest access time per site.
    self.lastest_access_time = {}
def __init__(self, *args, **kwargs):
    """Initialize the fx678 broker-article spider.

    ``kwargs`` may carry ``max`` (pages to crawl per article type); a
    non-numeric value now keeps the default of 1 instead of crashing the
    spider at startup, matching the guarded int() conversion used by the
    other spiders in this file.
    """
    self.util = util()
    self.url_format = 'http://brokers.fx678.com/articlelist/{id}/{page}'
    self.type_index = 0  # index of the article type currently crawled
    self.page_index = 1  # page currently being fetched
    self.max_page = 1    # crawl at most this many pages per type
    super(CrawlFx678ArticleSpider, self).__init__(*args, **kwargs)
    if kwargs and "max" in kwargs:
        try:
            self.max_page = int(kwargs['max'])
        except (TypeError, ValueError):
            # Bad -a max=... value: keep the default rather than aborting.
            pass
def __init__(self, *args, **kwargs):
    """Initialize the fx678 economic-calendar spider.

    Optional ``kwargs['args']`` is a comma-separated list of ``key:value``
    pairs:
      start - crawl start date, either YYYY-MM-DD or an integer day offset
              from today (invalid values keep the default start)
      max   - number of days to crawl forward from the start date
      after - fallback day span from *today* when ``max`` is absent
      jiedu - stored as-is

    NOTE(review): ``self.date_now``, ``self.max_days`` and
    ``self.after_days`` are read before being assigned in this method, so
    they are presumably class-level defaults — confirm on the class body.
    """
    self.util = util()
    date_diff = datetime.timedelta(days=1)
    self.date_end = self.date_now + date_diff
    super(CrawlFx678CalendarSpider, self).__init__(*args, **kwargs)
    if 'args' in kwargs:
        # Parse "k1:v1,k2:v2" into a dict; parts beyond the second ":" in
        # a pair are ignored, exactly as before.  (Replaces a pointless
        # nested comprehension.)
        params = {}
        for pair in kwargs['args'].split(","):
            parts = pair.split(":")
            params[parts[0]] = parts[1]
        if "start" in params:
            try:
                date_pat = re.compile(r"\d{4}\-\d{2}\-\d{2}")
                if len(date_pat.findall(params['start'])) == 0:
                    # Not a date string: treat it as a day offset from now.
                    timedelta = datetime.timedelta(days=int(params['start']))
                    date_start = datetime.datetime.now() + timedelta
                else:
                    date_start = datetime.datetime.strptime(params['start'], "%Y-%m-%d")
                self.date_now = date_start
            except ValueError as error:
                # print() call form works on both Python 2 and 3
                # (was a Python-2-only print statement).
                print(params['start'] + ' 不是正确格式的时间,已默认从今天开始抓取')
        if "max" in params:
            try:
                self.max_days = int(params['max'])
            except ValueError as err:
                print(params['max'] + ' 不是正确的抓取天数,已默认抓取全部数据')
        if "after" in params:
            try:
                self.after_days = int(params['after'])
            except ValueError as err:
                print(params['after'] + ' 不是正确的向后抓取天数,已默认抓取今天之后60天的数据')
        if "jiedu" in params:
            self.jiedu = params['jiedu']
        # Prefer an explicit day count; otherwise span ``after_days`` from today.
        if self.max_days is not None:
            date_diff = datetime.timedelta(days=int(self.max_days))
            self.date_end = self.date_now + date_diff
        else:
            date_diff = datetime.timedelta(days=int(self.after_days))
            self.date_end = datetime.datetime.now() + date_diff
def __init__(self):
    """Set up state for pulling the Baidu bounce-rate report."""
    self.all = []  # accumulated result rows
    # Export name tagged with the current unix timestamp.
    self.data_name = 'baidu_rate_%d' % int(time.time())
    # Helpers, verification-image tooling, and the local redis client.
    self.util = util()
    self.verify_tool = Verify()
    self.verify_save_name = 'verify.jpg'
    self.r = redis.Redis(host='127.0.0.1')
    # Baidu CAS login and verification-image endpoints ({rand} filled per request).
    self.login_url = 'https://cas.baidu.com/?action=login'
    self.verify_url = 'http://cas.baidu.com/?action=image&key={rand}'
    self.page_now = 0
    # Sites to pull, keyed by Baidu site id.
    self.sites_map = {
        '8918649': {'name': 'm.91pme.com', 'page_now': 1},
        '7802984': {'name': '91pme.com', 'page_now': 1},
    }
    # Request payload template for the report endpoint.
    self.formdata = {
        "productId": "fcWord,0",
        "fcPlanId": "-1",
        "fcUnitId": "-1",
        "siteId": "8918649",
        "st": "1512489600000",
        "et": "1512489600000",
        "indicators": "",
        "order": "bounce_ratio,desc",
        "offset": "0",
        "target": "-1",
        "flag": "fcWord",
        "userId": "0",
        "fcWordType": "fcSearchWord",
        "clientDevice": "all",
        "reportId": "6",
        "method": "pro/product/a",
        "queryId": "",
    }
    # Metric columns requested from the report.
    self.indicators = [
        "show_count", "clk_count", "cost_count", "ctr", "cpm",
        "pv_count", "visit_count", "visitor_count", "new_visitor_count",
        "new_visitor_ratio", "in_visit_count", "bounce_ratio",
        "avg_visit_time", "avg_visit_pages", "arrival_ratio",
        "trans_count", "trans_ratio", "avg_trans_cost",
        "income", "profit", "roi",
    ]
def __init__(self, *args, **kwargs):
    """Initialize the Sogou-Weixin article-search spider.

    Optional ``kwargs['args']`` is a comma-separated ``key:value`` list;
    the presence of a ``hot`` key restricts the crawl to hot keywords.
    """
    super(CrawlWeixinSearchSpider, self).__init__(*args, **kwargs)
    self.util = util()
    self.r = redis.Redis(host=REDIS['host'], port=REDIS['port'])
    self.page_url = "http://weixin.sogou.com/weixin?usip=&query={query}&ft=&tsn=1&et=&interation=&type=2&wxid=&page={page}&ie=utf8"
    self.type_index = 0
    # One entry per search keyword, with paging state.
    self.type = [{'name': '五常大米', 'page_now': 1, 'page_all': 1}]
    self.type_now = self.type[0]
    self.only_hot = False
    self.typename = self.type_now['name']
    self.referer = "http://weixin.sogou.com/weixin?type=2&s_from=input&query={query}&ie=utf8&_sug_=y&_sug_type_=&w=01019900&sut=10939&sst0={time}&lkt=6%2C1513059170545%2C1513059180409"
    if 'args' in kwargs:
        # Parse "k1:v1,k2:v2"; parts beyond the second ":" in a pair are
        # ignored, matching the original behaviour.  (Replaces a pointless
        # nested comprehension.)
        params = {}
        for pair in kwargs['args'].split(","):
            parts = pair.split(":")
            params[parts[0]] = parts[1]
        if "hot" in params:
            self.only_hot = True
            # print() call form works on both Python 2 and 3
            # (was a Python-2-only print statement).
            print("Only crawl hot keywords")
def __init__(self):
    """Hold the Lianjia house-stat endpoint template, redis handle and helper."""
    # {house_id}/{residential_id} are substituted per request.
    self.base_url = "https://tj.lianjia.com/ershoufang/housestat?hid={house_id}&rid={residential_id}"
    self.util = util()
    self.r = redis.Redis(host=REDIS['host'])
    self.residential_id = None  # presumably set later, before requests go out — confirm
def __init__(self):
    """Create the shared helper object and a redis client on localhost."""
    self.r = redis.Redis(host='127.0.0.1')
    self.util = util()
def __init__(self):
    """Create the helper object, a local redis client, and the item cursor."""
    self.item_index = 0  # position of the item currently being processed
    self.util = util()
    self.r = redis.Redis(host="127.0.0.1", port=6379, db=0)
def __init__(self):
    """Initialize the Weibo feed spider.

    BUGFIX: the query string previously contained the character '¤'
    ("tab=home¤t_page=") — the literal "&current_page" had been
    HTML-entity-decoded at some point ("&curren" -> '¤'), which destroyed
    the current_page parameter.  Restored to "&current_page=".
    """
    self.base_url = "https://d.weibo.com/p/aj/v6/mblog/mbloglist?ajwvr=6&domain=102803_ctg1_1760_-_ctg1_1760&pagebar=0&tab=home&current_page={current_page}&pre_page={pre_page}&page={page}&pl_name=Pl_Core_NewMixFeed__3&id=102803_ctg1_1760_-_ctg1_1760&script_uri=/&feed_type=1&domain_op=102803_ctg1_1760_-_ctg1_1760&__rnd={time}"
    # All three paging counters start at the first page.
    self.page = self.current_page = self.pre_page = 1
    self.util = util()
    self.login_time = 0  # login bookkeeping (0 = not logged in yet) — TODO confirm semantics
    self.login_cmd = CmdWeiboLogin()
def __init__(self):
    """Nothing to set up beyond the shared helper object."""
    self.util = util()
def __init__(self, *args, **kwargs):
    """Initialize the Jin10 article spider.

    ``kwargs`` may carry ``all``.  Spider arguments arrive as strings, so
    the old ``bool(kwargs['all'])`` was True for ANY non-empty value —
    including "False" and "0".  Interpret the common falsy spellings
    instead; genuine booleans still work (str(True).lower() == 'true').
    """
    self.util = util()
    super(CrawlJin10ArticleSpider, self).__init__(*args, **kwargs)
    if kwargs and "all" in kwargs:
        self.crawl_all_page = str(kwargs['all']).strip().lower() not in (
            '', '0', 'false', 'no', 'none')
def __init__(self):
    """Prepare the Lianjia see-record endpoint, helper, and redis connection."""
    # {id} is substituted with a house id before each request.
    self.baseurl = 'https://tj.lianjia.com/ershoufang/houseseerecord?id={id}'
    self.house_id = None  # presumably filled in per house — confirm on callers
    self.util = util()
    self.r = redis.Redis(host=settings.REDIS['host'], port=settings.REDIS['port'])