def GET(self):
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        proxy = Proxy()
        proxy.set_value(
            ip=inputs.get('ip'),
            port=inputs.get('port'),
            country=inputs.get('country', None),
            anonymity=inputs.get('anonymity', None),
            https=inputs.get('https', 'no'),
            speed=inputs.get('speed', -1),
            source=inputs.get('source', name),
        )
        utils.sql_insert_proxy(sql, name, proxy)

        # ip is a string, so it must be quoted inside the statement
        command = "SELECT ip FROM {0} WHERE ip='{1}' AND port={2}".format(
            name, inputs.get('ip'), inputs.get('port'))
        res = sql.query_one(command)
        # the insert succeeded if the row is now present
        return res is not None
    except Exception:
        pass
    return False
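# The handler above interpolates request parameters straight into the SQL
# string, which is injection-prone. A minimal sketch of the safer
# parameterized form, assuming a DB-API 2.0 connection (sqlite3 is used
# here only to keep the demo self-contained; MySQL drivers use %s
# placeholders instead of ?):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE proxy (ip TEXT, port INTEGER)')
conn.execute('INSERT INTO proxy (ip, port) VALUES (?, ?)', ('1.2.3.4', 8080))

# the driver quotes and escapes the bound values itself
row = conn.execute(
    'SELECT ip FROM proxy WHERE ip = ? AND port = ?',
    ('1.2.3.4', 8080),
).fetchone()
print(row is not None)  # True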
class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)

        self.urls = []
        self.headers = {}
        self.timeout = 10
        self.sql = SqlHelper()
        self.dir_log = 'log/proxy/%s' % self.name
        self.is_record_web_page = False

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }

        utils.make_dir(self.dir_log)
        command = utils.get_create_table_command(config.free_ipproxy_table)
        self.sql.execute(command)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            yield Request(
                url=url,
                headers=self.headers,
                meta=self.meta,
                dont_filter=True,
                callback=self.parse_page,
                errback=self.error_parse,
            )

    def parse_page(self, response):
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request

    def add_proxy(self, proxy):
        utils.sql_insert_proxy(self.sql, config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f')
            # the with-block closes the file; no explicit close() is needed
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
def __init__(self, red, key, user):
    self.key = key
    self.red = red

    data = json.loads(user)
    self.product_id = data.get('product_id')
    self.url = data.get('url')
    self.email = data.get('email')
    self.guid = data.get('guid')

    self.spider_name = 'tb_comment'
    self.spargs = data
    self.sql = SqlHelper()
    self.spargs['red'] = self.red
    self.spargs['sql'] = self.sql

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % self.product_id,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG,
    )
def _import_data_to_table(self, file, table_name, name_list, type_list):
    result = 0

    # read the file; the first row denotes the table head
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        firstLine = True
        for row in csvreader:
            if firstLine:
                firstLine = False
            elif len(row) == len(name_list):
                # convert each cell into a value usable in an SQL statement
                values = []
                for i in range(len(row)):
                    v = SqlHelper.type_to_dbstatement(row[i], type_list[i])
                    values.append(v)
                insert_query = SqlHelper.sql_insert_query(table_name, values)
                success = SqlHelper.execute_statement(self.connection, insert_query)
                if success:
                    result += 1
            else:
                print('ignoring unexpected row: {0}'.format(row))

    # number of rows successfully inserted
    return result
def _extract_name_and_types(file):
    name_list = []
    type_list = []

    # read the file; the first row denotes the table head
    with open(file, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        firstLine = True
        for row in csvreader:
            if firstLine:
                # derive sanitized column names from the header row
                firstLine = False
                for i in range(len(row)):
                    name = SqlHelper.sanitize_colname(row[i])
                    name_list.append(name)
                    type_list.append(DbType.NULL)
            elif len(row) == len(name_list):
                # widen each column type to cover every value seen so far
                for i in range(len(row)):
                    type_class = SqlHelper.classify_dbtype(row[i])
                    type_list[i] = DbType.order_max(type_list[i], type_class)

    return name_list, type_list
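# A minimal, self-contained sketch of the type-widening idea used by
# _extract_name_and_types above. The real code delegates to
# SqlHelper.classify_dbtype and DbType.order_max; the helpers below
# (infer_cell_type, widen, TYPE_ORDER) are hypothetical stand-ins, not
# the actual API.

TYPE_ORDER = ['NULL', 'INTEGER', 'REAL', 'TEXT']  # narrowest to widest

def infer_cell_type(cell):
    if cell == '':
        return 'NULL'
    try:
        int(cell)
        return 'INTEGER'
    except ValueError:
        pass
    try:
        float(cell)
        return 'REAL'
    except ValueError:
        return 'TEXT'

def widen(a, b):
    # keep the wider of two candidate column types
    return a if TYPE_ORDER.index(a) >= TYPE_ORDER.index(b) else b

rows = [['1', '3.5', 'abc'], ['2', '7', 'def'], ['', '1.2', '9']]
types = ['NULL'] * 3
for row in rows:
    types = [widen(t, infer_cell_type(c)) for t, c in zip(types, row)]
print(types)  # ['INTEGER', 'REAL', 'TEXT']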
def __init__(self, *a, **kw):
    super(RecipeDetail, self).__init__(*a, **kw)

    self.dir_name = 'log/%s' % self.name
    self.sql = SqlHelper()
    self.init()
    utils.make_dir(self.dir_name)
def __init__(self, *a, **kw):
    super(GameUrls, self).__init__(*a, **kw)

    self.dir_game = 'log/%s' % self.name
    self.sql = SqlHelper()
    self.init()
    utils.make_dir(self.dir_game)
def __init__(self, *a, **kw):
    super(GameInfo, self).__init__(*a, **kw)

    self.dir_game = 'log/%s' % self.name
    self.sql = SqlHelper()
    self.init()
    utils.make_dir(self.dir_game)

    self.error_count = 0
def __init__(self, name=None, **kwargs):
    super(Validator, self).__init__(name, **kwargs)

    self.sql = SqlHelper()
    self.dir_log = 'log/validator/%s' % self.name
    self.timeout = 10
    self.urls = []
    self.headers = None
    self.success_mark = ''
def __init__(self, *a, **kw):
    super(BaseSpider, self).__init__(*a, **kw)

    self.urls = []
    self.headers = {}
    self.timeout = 10
    self.sql = SqlHelper()
    self.dir_log = 'log/proxy/%s' % self.name
def randitem(spargs):
    guid = spargs.get('guid', 0)

    utils.push_redis(guid, 0, '正在随机产生商品链接', save_to_mysql=False)

    url = 'https://diviner.taobao.com/diviner?p=610009&callback=jsonpCallbackMoreGood&lid=1&uuid=122270672' \
          '.1492415671516609876050.1492415672.1492415672.1492415672.1&pin=&lim=100&ec=utf-8&_=1492415813682'
    headers = {
        'Host': 'diviner.taobao.com',
        'Referer': 'https://www.taobao.com/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0',
    }
    cookies = {
        '__jda': '122270672.1492415671516609876050.1492415672.1492415672.1492415672.1',
        '__jdb': '122270672.1.1492415671516609876050|1.1492415672',
        '__jdc': '122270672',
        '__jdv': '122270672|direct|-|none|-|1492415671524',
        '__jdu': '1492415671516609876050',
    }

    r = requests.get(url=url, headers=headers, cookies=cookies, timeout=20)
    pattern = re.compile(r'"sku":(\d+),', re.S)
    ids = re.findall(pattern, r.text)
    id = random.choice(ids)
    url = 'https://item.taobao.com/%s.html' % str(id)

    utils.push_redis(guid, 0, '生成商品链接:<a href="%s" target="_blank">%s' % (url, url),
                     save_to_mysql=False)

    sql = SqlHelper()
    command = "SELECT id FROM {table} WHERE id={product_id}". \
        format(table=config.tb_item_table, product_id=id)
    result = sql.query_one(command)

    # if the product is not yet in the database, crawl it now
    if result is None:
        cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
              '-a product_id={product_id} -a url={url};'. \
            format(url=str(url), name='tb', dir=settings.BASE_DIR, guid=guid, product_id=id)
        subprocess.Popen(cmd, shell=True)
    else:
        # otherwise read the existing analysis results straight from the database
        command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
            format(config.analysis_item_table, id)
        result = sql.query(command)
        for res in result:
            utils.push_redis(guid, res[1], res[2], res[3], save_to_mysql=False)
def __init__(self):
    super(Crawler, self).__init__()

    self.album_prefix = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20={0}&page={1}'
    self.image_prefix = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id={0}&album_id={1}&page={2}'

    self.image_pattern = re.compile(r'img.*290x10000.jpg', re.U)
    self.image_name_pattern = re.compile(r'"picId":"(.*?)"', re.U)
    self.model_pattern = re.compile(r'<a class="lady-name" href="(.*?)".*>(.*?)</a>', re.U)
    self.album_pattern = re.compile(r'.*album_id=(.*?)&.*', re.U)

    self.links = []
    self.ids = []
    self.names = []
    self.sql = SqlHelper()
def GET(self):
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        ip = inputs.get('ip')

        command = "DELETE FROM {0} WHERE ip='{1}'".format(name, ip)
        sql.execute(command)

        command = "SELECT ip FROM {0} WHERE ip='{1}'".format(name, ip)
        res = sql.query_one(command)
        # the delete succeeded if the row is gone
        return res is None
    except Exception:
        pass
    return False
def _create_table(self, file, indices):
    table_name = SqlHelper.get_tablename(file)
    result_create = 0
    result_data = 0
    result_key = 0

    # create table in database
    name_list, type_list = SqlImporter._extract_name_and_types(file)
    if len(name_list) > 0:
        create_query = SqlHelper.sql_create_table(table_name, name_list, type_list)
        create_success = SqlHelper.execute_statement(self.connection, create_query)
        self.connection.commit()

        if create_success:
            result_create = 1

            # import data into table
            result_data = self._import_data_to_table(file, table_name, name_list, type_list)
            self.connection.commit()

            # attempt to set primary key on the first integer-typed column
            primary_key = -1
            for i in range(len(type_list)):
                t = type_list[i]
                if t == DbType.INTEGER:
                    success = self._set_primarykey(table_name, i, name_list, type_list)
                    self.connection.commit()
                    if success:
                        result_key = 1
                        primary_key = i
                        break

            # set indices on id-like columns if requested
            if indices:
                for i in range(len(type_list)):
                    t = type_list[i]
                    if i != primary_key and t == DbType.INTEGER and (
                            name_list[i] == 'id' or '_id_' in name_list[i]
                            or name_list[i].endswith('_id')):
                        self._set_index(table_name, i, name_list, type_list)

    return result_create, result_data, result_key
def _search_foods_by_nutrient(connection, nutrient, food_name):
    script = []
    rand_id = SqlHelper._get_rand_tableno()
    for l in sql_script_food_descr_nutrient:
        script.append(l.format(rand_id, nutrient, food_name, 'ASC'))
    result_index = 1
    return connection.queries(script, result_index)
def __init__(self, name=None, **kwargs):
    super(JDSpider, self).__init__(name, **kwargs)

    self.product_id = kwargs.get('product_id', -1)
    self.log('product_id:%s' % self.product_id)

    self.item_table = 'item_%s' % self.product_id
    self.product_page = '%s_page' % self.product_id
    self.log_dir = 'log/%s' % self.product_id
    self.is_record_page = False
    if self.is_record_page:
        utils.make_dir(self.log_dir)

    self.sql = SqlHelper()
    self.red = redis.StrictRedis(host=config.redis_host,
                                 port=config.redis_part,
                                 db=config.redis_db,
                                 password=config.redis_pass)
def _search_nutrients_by_fdcid(connection, fdc_id):
    script = []
    rand_id = SqlHelper._get_rand_tableno()
    for l in sql_script_nutrition_list_for_fdcid:
        script.append(l.format(rand_id, fdc_id))
    result_index = 1
    return _get_nutrient_list(connection.queries(script, result_index))
class SendSms(object):
    def __init__(self):
        self.sql = SqlHelper()
        self.weather_table_name = config.weather_table
        self.user_table_name = config.user_table

    def send_sms(self):
        command = "SELECT * FROM {};".format(self.user_table_name)
        self.sql.execute(command)
        users = self.sql.cursor.fetchall()
        if users is not None:
            for user in users:
                utils.log('send_sms get user info user:%s' % str(user))

                # only send the SMS when the user-configured time matches now
                user_time = user[5]
                time_info = user_time.split(':')
                u_hour = time_info[0]
                u_minute = time_info[1]

                # current system time
                s_hour = datetime.datetime.now().hour
                s_minute = datetime.datetime.now().minute

                if int(u_hour) == s_hour and int(u_minute) == s_minute:
                    utils.log('send sms to user:%s' % str(user))

                    command = ("SELECT * FROM {0} WHERE city_name='{1}' "
                               "ORDER BY id DESC LIMIT 1;"
                               .format(self.weather_table_name, user[3]))
                    self.sql.execute(command)
                    weather = self.sql.cursor.fetchone()
                    if weather is not None:
                        temp_code = 'SMS_41855112'
                        phone = user[2]
                        info = {
                            'name': user[1],
                            'city': user[3],
                            'weather': weather[15],
                            'temp': '%s ~ %s' % (weather[9], weather[8]),
                            'aqilevel': utils.get_aqi_level_info(weather[12]),
                        }

                        sms = AliyunSms()
                        sms.send_sms(temp_code, info, phone)
def _print_import_result(status, file):
    print('table: ' + SqlHelper.get_tablename(file))
    if status[0] > 0:
        print('- created')
    if status[1] > 0:
        print('- filled')
    if status[2] > 0:
        print('- primary key detected')
    print('---')
def _set_primarykey(self, table_name, primary_key, col_list=[], type_list=[]):
    # primary_key is a column index (see the caller in _create_table), but
    # the ALTER TABLE statement needs the column name, so look it up
    sql_stm = 'ALTER TABLE {0} ADD PRIMARY KEY ({1});'.format(
        table_name, col_list[primary_key])
    return SqlHelper.execute_statement(self.connection, sql_stm)
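# Quick sanity check of the statement the fixed version builds; the table
# and column names below are made up for illustration:
col_list = ['fdc_id', 'description']
stm = 'ALTER TABLE {0} ADD PRIMARY KEY ({1});'.format('food', col_list[0])
print(stm)  # ALTER TABLE food ADD PRIMARY KEY (fdc_id);
# interpolating the index itself would yield the invalid statement
# "ALTER TABLE food ADD PRIMARY KEY (0);"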
def GET(self):
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')

        command = "SELECT * FROM {0}".format(name)
        result = sql.query(command)

        data = [{
            'ip': item[1],
            'port': item[2],
            'speed': item[6],
        } for item in result]
        data = json.dumps(data, indent=4)
        return data
    except Exception:
        pass
    return []
def GET(self):
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        anonymity = inputs.get('anonymity', None)
        https = inputs.get('https', None)
        order = inputs.get('order', 'speed')
        sort = inputs.get('sort', 'asc')
        count = inputs.get('count', 100)

        if anonymity is None and https is None:
            command = "SELECT * FROM {name} ORDER BY {order} {sort} LIMIT {count}". \
                format(name=name, order=order, sort=sort, count=count)
        elif anonymity is not None and https is None:
            command = "SELECT * FROM {name} WHERE anonymity='{anonymity}' ORDER BY {order} {sort} " \
                      "LIMIT {count}". \
                format(name=name, anonymity=anonymity, order=order, sort=sort, count=count)
        elif anonymity is None and https is not None:
            command = "SELECT * FROM {name} WHERE https='{https}' ORDER BY {order} {sort} LIMIT {count}". \
                format(name=name, https=https, order=order, sort=sort, count=count)
        else:
            command = "SELECT * FROM {name} WHERE anonymity='{anonymity}' AND https='{https}' " \
                      "ORDER BY {order} {sort} LIMIT {count}". \
                format(name=name, anonymity=anonymity, https=https, order=order, sort=sort,
                       count=count)

        result = sql.query(command)
        data = [{
            'id': item[0],
            'ip': item[1],
            'port': item[2],
            'anonymity': item[4],
            'https': item[5],
            'speed': item[6],
            'save_time': str(item[8]),
        } for item in result]
        data = json.dumps(data, indent=4)
        return data
    except Exception as e:
        utils.log('select exception msg:%s' % e)
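# ORDER BY columns and sort direction cannot be bound as query parameters,
# so interpolating the raw `order` and `sort` inputs above is an injection
# vector even with a parameterized driver. A minimal whitelist sketch; the
# allowed column names are assumptions based on the fields the handler
# returns, not a confirmed schema:

ALLOWED_ORDER = {'id', 'ip', 'port', 'anonymity', 'https', 'speed', 'save_time'}
ALLOWED_SORT = {'asc', 'desc'}

def safe_order_clause(order, sort):
    # reject anything that is not a known column name or sort keyword
    if order not in ALLOWED_ORDER or sort.lower() not in ALLOWED_SORT:
        raise ValueError('bad order/sort parameter: %r %r' % (order, sort))
    return 'ORDER BY %s %s' % (order, sort.upper())

print(safe_order_clause('speed', 'asc'))  # ORDER BY speed ASC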
def GET(self):
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        anonymity = inputs.get('anonymity', None)
        https = inputs.get('https', None)
        sort = inputs.get('sort', 'speed')
        count = inputs.get('count', 100)

        if anonymity is None and https is None:
            command = "SELECT * FROM {0} ORDER BY {1} LIMIT {2}".format(name, sort, count)
        elif anonymity is not None and https is None:
            command = "SELECT * FROM {0} WHERE anonymity='{1}' ORDER BY {2} LIMIT {3}". \
                format(name, anonymity, sort, count)
        elif anonymity is None and https is not None:
            command = "SELECT * FROM {0} WHERE https='{1}' ORDER BY {2} LIMIT {3}". \
                format(name, https, sort, count)
        else:
            command = "SELECT * FROM {0} WHERE anonymity='{1}' AND https='{2}' ORDER BY {3} LIMIT {4}". \
                format(name, anonymity, https, sort, count)

        result = sql.query(command)
        data = [{
            'ip': item[1],
            'port': item[2],
            'speed': item[6],
        } for item in result]
        data = json.dumps(data, indent=4)
        return data
    except Exception:
        pass
    return []
def handle(self, *args, **options):
    reload(sys)
    sys.setdefaultencoding('utf-8')
    os.chdir(sys.path[0])

    spargs = utils.arglist_to_dict(options['spargs'])

    if not os.path.exists('log'):
        os.makedirs('log')

    configure_logging(install_root_handler=False)
    logging.basicConfig(filename='log/%s.log' % spargs.get('user_id'),
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.ERROR)

    guid = spargs.get('guid', '0')
    user_id = spargs.get('user_id', '0')
    logging.warning('user_id:%s' % user_id)

    if guid == '0' or user_id == '0':
        utils.log('分析数据传入参数不对,接收到的参数为: spargs:%s' % spargs)
        utils.push_redis(guid=guid, user_id=user_id,
                         info='分析数据传入参数不对,接收到的参数为:%s' % spargs)
        utils.push_redis(guid=guid, user_id=user_id, info='finish')
        return

    utils.log('开始分析:%s' % spargs)

    sql = SqlHelper()
    red = redis.StrictRedis(host=config.redis_host, port=config.redis_part,
                            db=config.redis_db, password=config.redis_pass)
    spargs['sql'] = sql
    spargs['red'] = red

    # run the spider
    logging.warning(spargs)
    runspider(spargs)

    # start the analysis
    logging.warning(spargs)
    analysis = RealTimeAnalysis(**spargs)
    analysis.run()
def __init__(self, *a, **kwargs):
    super(AssetStoreSpider, self).__init__(*a, **kwargs)

    # directory that downloaded plugin data is stored in
    self.dir_plugins = 'Plugins/'
    self.dir_all = self.dir_plugins + 'all'
    utils.make_dir(self.dir_plugins)
    utils.make_dir(self.dir_all)

    # list of all plugins
    self.plugin_list = []

    self.sql = SqlHelper()
    self.table_name = config.assetstore_table_name
    self.priority_adjust = 2

    # unity version
    self.unity_version = ''

    # request headers
    self.headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'www.assetstore.unity3d.com',
        'Referer': 'https://www.assetstore.unity3d.com/en/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
                      'Firefox/50.0',
        'X-Kharma-Version': self.unity_version,
        'X-Requested-With': 'UnityAssetStore',
        'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
    }

    self.init()
if __name__ == '__main__':
    if not os.path.exists('log'):
        os.makedirs('log')
    if not os.path.exists('temp'):
        os.makedirs('temp')

    reload(sys)
    sys.setdefaultencoding('utf-8')

    logging.basicConfig(filename='log/job.log',
                        format='%(levelname)s %(asctime)s: %(message)s',
                        level=logging.DEBUG)

    sql = SqlHelper()
    red = redis.StrictRedis(host='localhost', port=6379, db=10)
    init()

    wx = MyWXBot()
    t1 = threading.Thread(target=wx.run_wx)
    t2 = threading.Thread(target=wx.user_query_job)
    t3 = threading.Thread(target=wx.crawl_boss_job)
    t4 = threading.Thread(target=wx.crawl_lagou_job)
    t5 = threading.Thread(target=wx.crawl_liepin_job)
    t1.start()
    t2.start()
    t3.start()
    t4.start()
    t5.start()
class GameUrls(Spider):
    name = 'game_urls'

    start_urls = [
        'http://store.steampowered.com/search/?sort_by=Released_DESC&page=%s' % n
        for n in range(1, 1058)
    ]

    def __init__(self, *a, **kw):
        super(GameUrls, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_game)

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`type` CHAR(10) NOT NULL,"
                   "`name` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`is_crawled` CHAR(5) DEFAULT 'no',"
                   "`page` INT(5) NOT NULL,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_urls_table))
        self.sql.create_table(command)

    def start_requests(self):
        for i, url in enumerate(self.start_urls):
            yield Request(
                url=url,
                headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'store.steampowered.com',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) '
                                  'Gecko/20100101 Firefox/51.0',
                },
                meta={
                    'url': url,
                    'page': i + 1,
                },
                dont_filter=True,
                callback=self.parse_all,
                errback=self.error_parse,
            )

    def parse_all(self, response):
        # file_name = '%s/%s.html' % (self.dir_game, response.meta.get('page'))
        # self.save_page(file_name, response.body)

        self.log('parse_all url:%s' % response.url)

        game_list = response.xpath('//div[@id="search_result_container"]/div[2]/a').extract()
        for game in game_list:
            sel = Selector(text=game)
            url = sel.xpath('//@href').extract_first()
            id, type = self.get_id(url)
            # id = sel.xpath('//@data-ds-appid').extract_first()
            name = sel.xpath('//div[@class="col search_name ellipsis"]/span/text()').extract_first()

            msg = (None, type, name, url, 'no', response.meta.get('page'))
            command = ("INSERT IGNORE INTO {} "
                       "(id, type, name, url, is_crawled, page) "
                       "VALUES(%s, %s, %s, %s, %s, %s)".format(config.steam_game_urls_table))
            self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        if '/sub/' in url:
            pattern = re.compile(r'/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile(r'/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile(r'/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile(r'/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            return id.group(1), type

        utils.log('get_id error url:%s' % url)
        return 0, 'error'

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
class Validator(Spider):
    name = 'base'

    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)

        self.sql = SqlHelper()
        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10
        self.urls = []
        self.headers = None
        self.success_mark = ''
        self.is_record_web_page = False

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {
            'CONCURRENT_REQUESTS': cls.concurrent_requests,
            'RETRY_ENABLED': cls.retry_enabled,
        }, priority='spider')

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, config.httpbin_table)

        ids = utils.get_table_ids(self.sql, self.name)
        ids_free = utils.get_table_ids(self.sql, config.httpbin_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else config.httpbin_table
            id = ids[i] if i < count else ids_free[i - len(ids)]

            proxy = utils.get_proxy_info(self.sql, table, id)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time': cur_time,
                        'download_timeout': self.timeout,
                        'proxy_info': proxy,
                        'table': table,
                        'id': proxy.get('id'),
                        'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                        'vali_count': proxy.get('vali_count', 0),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('success_parse speed:%s meta:%s' %
                  (time.time() - response.meta.get('cur_time'), response.meta))

        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')
        id = response.meta.get('id')
        ip = proxy.get('ip')

        self.save_page(ip, response.body)

        # `is` compares identity, not content; use == for the string check
        if self.success_mark in response.body or self.success_mark == '':
            speed = time.time() - response.meta.get('cur_time')
            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    vali_count = response.meta.get('vali_count', 0) + 1
                    command = utils.get_update_data_command(table, id, speed, vali_count)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'), proxy.get('country'),
                           proxy.get('anonymity'), proxy.get('https'), speed,
                           proxy.get('source'), None, 1)
                    self.sql.insert_data(command, msg, commit=True)
        else:
            # no success mark found, so the response is bogus; delete this ip
            # from the current table
            if table == self.name:
                command = utils.get_delete_data_command(table, id)
                self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse value:%s url:%s meta:%s' %
                  (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO... a failed validation should be handled according to the
            # specific error type, e.g.:
            #
            # # log all errback failures; to do something special for some
            # # errors, you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)
            pass

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'), ip=ip)
        utils.log('filename:%s' % filename)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
class RecipeDetail(CrawlSpider):
    name = "recipe_detail"

    base_url = 'https://www.xiachufang.com'

    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Connection': 'keep-alive',
        'Host': 'www.xiachufang.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 '
                      'Firefox/51.0',
    }

    def __init__(self, *a, **kw):
        super(RecipeDetail, self).__init__(*a, **kw)

        self.dir_name = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()
        utils.make_dir(self.dir_name)

    def init(self):
        command = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` INT(8) NOT NULL AUTO_INCREMENT,"
            "`name` CHAR(20) NOT NULL COMMENT 'recipe name',"
            "`recipe_id` INT(12) NOT NULL COMMENT 'recipe ID',"
            "`source_name` CHAR(20) NOT NULL COMMENT 'source name',"
            "`source_id` INT(8) NOT NULL COMMENT 'source ID',"
            "`create_time` DATETIME NOT NULL,"
            "PRIMARY KEY(id)"
            ") ENGINE=InnoDB".format(config.item_detail_table)
        )
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {}".format(config.item_list_table)
        data = self.sql.query(command)

        for i, recipe in enumerate(data):
            if recipe[0] > 8999 and recipe[0] < 10000:
                url = self.base_url + recipe[2]
                utils.log(url)
                yield Request(
                    url=url,
                    headers=self.header,
                    callback=self.parse_all,
                    errback=self.error_parse,
                    meta={'re_id': recipe[3], 're_name': recipe[1]},
                    dont_filter=True,
                )

    def parse_all(self, response):
        utils.log(response.url)
        if response.status == 429:
            raise CloseSpider('Too many requests, IP banned')

        if response.status == 200:
            file_name = '%s/recipe.html' % self.dir_name
            self.save_page(file_name, response.body)

            sources = response.xpath("//div[@class='ings']//tr").extract()
            for source in sources:
                sel = Selector(text=source)
                source_name = sel.xpath("//a/text()").extract_first()
                url = sel.xpath("//a/@href").extract_first()
                if source_name is not None and url is not None:
                    source_id = url.split('/')[-2]
                    r_name = response.meta['re_name']
                    r_id = response.meta['re_id']
                    dt = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                    msg = (None, r_name, r_id, source_name, source_id, dt)
                    command = ("INSERT IGNORE INTO {} "
                               "(id, name, recipe_id, source_name, source_id, create_time) "
                               "VALUES(%s, %s, %s, %s, %s, %s)".format(config.item_detail_table))
                    self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)
# -*- coding: utf-8 -*-

import sys
import time

from selenium import webdriver
from scrapy import Selector

from sqlhelper import SqlHelper

sql = SqlHelper()

# command = (
#     "CREATE TABLE IF NOT EXISTS {} ("
#     "`id` CHAR(10) NOT NULL UNIQUE,"
#     "`name` CHAR(10) NOT NULL,"
#     "PRIMARY KEY(name)"
#     ") ENGINE=InnoDB".format('liepin_city_id'))
# sql.create_table(command)
#
# reload(sys)
# sys.setdefaultencoding('utf-8')
#
# url = 'https://www.liepin.com/zhaopin/?sfrom=click-pc_homepage-centre_searchbox-search_new&key=python'
#
# driver = webdriver.PhantomJS()
# driver.get(url=url)
# driver.save_screenshot('liepin.png')
# with open('liepin.html', 'w') as f:
#     f.write(driver.page_source)