def determine_level(self, response):
    """
    Determine the index level of the current response, so we can decide
    whether to continue crawling or not.
    level 1: people/[a-z].html
    level 2: people/[A-Z]\d+.html
    level 3: people-[a-zA-Z0-9-]+
    level 4: search page, pub/dir/.+
    level 5: profile page
    """
    import re
    url = response.url
    if re.match(".+/[a-z]\.html", url):
        return 1
    elif re.match(".+/[A-Z]\d+\.html", url):
        return 2
    elif re.match(".+/people-[a-zA-Z0-9-]+", url):
        return 3
    elif re.match(".+/pub/dir/.+", url):
        return 4
    elif re.match(".+/search/.+", url):
        return 4
    elif re.match(".+/pub/.+", url):
        return 5
    log.msg("Crawl cannot determine the url's level: " + url)
    return None
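A minimal usage sketch of determine_level (assumptions: the method only reads `response.url`, so a bare stand-in object is enough; `FakeResponse`, the spider instance, and the sample URLs are illustrative, not part of the original code):

# Illustrative only: FakeResponse is a hypothetical stand-in for a Scrapy response.
class FakeResponse(object):
    def __init__(self, url):
        self.url = url

# assuming `spider` is an instance of the class that defines determine_level:
# spider.determine_level(FakeResponse("http://www.linkedin.com/directory/people/a.html"))  # -> 1
# spider.determine_level(FakeResponse("http://www.linkedin.com/pub/dir/john/smith"))       # -> 4
# spider.determine_level(FakeResponse("http://www.linkedin.com/pub/john-smith/1/2b3/a45")) # -> 5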
def read_datas(row):
    data = {
        'partNumber': row['件号'.encode('utf-8')],
        'name': row['名称'.encode('utf-8')],
        'category': row['类别'.encode('utf-8')],
        # 'minStock': row['最低库存'.encode('utf-8')],
        'unit': row['单位'.encode('utf-8')],
        'applicableModel': "运5B(D)",
    }
    cate_list = ['一般航材', '工装设备', '消耗品', '化工品', '时控件', '时寿件']
    # Part number and name are mandatory.
    if not data['name'] or not data['partNumber']:
        logging.warning("件号或名称没有。 件号:%s,名称:%s"
                        % (data['partNumber'], data['name'].decode("utf-8")))
        return None
    # The category must be one of the known material categories.
    if data['category'].decode("utf-8") not in cate_list:
        logging.warning("%s的航材类别有误." % data['name'].decode("utf-8"))
        return None
    if row['最低库存'.encode('utf-8')]:
        # convert before comparing; the raw cell is a string
        min_stock = int(row['最低库存'.encode('utf-8')])
        if min_stock <= 0:
            logging.warning("航材(%s)的最低库存应大于0" % data['name'].decode("utf-8"))
            return None
        data['minStock'] = min_stock
    data['statusName'] = data['auditStatus'] = InitialState
    return data
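A brief sketch of how a row like this might be produced and fed to read_datas (assumptions: the source is a UTF-8 CSV read under Python 2, where csv.DictReader keys each row by the raw UTF-8 bytes of the header row, which is why the function indexes with '件号'.encode('utf-8'); the file name and helper are illustrative):

# Hypothetical caller, illustrative only.
import csv

def load_rows(path='airmaterial.csv'):  # assumed, illustrative file name
    with open(path, 'rb') as f:
        for row in csv.DictReader(f):
            data = read_datas(row)
            if data is not None:
                yield data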
def parsePage(self, response):
    sel = Selector(response)
    try:
        try:
            # Number of pages per hotel
            page_list = len(
                sel.xpath('/html/body/div/div/div/div[4]/div/div[1]/text()'))
            page = str(
                sel.xpath('/html/body/div/div/div/div[4]/div/div[1]/a['
                          + str(page_list - 2) + ']/span/text()')
            ).split(' ')[2].split('\'')[1]
            print "page_list!!!"
            print page_list - 2
            print page
        except:
            page = 0
        # Request every review page for this hotel by rewriting the
        # currentPage parameter in the query string.
        for key in range(0, int(page)):
            link = response.url.replace(
                urlparse(response.url)[4].split('&')[4],
                'currentPage=' + str(key))
            print urlparse(response.url)[4].split('&')[4]
            yield Request(url=link, callback=self.parseReview)
            print 'sleep 5 secs'
            time.sleep(5)
    except:
        log.msg("Page Error !!!!! " + response.url, level=log.WARNING)
def parse(self, response):
    """
    Default parse method; the rule is not useful now.
    """
    # import pdb; pdb.set_trace()
    response = response.replace(
        url=HtmlParser.remove_url_parameter(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    log.msg("Parse: index level:" + str(index_level))
    if index_level in [1, 2, 3, 4]:
        # Index pages: save them and follow their links.
        self.save_to_file_system(index_level, response)
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                log.msg('yield process, url:' + url)
                yield Request(url, callback=self.parse)
    elif index_level == 5:
        # Profile pages: extract the person profile item.
        personProfile = HtmlParser.extract_person_profile(hxs)
        linkedin_id = self.get_linkedin_id(response.url)
        linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
        if linkedin_id:
            personProfile['_id'] = linkedin_id
            personProfile['url'] = UnicodeDammit(response.url).markup
            yield personProfile
def parsePage(self, response):
    sel = Selector(response)
    try:
        try:
            # Number of pages per hotel
            page_list = len(
                sel.xpath('/html/body/div/div/div/div[4]/div/div[1]/text()'))
            page = str(
                sel.xpath('/html/body/div/div/div/div[4]/div/div[1]/a['
                          + str(page_list - 2) + ']/span/text()')
            ).split(' ')[2].split('\'')[1]
            print "page_list!!!"
            print page_list - 2
            print page
            if int(page) >= 74:
                # Hotels with 74 or more review pages are recorded for a later check.
                print "get!"
                check.append(response.meta['hotel'])
                con = json.dumps(check, ensure_ascii=False).encode('utf8')
                print con
                # use a context manager so the file is always closed
                with open('check', 'r+') as f:
                    f.write(con)
        except:
            page = 0
            print "let page == 0"
        print 'sleep 5 secs'
        time.sleep(5)
    except:
        log.msg("Page Error !!!!! " + response.url, level=log.WARNING)
def process_request(self, request, spider):
    # TODO implement complex proxy providing algorithm
    if self.use_proxy(request):
        p = random.choice(PROXIES)
        try:
            request.meta['proxy'] = "http://%s" % p['ip_port']
        except Exception as e:
            # the stdlib logging module has no msg(); log at CRITICAL level
            logging.critical("Exception %s" % e)
def process_exception(self, request, exception, spider):
    # print '----', exception
    # print request.meta['handle_httpstatus_all']
    # print dir(request)
    # if 'proxy' in request.meta.keys():
    # use .get() so a request that never had a proxy assigned does not raise KeyError
    proxy = request.meta.get('proxy')
    log.msg('message:%s,url:(%s),failed proxy <%s>'
            % (exception.message, request, proxy))
def process_request(self, request, spider):
    # TODO implement complex proxy providing algorithm
    if self.use_proxy(request):
        p = random.choice(PROXIES)
        try:
            request.meta['proxy'] = "http://%s" % p['ip_port']
        except Exception as e:
            # scrapy.log takes `level`, not `_level`
            log.msg("Exception %s" % e, level=log.CRITICAL)
def handle(self, signum, frame):
    self.exitCode = const.INVALID_EXIT + signum
    msg = "Received signal %s: '%s'; exiting with code %s" % (
        signum, const.signalLookup[signum], self.exitCode)
    # XXX do a terminal write here
    print "\n" + msg
    log = registry.getLogger()
    log.msg(msg)
    sys.exit(self.exitCode)
def __call__(self, values):
    if self.__req_fields is None:
        return values
    out_value = []
    for v in arg_to_iter(values):
        # keep the value only if every required field is present
        if all(key.lower() in v.keys() for key in self.__req_fields):
            out_value.append(v)
        else:
            log.msg("Failed to validate %s => %s" % (v, self.__req_fields),
                    level=log.CRITICAL)
    return out_value
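A brief usage sketch of this required-fields processor (assumptions: the surrounding class stores the required field list on a private attribute via its constructor, which is not shown above; the class name, constructor argument, and sample dicts below are illustrative only):

# Illustrative only: a minimal stand-in class with the same filtering logic.
from scrapy.utils.misc import arg_to_iter

class RequiredFields(object):                  # hypothetical name
    def __init__(self, req_fields=None):
        self._req_fields = req_fields

    def __call__(self, values):
        if self._req_fields is None:
            return values
        return [v for v in arg_to_iter(values)
                if all(k.lower() in v for k in self._req_fields)]

# RequiredFields(['name', 'price'])([{'name': 'foo', 'price': 1}, {'name': 'bar'}])
# -> [{'name': 'foo', 'price': 1}]   (the second dict is dropped: missing 'price')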
def process_item(self, item, spider):
    if self.__get_uniq_key() is None:
        # no unique key configured: always insert a new document
        self.collection.insert(dict(item))
    else:
        # upsert on the configured unique key
        self.collection.update(
            {self.__get_uniq_key(): item[self.__get_uniq_key()]},
            dict(item),
            upsert=True)
    log.msg("Item wrote to MongoDB database %s/%s"
            % (settings['MONGODB_DB'], settings['MONGODB_COLLECTION']),
            level=log.DEBUG, spider=spider)
    return item
def __call__(self, values):
    out_values = []
    for v in arg_to_iter(values):
        if isinstance(v, (str, unicode)):
            try:
                out_values.append(
                    dateutil.parser.parse(str(v), fuzzy=True).strftime(self.format))
            except:
                log.msg('Failed to convert datetime string: "%s"' % v,
                        level=log.WARNING)
                out_values.append(None)
        elif isinstance(v, datetime):
            out_values.append(v.strftime(self.format))
        else:
            # assumes any other value is acceptable to the datetime constructor
            out_values.append(datetime(v).strftime(self.format))
    return out_values
def process_item(self, item, spider):
    valid = True
    for data in item:
        if not data:
            valid = False
            raise DropItem("Missing {0}!".format(data))
    # if valid:
    #     self.collection.insert(dict(item))
    self.collection.update({'headline': item['headline']}, dict(item), upsert=True)
    # the stdlib logging module has no msg(); use debug() instead
    logging.debug("Article added to collection!")
    return item
def process_item(self, item, spider):
    valid = True
    for data in item:
        # here we only check if the data is not null
        # but we could do any crazy validation we want
        if not data:
            valid = False
            raise DropItem("Missing %s of blogpost from %s" % (data, item['url']))
    if valid:
        self.collection.insert(dict(item))
        log.msg("Item wrote to MongoDB database %s/%s"
                % (settings.MONGODB_DB, settings.MONGODB_COLLECTION),
                level=log.DEBUG, spider=spider)
    return item
def inner_page(self, response):
    log.msg(response.url)
    res = Selector(response)
    item = WebspiderItem()
    if not res.xpath('//h1[@class="sight_info_name"]/@title'):
        # page did not render the sight title: re-request it and stop here
        yield Request(url=response.url, dont_filter=True)
        return
    item["detil_title"] = res.xpath(
        '//h1[@class="sight_info_name"]/@title').extract()[0]
    title_ticket = res.xpath(
        '//h3[@class="ticket_item_title ticket_item_title_mainpage"]/text()'
    ).extract()
    price = res.xpath(
        '//em[@class="txt_orange"]/strong[not(@style)]/text()').extract()
    if title_ticket and price:
        item["ticket_title"] = ",".join(
            ["--".join(k) for k in zip(title_ticket, price)])
    else:
        item["ticket_title"] = ""
    item["introduce"] = "".join(res.xpath(
        '//div[@class="intro_item_des"]/div[@class="module_des_content"]/p/text()'
    ).extract())
    yield item
def process_exception(self, request, exception, spider):
    log.msg("Catch a Exception: ******%s******" % repr(exception), level=log.INFO)
    log.msg("request is %s" % request.url, level=log.INFO)
    if 'proxy' in request.meta:
        proxy = request.meta.pop('proxy')
        if proxy:
            # move the unreachable proxy from the live pool to the bad list
            self.proxies.remove({'ip_port': proxy})
            self.bad.append({'ip_port': proxy})
            log.msg("Proxy %s cannot ******REACHED******, remove it." % proxy,
                    level=log.INFO)
    log.msg("Retry the request %s." % request.url, level=log.INFO)
    return request
def _retry(self, request, reason, spider):
    retries = request.meta.get('retry_times', 0) + 1
    if retries <= self.max_retry_times:
        log.msg(format="Retrying %(request)s "
                       "(failed %(retries)d times): %(reason)s",
                level=log.DEBUG, spider=spider, request=request,
                retries=retries, reason=reason)
        retryreq = request.copy()
        retryreq.meta['retry_times'] = retries
        retryreq.dont_filter = True
        # our priority setup is different from super
        retryreq.meta['priority'] = retryreq.meta['priority'] - 10
        return retryreq
    else:
        log.msg(format="Gave up retrying %(request)s "
                       "(failed %(retries)d times): %(reason)s",
                level=log.DEBUG, spider=spider, request=request,
                retries=retries, reason=reason)
def process_item(self, item, spider):
    # `is not` compares identity, not equality; use != for the name check
    if spider.name != 'movieContent':
        return item
    try:
        logging.info("[+] " + item["MovieName"])
        self.batch.put(
            item['MovieName'], {
                "Movie:PostUrl": item['PostUrl'],
                'Movie:Director': item['Director'],
                "Movie:ReleaseTime": item['ReleaseTime'],
                "Movie:Area": item['Area'],
                "Movie:Performers": item['Performers']
            })
        self.batch.send()
    except:
        logging.error("[-] %s Failed." % item["MovieName"])
    finally:
        self.conn.close()
    return item
def process_request(self, request, spider):
    print 'process_request'
    # TODO implement complex proxy providing algorithm
    if len(self.proxies) < self.MIN_NUM_PROXY:
        # top up the proxy pool when it runs low
        log.msg("The volume of proxy-pool is ******LOW******. Now reload proxy-pool.",
                level=log.INFO)
        self.proxies = self.__getproxies__()
        log.msg("Reload proxies successfully. Now there are %s proxies."
                % len(self.proxies), level=log.INFO)
    p = random.choice(self.proxies)
    try:
        request.meta['proxy'] = p['ip_port']
        log.msg("Add proxy %s" % p['ip_port'], level=log.DEBUG)
    except Exception as e:
        # scrapy.log takes `level`, not `_level`
        log.msg("Exception %s" % e, level=log.CRITICAL)
def process_item(self, item, spider):
    book_detail = {
        'book_name': item.get('book_name'),
        'alias_name': item.get('alias_name', []),
        'author': item.get('author', []),
        'book_description': item.get('book_description', ''),
        'book_covor_image_path': item.get('book_covor_image_path', ''),
        'book_covor_image_url': item.get('book_covor_image_url', ''),
        'book_download': item.get('book_download', []),
        'book_file_url': item.get('book_file_url', ''),
        'book_file': item.get('book_file', ''),
        'original_url': item.get('original_url', ''),
        'update_time': datetime.datetime.utcnow(),
    }
    result = self.db['book_detail'].insert(book_detail)
    item["mongodb_id"] = str(result)
    log.msg("Item %s wrote to MongoDB database %s/book_detail"
            % (result, self.MONGODB_DB),
            level=log.DEBUG, spider=spider)
    return item
def close_spider(self, spider, reason):
    if self._dump:
        log.msg("Dumping Scrapy stats:\n" + pprint.pformat(self.get_stats()),
                spider=spider)
    self._persist_stats(self.get_stats(), spider)
def read_datas(row):
    data = {
        'category': row['类型'.encode('utf-8')],
        'partNumber': row['件号'.encode('utf-8')],
        'serialNum': row['序号'.encode('utf-8')],
        'name': row['名称'.encode('utf-8')],
        'unit': row['单位'.encode('utf-8')],
        # 'flyTime': float(row['飞行小时'.encode('utf-8')]),
        # 'engineTime': float(row['发动机小时'.encode('utf-8')]),
        # 'flightTimes': int(row['起落架次'.encode('utf-8')]),
        'applicableModel': "运5B(D)",
        'storehouse': row['仓库'.encode('utf-8')],
        'minStock': row['最低库存'.encode('utf-8')],
        'shelf': row['架位'.encode('utf-8')],
        'effectiveDate': row['库存有效期'.encode('utf-8')],
        'certificateNum': row['证书编号'.encode('utf-8')],
        'airworthinessTagNum': row['适航标签号'.encode('utf-8')],
        'lastCheckDate': row['上次检查日期'.encode('utf-8')],
        'nextCheckDate': row['下次检查日期'.encode('utf-8')],
        'manufacturer': row['生产厂商'.encode('utf-8')],
        'supplier': row['供应商'.encode('utf-8')],
    }
    # Optional numeric columns are only set when the cell is non-empty.
    if row['数量'.encode('utf-8')]:
        data['quantity'] = int(row['数量'.encode('utf-8')])
    if row['冻结数量'.encode('utf-8')]:
        data['freezingQuantity'] = int(row['冻结数量'.encode('utf-8')])
    if row['起落架次'.encode('utf-8')]:
        data['flightTimes'] = int(row['起落架次'.encode('utf-8')])
    if row['飞行小时'.encode('utf-8')]:
        data['flyTime'] = row['飞行小时'.encode('utf-8')]
    if row['发动机小时'.encode('utf-8')]:
        data['engineTime'] = row['发动机小时'.encode('utf-8')]
    # Part number and name are mandatory.
    if not data['name'] or not data['partNumber']:
        logging.warning("名称(%s)或件号(%s)不存在。" % (
            data['name'].decode('utf-8'), data['partNumber']))
        return None
    # Minimum stock, when given, must be a positive number.
    if data['minStock'] and int(data['minStock']) <= 0:
        logging.warning("航材(%s)的最低库存应大于0" % data['name'].decode("utf-8"))
        return None
    # When both part number and serial number are present, quantity must be 1.
    if data['partNumber'] and data['serialNum'] and data.get('quantity') != 1:
        logging.warning("件号和序号都存在时,数量必须为1。第%s条数据"
                        % row['序号1'.encode('utf-8')])
        return None
    # Frozen quantity must be at least 0 and no greater than the quantity.
    if data.get('freezingQuantity', 0) < 0 or \
            data.get('freezingQuantity', 0) > data.get('quantity', 0):
        logging.warning("冻结数量应至少为0,且不大于数量。第%s条数据"
                        % row['序号1'.encode('utf-8')])
        return None
    if data['lastCheckDate']:
        try:
            date = datetime.strptime(data['lastCheckDate'], "%Y-%m-%d")
        except Exception as e:
            logging.warning("lastCheckDate is wrong. number:%s"
                            % row['序号1'.encode('utf-8')])
            return None
    if data['nextCheckDate']:
        try:
            date = datetime.strptime(data['nextCheckDate'], "%Y-%m-%d")
        except Exception as e:
            logging.warning("nextCheckDate is wrong. number:%s"
                            % row['序号1'.encode('utf-8')])
            return None
    # The stock row must match an existing air-material category record.
    am = AirmaterialCategory.query.filter(
        AirmaterialCategory.partNumber == data['partNumber'],
        AirmaterialCategory.category == data['category'],
        AirmaterialCategory.name == data['name']).first()
    if not am:
        logging.warning("该库存对应的航材不存在或库存的名称或类型不对应. 件号:%s名称:%s" % (
            data['partNumber'], data['name'].decode("utf-8")))
        return None
    return data
def spider_closed(spider):
    log.msg('Spider closed: %s' % spider, level=log.INFO)
    print results
def parseReview(self, response):
    sel = Selector(response)
    review_list = []
    hotel_overview = {}
    # hotel profile
    hotel_url = sel.xpath('/html/body/div/div/div/div[2]/a[1]/@href').extract()
    hotel_overview['url'] = 'http://hotels.ctrip.com' + str(hotel_url[0].split('_')[0])
    hotel_overview['total_overall_rating'] = sel.xpath(
        '/html/body/div/div/div/div[1]/div[1]/span[2]/span/text()').extract()[0].strip()
    hotel_overview['per_recomment'] = sel.xpath(
        '/html/body/div/div/div/div[1]/div[1]/span[3]/span/text()').extract()[0].strip()
    # Review counts broken down by traveller type.
    hotel_overview['for_biz'] = re.findall(
        r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[1]/span/text()').extract()[0].strip())[0]
    hotel_overview['for_friend'] = re.findall(
        r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[2]/span/text()').extract()[0].strip())[0]
    hotel_overview['for_couple'] = re.findall(
        r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[3]/span/text()').extract()[0].strip())[0]
    hotel_overview['for_family'] = re.findall(
        r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[4]/span/text()').extract()[0].strip())[0]
    hotel_overview['for_single'] = re.findall(
        r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[5]/span/text()').extract()[0].strip())[0]
    hotel_overview['for_agent'] = re.findall(
        r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[6]/span/text()').extract()[0].strip())[0]
    hotel_overview['for_others'] = re.findall(
        r'\d+', sel.xpath('//*[@id="comment_statistics"]/a[7]/span/text()').extract()[0].strip())[0]
    # Average aspect ratings.
    hotel_overview['avg_location'] = sel.xpath(
        '/html/body/div/div/div/div[1]/div[3]/p[1]/span/text()').extract()[0].strip()
    hotel_overview['avg_facility'] = sel.xpath(
        '/html/body/div/div/div/div[1]/div[3]/p[2]/span/text()').extract()[0].strip()
    hotel_overview['avg_service'] = sel.xpath(
        '/html/body/div/div/div/div[1]/div[3]/p[3]/span/text()').extract()[0].strip()
    hotel_overview['avg_clean'] = sel.xpath(
        '/html/body/div/div/div/div[1]/div[3]/p[4]/span/text()').extract()[0].strip()
    hotel_overview['all_comment'] = re.findall(
        r'\d+', sel.xpath('//*[@id="All_Commnet"]/text()').extract()[0].strip())[0]
    hotel_overview['recomment'] = re.findall(
        r'\d+', sel.xpath('//*[@id="Recomment"]/text()').extract()[0].strip())[0]
    hotel_overview['no_recomment'] = re.findall(
        r'\d+', sel.xpath('//*[@id="No_Recoment"]/text()').extract()[0].strip())[0]
    review_list.append(dict(hotel_overview))
    try:
        # Number of reviews per page
        num = len(sel.xpath('/html/body/div/div/div/div[3]/text()'))
        # Hotel Profile
        for flag in xrange(1, num):
            # Review
            item = hotelReview()
            print flag
            author = sel.xpath('/html/body/div/div/div/div[3]/div['
                               + str(flag) + ']/div[1]/p[2]/text()').extract()
            user_type = sel.xpath('/html/body/div/div/div/div[3]/div['
                                  + str(flag) + ']/div[1]/p[1]/@title').extract()
            date = sel.xpath('/html/body/div/div/div/div[3]/div['
                             + str(flag) + ']/p/span[3]/a/text()').extract()
            room_type = sel.xpath('/html/body/div/div/div/div[3]/div['
                                  + str(flag) + ']/div[1]/p[3]/text()').extract()
            review_overall_rating = sel.xpath('/html/body/div/div/div/div[3]/div['
                                              + str(flag) + ']/p/span[2]/span/text()').extract()
            review_aspect_rating = sel.xpath('/html/body/div/div/div/div[3]/div['
                                             + str(flag) + ']/p/span[1]/@data-value').extract()
            helpful = sel.xpath('/html/body/div/div/div/div[3]/div['
                                + str(flag) + ']/div[2]/a/span/text()').extract()
            review = sel.xpath('/html/body/div/div/div/div[3]/div['
                               + str(flag) + ']/div[2]/text()').extract()
            # print str(response.body).decode('GB2312').encode('utf8')
            filename = response.url.split('?')[1].split('&')[1].split('=')[1]
            print 'HIIIIIIIIIII'
            print filename
            # item is an object
            item['author'] = author[0].strip()
            item['user_type'] = user_type[0].strip()
            item['date'] = date[0].strip()
            item['room_type'] = room_type[0].strip()
            item['review_overall_rating'] = review_overall_rating[0].strip()
            # review_aspect_rating looks like:
            # ["", " ", "卫生:5", " ", "服务:5", " ", "设施:5", " ", "位置:5\r\n", ...]
            item['location'] = re.findall(
                r'\d+', review_aspect_rating[0].strip().split(',')[0])[0]
            item['facility'] = re.findall(
                r'\d+', review_aspect_rating[0].strip().split(',')[1])[0]
            item['service'] = re.findall(
                r'\d+', review_aspect_rating[0].strip().split(',')[2])[0]
            item['clean'] = re.findall(
                r'\d+', review_aspect_rating[0].strip().split(',')[3])[0]
            item['review'] = review[0].strip()
            item['helpful'] = re.findall(r'\d+', helpful[0].strip())[0]
            review_list.append(dict(item))
            print review_list
        # Write the file like the pipe
        con = json.dumps(review_list, ensure_ascii=False).encode('utf8')
        self.writeAppendFile(filename, con)
    except:
        log.msg("Review Error !!!!" + response.url, level=log.WARNING)
def get_linkedin_id(self, url):
    find_index = url.find("www.linkedin.com/")
    if find_index >= 0:
        # strip the leading "www.linkedin." (13 characters) and build an id
        # from the remainder of the url
        linkedin_id = url[find_index + 13:].replace('/', '-')
        log.msg("%s -> %s" % (url, linkedin_id))
        return linkedin_id
    return None
def process_request(self, request, spider):
    # rotate the User-Agent header for every request
    agent = random.choice(AGENTS)
    request.headers['User-Agent'] = agent
    log.msg("Add agent %s" % agent, level=log.DEBUG)
def process_response(self, request, response, spider):
    log.msg("Response Status code is : %s." % repr(response.status), level=log.INFO)
    log.msg("Response Headers is :\n %s." % repr(response.headers), level=log.INFO)
    if response.status == 302 and "Location" in response.headers:
        # redirect to an anti-crawler page: retire the proxy and retry
        proxy = request.meta.pop('proxy')
        self.proxies.remove({'ip_port': proxy})
        self.anti.append({'ip_port': proxy})
        log.msg("Proxy %s has been ******ANTIED******, remove it." % proxy, level=log.INFO)
        log.msg("Retry the request %s." % request.url, level=log.INFO)
        log.msg("Original request is %s" % request.url, level=log.INFO)
        log.msg("Response request is %s" % repr(response.request), level=log.INFO)
        return request
    elif response.status == 404:
        proxy = request.meta.pop('proxy')
        self.proxies.remove({'ip_port': proxy})
        self.anti.append({'ip_port': proxy})
        log.msg("Proxy %s is ******NOT WORK******, remove it." % proxy, level=log.INFO)
        log.msg("Retry the request %s." % request.url, level=log.INFO)
        log.msg("Original request is %s" % request.url, level=log.INFO)
        log.msg("Response request is %s" % repr(response.request), level=log.INFO)
        return request
    elif response.status in [403, ]:
        proxy = request.meta.pop('proxy')
        self.proxies.remove({'ip_port': proxy})
        self.anti.append({'ip_port': proxy})
        log.msg("Proxy %s has been ******FORBIDDEN******, remove it." % proxy, level=log.INFO)
        log.msg("Retry the request %s." % request.url, level=log.INFO)
        log.msg("Original request is %s" % request.url, level=log.INFO)
        log.msg("Response request is %s" % repr(response.request), level=log.INFO)
        return request
    elif response.status in [500, 501, 502, 503, 504, 505]:
        proxy = request.meta.pop('proxy')
        self.proxies.remove({'ip_port': proxy})
        self.anti.append({'ip_port': proxy})
        log.msg("Proxy %s is ******NOT WORK******, remove it." % proxy, level=log.INFO)
        log.msg("Retry the request %s." % request.url, level=log.INFO)
        log.msg("Original request is %s" % request.url, level=log.INFO)
        log.msg("Response request is %s" % repr(response.request), level=log.INFO)
        return request
    else:
        return response