class ChinaLoader(NewsLoader):
    """NewsLoader subclass used for configuration-driven Item extraction
    (part of a multi-level loader inheritance chain).

    Output processors: concatenate every extracted fragment into one string,
    then trim surrounding whitespace.
    """
    text_out = Compose(Join(), lambda text: text.strip())
    source_out = Compose(Join(), lambda text: text.strip())
class AskciAStockLoader(TakeLoader):
    """Loader for askci A-stock pages.

    `text` and `source` are joined into a single string and then stripped of
    leading/trailing whitespace.
    """
    text_out = Compose(Join(), lambda text: text.strip())
    source_out = Compose(Join(), lambda text: text.strip())
class FishLoader(ItemLoader):
    """Loader producing FishItem instances.

    Every input string is stripped; by default only the first extracted value
    is kept, except `description`, whose fragments are joined together.
    """
    default_item_class = FishItem
    default_input_processor = MapCompose(lambda value: value.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
class kLiveFutureItem(Item):
    """Live futures quote.

    Numeric text is normalised via `utility.strToFloatNumber` /
    `utility.strToIntNumber` (e.g. '7,500.00' style strings) and every field's
    extracted fragments are joined into one value on output.
    """
    StockIdentifier_f = Field(output_processor=Join())
    Open = Field(input_processor=MapCompose(utility.strToFloatNumber),
                 output_processor=Join())
    High = Field(input_processor=MapCompose(utility.strToFloatNumber),
                 output_processor=Join())
    Low = Field(input_processor=MapCompose(utility.strToFloatNumber),
                output_processor=Join())
    PrevClose = Field(input_processor=MapCompose(utility.strToFloatNumber),
                      output_processor=Join())
    LastTradedPrice = Field(input_processor=MapCompose(utility.strToFloatNumber),
                            output_processor=Join())
    Volume = Field(input_processor=MapCompose(utility.strToIntNumber),
                   output_processor=Join())
    Turnover = Field(input_processor=MapCompose(utility.strToFloatNumber),
                     output_processor=Join())
    UnderlyingValue = Field(input_processor=MapCompose(utility.strToFloatNumber),
                            output_processor=Join())
    AnnualisedVolatility = Field(input_processor=MapCompose(utility.strToFloatNumber),
                                 output_processor=Join())
    DailyVolatility = Field(input_processor=MapCompose(utility.strToFloatNumber),
                            output_processor=Join())
    OpenInterest = Field(input_processor=MapCompose(utility.strToIntNumber),
                         output_processor=Join())
    ChangeInOI = Field(input_processor=MapCompose(utility.strToIntNumber),
                       output_processor=Join())
    PerChangeInOI = Field(input_processor=MapCompose(utility.strToFloatNumber),
                          output_processor=Join())
def _tag_stripped_field():
    """Factory for AtobodetailItem's uniform field: strip HTML tags on input,
    join the remaining text fragments into one string on output."""
    return Field(
        input_processor=MapCompose(remove_tags),
        output_processor=Join(),
    )


class AtobodetailItem(Item):
    """Company-detail record scraped from atobo.

    Every field had the identical processor pair copy-pasted 48 times; the
    `_tag_stripped_field` factory removes the duplication while producing the
    same per-field Field instances. Field names are romanised Chinese labels
    (e.g. `zhucehao` = registration number, `farendaibiao` = legal
    representative) and must stay unchanged for downstream consumers.
    """
    c_name = _tag_stripped_field()
    url = _tag_stripped_field()
    category = _tag_stripped_field()
    hangye = _tag_stripped_field()
    quxian = _tag_stripped_field()
    ximu = _tag_stripped_field()
    mingcheng = _tag_stripped_field()
    zhuangtai = _tag_stripped_field()
    zhucehao = _tag_stripped_field()
    farendaibiao = _tag_stripped_field()
    zhuceziben = _tag_stripped_field()
    qiyeleixing = _tag_stripped_field()
    dengjijiguan = _tag_stripped_field()
    chengliriqi = _tag_stripped_field()
    yingyeqixian = _tag_stripped_field()
    jingyingfanwei = _tag_stripped_field()
    zhucedizhi = _tag_stripped_field()
    gaoguan1 = _tag_stripped_field()
    zhiwei1 = _tag_stripped_field()
    gaoguan2 = _tag_stripped_field()
    zhiwei2 = _tag_stripped_field()
    gaoguan3 = _tag_stripped_field()
    zhiwei3 = _tag_stripped_field()
    gaoguan4 = _tag_stripped_field()
    zhiwei4 = _tag_stripped_field()
    gudong1 = _tag_stripped_field()
    renjiao1 = _tag_stripped_field()
    shijiao1 = _tag_stripped_field()
    chigubili1 = _tag_stripped_field()
    gudong2 = _tag_stripped_field()
    renjiao2 = _tag_stripped_field()
    shijiao2 = _tag_stripped_field()
    chigubili2 = _tag_stripped_field()
    gudong3 = _tag_stripped_field()
    renjiao3 = _tag_stripped_field()
    shijiao3 = _tag_stripped_field()
    chigubili3 = _tag_stripped_field()
    duiwaitouzi = _tag_stripped_field()
    wangdian1 = _tag_stripped_field()
    lianxiren1 = _tag_stripped_field()
    lianxidianhua1 = _tag_stripped_field()
    shouji1 = _tag_stripped_field()
    zhuyingchanpin1 = _tag_stripped_field()
    wangdian2 = _tag_stripped_field()
    lianxiren2 = _tag_stripped_field()
    lianxidianhua2 = _tag_stripped_field()
    shouji2 = _tag_stripped_field()
    zhuyingchanpin2 = _tag_stripped_field()
def custom_field():
    """Build a generic text field: remove HTML on input, join fragments on output."""
    field = scrapy.Field(
        input_processor=MapCompose(DataUtils.remove_html),
        output_processor=Join(),
    )
    return field
class RiLab01Loader(ItemLoader):
    """Keep only the first extracted value per field; `text` fragments are
    joined into a single string instead."""
    default_output_processor = TakeFirst()
    text_out = Join()
class Century21OfficeLoader(ItemLoader):
    """Loader for Century21 office pages.

    Inputs are de-tagged and whitespace-stripped; outputs keep the first
    value. Phone numbers are additionally normalised via `serialize_number`,
    and address parts are joined with ', '.
    """
    default_input_processor = MapCompose(remove_tags, str.strip)
    default_output_processor = TakeFirst()
    officePhone_in = MapCompose(serialize_number)
    officeAddress_out = Join(', ')
class ZhipinItem(scrapy.Item):
    """Job posting scraped from Boss Zhipin (boss直聘).

    Persists to MySQL via `get_insert_sql` and to Elasticsearch via
    `save_to_es`. The SQL column list and the `params` tuple are
    position-coupled — keep them in the same order when editing.
    """
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field(input_processor=MapCompose(get_city))
    work_years = scrapy.Field(input_processor=MapCompose(get_experience))
    degree_need = scrapy.Field(input_processor=MapCompose(get_degree))
    publish_time = scrapy.Field(input_processor=MapCompose(remove_chinaese))
    job_advantage = scrapy.Field(
        input_processor=MapCompose(Null_if),
        output_processor=Join(),
    )
    job_desc = scrapy.Field(
        input_processor=MapCompose(Null_if),
        output_processor=Join(),
    )
    job_addr = scrapy.Field(
        # strip html tags, then normalise the address text
        input_processor=MapCompose(remove_tags, handle_jobaddr),
    )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field(input_processor=Join(","))
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        """Return (sql, params) for an upsert into the `job` table.

        On duplicate key only `salary` and `job_desc` are refreshed.
        `crawl_time` must be a datetime (it is formatted with
        SQL_DATETIME_FORMAT here).
        """
        insert_sql = """
            insert into job(title, url, url_object_id, salary, job_city, work_years,
                degree_need, publish_time, job_advantage, job_desc, job_addr,
                company_name, company_url, tags, crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
        """
        params = (
            self["title"],
            self["url"],
            self["url_object_id"],
            self["salary"],
            self["job_city"],
            self["work_years"],
            self["degree_need"],
            self["publish_time"],
            self["job_advantage"],
            self["job_desc"],
            self["job_addr"],
            self["company_name"],
            self["company_url"],
            self["tags"],
            self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )
        return insert_sql, params

    def save_to_es(self):
        """Index this item into Elasticsearch as a LagouType document.

        Optional fields (`create_date`, `salary`, `work_years`) are only
        copied when present; `url_object_id` becomes the document id.
        NOTE(review): the redis counter key is "jobbole_count" even though
        this is the Zhipin item — looks copy-pasted; confirm intent.
        """
        job = LagouType()
        job.title = self['title']
        if "create_date" in self:
            job.create_date = self["create_date"]
        job.url = self["url"]
        job.meta.id = self["url_object_id"]
        if "salary" in self:
            job.salary = self["salary"]
        job.job_city = self["job_city"]
        if "work_years" in self:
            job.work_years = self["work_years"]
        job.degree_need = self["degree_need"]
        job.tags = self["tags"]
        job.publish_time = self["publish_time"]
        job.job_advantage = self["job_advantage"]
        job.job_desc = self["job_desc"]
        job.job_addr = self["job_addr"]
        job.company_name = self["company_name"]
        job.company_url = self["company_url"]
        job.crawl_time = self["crawl_time"]
        # search-as-you-type suggestions weighted: title 10, tags 7
        job.suggest = gen_suggests(LagouType._doc_type.index,
                                   ((job.title, 10), (job.tags, 7)))
        job.save()
        redis_cli.incr("jobbole_count")
        return
class ChinaDigiLoader(NewsLoader):
    """NewsLoader variant: join extracted `text`/`source` fragments into one
    string, then strip surrounding whitespace."""
    text_out = Compose(Join(), lambda text: text.strip())
    source_out = Compose(Join(), lambda text: text.strip())
class newsItem(Item):
    """News item whose `description` is cleaned — tags removed, HTML entities
    and escape characters replaced, whitespace stripped — and joined into a
    single string."""
    description = Field(
        input_processor=MapCompose(remove_tags, replace_entities,
                                   replace_escape_chars, str.strip),
        output_processor=Join(),
    )
class ChinaTechLoader(NewsLoader):
    """Inherits NewsLoader.

    Join() concatenates the extracted list into one string; the lambda then
    strips leading/trailing whitespace from the result.
    """
    text_out = Compose(Join(), lambda text: text.strip())
    source_out = Compose(Join(), lambda text: text.strip())
class JobBoleArticleItem(scrapy.Item):
    """Article scraped from jobbole.com, persisted to MySQL and Elasticsearch."""

    title = scrapy.Field(
        input_processor=MapCompose(lambda x: x + "-jobbole", add_jobbole)
    )
    create_date = scrapy.Field(
        input_processor=MapCompose(date_convert),
        output_processor=TakeFirst()
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(input_processor=MapCompose(get_nums))
    comment_nums = scrapy.Field(input_processor=MapCompose(get_nums))
    fav_nums = scrapy.Field(input_processor=MapCompose(get_nums))
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        """Return (sql, params) for an upsert into `jobbole_article`.

        Fixes vs. previous revision:
        - `front_image_path(front_image_path)` was invalid SQL; now
          `front_image_path=VALUES(front_image_path)`.
        - `parise_nums=VALUES(praise_nums)` referenced a column absent from
          the insert list; now `parise_nums=VALUES(parise_nums)` (the table
          column keeps its historical misspelling, the item key is
          `praise_nums`).
        - `params` supplied only 4 values for 10 placeholders, which would
          raise at execute time; all 10 are now provided in column order.
        """
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums,
                front_image_url, front_image_path, parise_nums, comment_nums,
                tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE fav_nums=VALUES(fav_nums),
                front_image_url=VALUES(front_image_url),
                front_image_path=VALUES(front_image_path),
                content=VALUES(content),
                parise_nums=VALUES(parise_nums),
                comment_nums=VALUES(comment_nums),
                tags=VALUES(tags)
        """
        params = (
            self["title"],
            self["url"],
            self["create_date"],
            self["fav_nums"],
            self["front_image_url"],
            self["front_image_path"],
            self["praise_nums"],
            self["comment_nums"],
            self["tags"],
            self["content"],
        )
        return insert_sql, params

    def save_to_es(self):
        """Index this article into Elasticsearch as an ArticleType document.

        `url_object_id` becomes the document id; `front_image_path` is
        optional. Suggestion weights: title 10, tags 7.
        """
        article = ArticleType()
        article.title = self['title']
        article.create_date = self['create_date']
        article.content = remove_tags(self['content'])
        article.front_image_url = self['front_image_url']
        if 'front_image_path' in self:
            article.front_image_path = self['front_image_path']
        article.praise_nums = self['praise_nums']
        article.fav_nums = self['fav_nums']
        article.comment_nums = self['comment_nums']
        article.url = self['url']
        article.tags = self['tags']
        article.meta.id = self['url_object_id']
        article.suggest = gen_suggests(ArticleType._doc_type.index,
                                       ((article.title, 10), (article.tags, 7)))
        article.save()
        redis_cli.incr("jobbole_count")
        return
class EntryLoader(ItemLoader):
    """Strip each content fragment on input and join the fragments on output."""
    # NOTE(review): `unicode` exists only on Python 2; under Python 3 this
    # line raises NameError at class-definition time — confirm target runtime.
    content_in = MapCompose(unicode.strip)
    content_out = Join()
def url_field():
    """Build a URL field: strip HTML, drop trailing slashes, join fragments.

    `str.rstrip` takes a *set* of characters, so '/' is equivalent to the
    original '//' argument.
    """
    return scrapy.Field(
        input_processor=MapCompose(DataUtils.remove_html,
                                   lambda value: value.rstrip('/')),
        output_processor=Join(),
    )
def parse(self, response):
    """Parse a Blogger listing page, yielding one ThreatcollectItem per post.

    Two local helpers post-process extracted values:
    - deal_publish_time: 'Month DD, YYYY' text -> epoch-seconds string
    - deal_publisher: byline author name(s) pulled out of embedded HTML

    NOTE(review): `nexturl` (the older-posts link) is extracted at the end but
    never followed in this view — confirm pagination is handled elsewhere.
    """

    def deal_publish_time(publish_time_raw=None):
        # The extraction may have matched nothing; guard so that a missing
        # value (None / empty) is reported as None rather than crashing.
        if publish_time_raw:
            # Month-name -> two-digit month number ("mouth" is a typo for
            # "month" kept for consistency with the local variable names).
            mouth_str_dict = {
                'January': '01',
                'February': '02',
                'March': '03',
                'April': '04',
                'May': '05',
                'June': '06',
                'July': '07',
                'August': '08',
                'September': '09',
                'October': '10',
                'November': '11',
                'December': '12',
            }
            publish_mouth = publish_time_raw.split(' ')
            if str(publish_mouth[0].strip()) in mouth_str_dict.keys():
                try:
                    mouth_num_str = mouth_str_dict[str(
                        publish_mouth[0].strip())]
                    # Reassemble as 'YYYY-MM-DD'; the trailing comma from
                    # "DD," is stripped before appending the midnight time.
                    publish_time = str(publish_mouth[2].strip(
                    )) + '-' + mouth_num_str + '-' + str(publish_mouth[1])
                    publish_date = publish_time.strip(',') + ' 00:00:00'
                    time_tuple = time.strptime(publish_date,
                                               '%Y-%m-%d %H:%M:%S')
                    publish_time = time.mktime(time_tuple)
                    # Epoch seconds as a string (integer-truncated).
                    return str(int(publish_time))
                except Exception as e:
                    # NOTE(review): parse failures are only printed; the
                    # function then implicitly returns None.
                    print(e)
            else:
                # NOTE(review): on an unrecognised month this returns the raw
                # split list, not a timestamp — confirm callers expect that.
                return publish_mouth
        else:
            return None

    def deal_publisher(html_raw):
        # Wrap the raw HTML in a synthetic response so XPath can be applied.
        response_publisher = scrapy.http.HtmlResponse(
            url='thisIsJavaScript', body=str(html_raw))
        publish_user = response_publisher.xpath(
            './/span[@class="byline-author"]/text()').extract_first(
            default=None)
        # "by Alice and Bob, ..." -> list of author-name fragments.
        # NOTE(review): raises AttributeError if the span was not found
        # (publish_user is None) — confirm that cannot happen upstream.
        publish_user = publish_user.split(',')[0].split('by')[1].split(
            'and')
        print(publish_user)
        return publish_user

    # One loader per post block; processors passed to add_xpath/add_value run
    # before the item-level processors.
    for i in response.xpath('//*[@id="Blog1"]/div[@class="post"]'):
        itemloderArticle = ItemLoader(item=ThreatcollectItem(), selector=i)
        itemloderArticle.add_xpath('title', './/h2/a/text()', TakeFirst())
        itemloderArticle.add_xpath('url', './/h2/a/@href', TakeFirst())
        itemloderArticle.add_xpath(
            'publish_time',
            './/div[@class="post-header"]/div[@class="published"]/span/text()',
            Join(), deal_publish_time)
        itemloderArticle.add_xpath(
            'content',
            './/div[@class="post-body"]/div[contains(@class,"post-content")]/script/text()',
            MapCompose(remove_tags))
        itemloderArticle.add_xpath('article_id', './/@data-id')
        # Collect every src="..." attribute in the block as candidate images.
        itemloderArticle.add_value('img_urls', i.re(r'src="(.*?)"'))
        # Crawl timestamp in milliseconds.
        itemloderArticle.add_value('spider_time', time.time() * 1000)
        itemloderArticle.add_xpath(
            'publisher',
            './/div[@class="post-body"]/div[contains(@class,"post-content")]/script/text()',
            deal_publisher)
        itemloderArticle.add_value('html', i.extract())
        item1 = itemloderArticle.load_item()
        yield item1
        # yield response.follow(url=item1['url'],headers=self.headers,meta={'item':item1},callback=self.parse_item)
    nexturl = response.xpath(
        '//*[@id="Blog1_blog-pager-older-link"]/@href').extract()
def category_field():
    """Build a field that stores a category object's `.name`, joined on output."""
    return scrapy.Field(
        input_processor=MapCompose(lambda category: category.name),
        output_processor=Join(),
    )
def parse_keywords(response):
    """Extract the page's <meta name="keywords"> content into a TnaWebsiteItem.

    Each value is whitespace-split and escape characters are replaced on
    input; the resulting tokens are joined back into one string on output.
    """
    keyword_loader = ItemLoader(item=TnaWebsiteItem(), response=response)
    keyword_loader.default_input_processor = MapCompose(
        lambda value: value.split(), replace_escape_chars)
    keyword_loader.default_output_processor = Join()
    keyword_loader.add_xpath('KEYWORDS', '//meta[@name="keywords"]/@content')
    return keyword_loader.load_item()
class LagouItem(scrapy.Item):
    """Lagou job-listing item.

    NOTE(review): every field passes `Join()` as the *input* processor (it is
    more commonly an output processor) — preserved as-is; confirm intent.
    """
    position_name = scrapy.Field(input_processor=Join())
    exp_lvl = scrapy.Field(input_processor=Join())
    edu_lvl = scrapy.Field(input_processor=Join())
    position_type = scrapy.Field(input_processor=Join())
    position_id = scrapy.Field(input_processor=Join())
    position_url = scrapy.Field(input_processor=Join())
    finance_stage = scrapy.Field(input_processor=Join())
    industry_field = scrapy.Field(input_processor=Join())
    company_name = scrapy.Field(input_processor=Join())
    work_city = scrapy.Field(input_processor=Join())
    salary = scrapy.Field(input_processor=Join())
    position_advantage = scrapy.Field(input_processor=Join())
    publish_date = scrapy.Field(input_processor=Join())
    company_attr = scrapy.Field(input_processor=Join())
    skill_label = scrapy.Field(input_processor=Join())
class TakeFirstItemLoader(TestItemLoader):
    """TestItemLoader variant joining the extracted name parts with '<br>'."""
    name_out = Join("<br>")
class kLiveEquityItem(Item):
    """Live equity quote.

    Price/turnover strings like '7,500.00' are normalised to numbers by
    `utility.strToFloatNumber` (volumes via `strToIntNumber`); each field's
    fragments are joined into one value on output.
    """
    Symbol = Field(output_processor=Join())
    Open = Field(input_processor=MapCompose(utility.strToFloatNumber),
                 output_processor=Join())
    High = Field(input_processor=MapCompose(utility.strToFloatNumber),
                 output_processor=Join())
    Low = Field(input_processor=MapCompose(utility.strToFloatNumber),
                output_processor=Join())
    # PrevClose = Field(input_processor=MapCompose(utility.strToFloatNumber), output_processor=Join(),)
    LTP = Field(input_processor=MapCompose(utility.strToFloatNumber),
                output_processor=Join())
    TotalChange = Field(input_processor=MapCompose(utility.strToFloatNumber),
                        output_processor=Join())
    PerChange = Field(input_processor=MapCompose(utility.strToFloatNumber),
                      output_processor=Join())
    Volume = Field(input_processor=MapCompose(utility.strToIntNumber),
                   output_processor=Join())
    # Turnover is reported in lakhs.
    Turnover = Field(input_processor=MapCompose(utility.strToFloatNumber),
                     output_processor=Join())
    # TotalBuyQuantity = Field(input_processor=MapCompose(utility.strToIntNumber),output_processor=Join(),)
    # TotalSellQuantity = Field(input_processor=MapCompose(utility.strToIntNumber),output_processor=Join(),)
    DatabaseTableName = Field(output_processor=Join())
    TimeStamp = Field(output_processor=Join())
class TestItemLoader(ItemLoader):
    """Loader for TestItem: join the extracted name parts, then cast to float."""
    default_item_class = TestItem
    name_out = Compose(Join(), float)
class kLiveIndexItem(Item):
    """Live index quote.

    Numeric strings like '7,500.00' are normalised via
    `utility.strToFloatNumber`; fragments are joined on output.
    NOTE(review): unlike the equity/futures items, `Volume` here uses the
    float converter rather than the int one — preserved as-is; confirm intent.
    """
    Symbol = Field(output_processor=Join())
    Open = Field(input_processor=MapCompose(utility.strToFloatNumber),
                 output_processor=Join())
    High = Field(input_processor=MapCompose(utility.strToFloatNumber),
                 output_processor=Join())
    Low = Field(input_processor=MapCompose(utility.strToFloatNumber),
                output_processor=Join())
    LTP = Field(input_processor=MapCompose(utility.strToFloatNumber),
                output_processor=Join())
    TotalChange = Field(input_processor=MapCompose(utility.strToFloatNumber),
                        output_processor=Join())
    PerChange = Field(input_processor=MapCompose(utility.strToFloatNumber),
                      output_processor=Join())
    Volume = Field(input_processor=MapCompose(utility.strToFloatNumber),
                   output_processor=Join())
    # Turnover is reported in lakhs.
    Turnover = Field(input_processor=MapCompose(utility.strToFloatNumber),
                     output_processor=Join())
    DatabaseTableName = Field(output_processor=Join())
    TimeStamp = Field(output_processor=Join())
def test_join(self):
    """Join() contract: strings only, space-separated, always text type."""
    proc = Join()
    # Non-string elements (None) must raise TypeError.
    self.assertRaises(TypeError, proc, [None, '', 'hello', 'world'])
    # Empty strings are kept, producing a leading separator.
    self.assertEqual(proc(['', 'hello', 'world']), u' hello world')
    self.assertEqual(proc(['hello', 'world']), u'hello world')
    # Result is text (unicode on py2, str on py3).
    self.assertIsInstance(proc(['hello', 'world']), six.text_type)
def parse_content(self, response):
    """Parse one post page into a YfspiderspeakItem.

    Extracts title, joined body text, a 'YYYY-MM-DD 00:00:00' publish time
    rebuilt from an M/D/YYYY date, image URLs, and normalised video URLs.

    Fix vs. previous revision: `deal_publish_time` used a mutable default
    argument (`publish_time_list=[]`); it now defaults to None (the falsy
    check covers both cases, so behavior is unchanged).
    """
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        # Missing or empty regex match -> no date.
        if not publish_time_list:
            print('time is None')
            return None
        if len(publish_time_list) == 3:
            # Zero-pad single-digit month/day components.
            padded = []
            for time_num in publish_time_list:
                time_num = str(time_num)
                if len(time_num) < 2:
                    padded.append('0' + time_num)
                else:
                    padded.append(time_num)
            # Input order is month/day/year -> 'YYYY-MM-DD 00:00:00'.
            publish_time_str = padded[2] + '-' + padded[0] + '-' + padded[1]
            publish_time_str = publish_time_str.strip('-')
            publish_time_str += ' 00:00:00'
            return publish_time_str
        else:
            print('publish_time_wrong')
            return None

    def deal_video_urls(video_urls):
        # Normalise scheme-less / slash-wrapped embed URLs to http://...
        video_urls_list = []
        for one_video_url in video_urls:
            one_video_url = one_video_url.strip('/')
            if 'http' not in one_video_url:
                one_video_url = 'http://' + one_video_url
            video_urls_list.append(one_video_url)
        return video_urls_list

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    # Post id is the final path component without its extension.
    loader1.add_value('id', response.url.split('/')[-1].split('.')[0])
    loader1.add_xpath(
        'title',
        '//div[@id="main"]//div[@class="post-outer"]/h2/text()',
        lambda x: x[0].strip())
    loader1.add_xpath(
        'content',
        '//div[@id="main"]//div[@class="post-entry"]//text()',
        lambda x: [x1.strip() for x1 in x], Join())
    loader1.add_value(
        'publish_time',
        response.xpath(
            '//div[@id="main"]//div[@class="date-outer"]//span[@class="heading-date"]'
        ).re(r'(\d{1,2})\/(\d{1,2})\/(\d{4})'),
        deal_publish_time)
    loader1.add_xpath(
        'img_urls',
        '//div[@id="main"]//div[@class="post-entry"]//img/@src')
    loader1.add_xpath(
        'video_urls',
        '//div[@id="main"]//div[@class="post-entry"]//iframe/@src',
        deal_video_urls)
    item = loader1.load_item()
    print(item)
    return item
def _text_joined_field():
    """Factory for CourseInfoItem's uniform field: Text() in, Join() out."""
    return scrapy.Field(
        input_processor=Text(),
        output_processor=Join(),
    )


class CourseInfoItem(PortiaItem):
    """Course record scraped via Portia; every field extracts text and joins
    the fragments into one string."""
    edu_method_cd = _text_joined_field()
    edu_location_desc = _text_joined_field()
    course_period = _text_joined_field()
    all_eval_accept_yn = _text_joined_field()
    edu_quota_cnt = _text_joined_field()
    link_url = _text_joined_field()
    enroll_appl_method_cd = _text_joined_field()
    edu_cycle_content = _text_joined_field()
    job_ability_course_yn = _text_joined_field()
    cb_eval_accept_yn = _text_joined_field()
    hrg_handicap_supp_yn = _text_joined_field()
    receive_period = _text_joined_field()
    lang_cd = _text_joined_field()
    vsl_handicap_supp_yn = _text_joined_field()
    edu_target_cd = _text_joined_field()
    org = _text_joined_field()
    course_desc = _text_joined_field()
    course_nm = _text_joined_field()
    enroll_amt = _text_joined_field()
    inquiry_tel_no = _text_joined_field()
    teacher_pernm = _text_joined_field()
def parse_restaurant(self, response):
    """Parse a restaurant page into a RestaurantItem, then either follow the
    menu link (carrying the item in meta) or yield the item directly.

    NOTE(review): `unicode.strip`, `urllib.unquote` and `urlparse.urlparse`
    are Python-2-only — confirm this spider targets a py2 runtime.
    """
    loader = ItemLoader(item=RestaurantItem(source=self.name,
                                            language='en',
                                            last_update=int(time.time())),
                        response=response)
    # Strip every input; empty strings collapse to None so TakeFirst skips them.
    loader.default_input_processor = Compose(
        MapCompose(lambda x: x.strip() or None))
    loader.default_output_processor = TakeFirst()
    url = url_query_cleaner(response.url)
    loader.add_value('url', url)
    # Restaurant id is the percent-decoded last path segment.
    id = urllib.unquote(urlparse.urlparse(url).path.split('/')[-1])
    loader.add_value('id', id)
    loader.add_xpath(
        'name',
        '//div[contains(@class, "biz-page-header")]//h1[contains(@class, "biz-page-title")]/text()'
    )
    # Instance-level output processor: join address lines with ' - '.
    loader.address_out = Join(' - ')
    loader.add_xpath('address',
                     "//div[contains(@class, 'map-box-address')]//text()")
    # Geolocation comes from the static-map image URL's 'center' query param.
    loader.add_xpath('geolocation', "//div[@class='mapbox-map']//img/@src",
                     MapCompose(lambda url: parse_qs(url).get('center')))
    loader.add_xpath(
        'phone_number',
        "//div[@class='mapbox-text']//span[@class='biz-phone']/text()")
    # Opening hours: day + time-range cells grouped in threes, list preserved.
    hours_loader = loader.nested_xpath(
        "//div[contains(@class, 'biz-hours')]//tr/th[@scope]/..")
    hours_loader.opening_hours_in = Compose(group_items(3))
    hours_loader.opening_hours_out = Identity()
    hours_loader.add_xpath(
        'opening_hours',
        './th/text() | ./td/span[@class="nowrap"]/text()')
    # Rating is the numeric part of the stars widget's title attribute.
    loader.add_xpath(
        'rating',
        '//div[contains(@class, "biz-page-header")]//div[contains(@class, "biz-rating")]/div[contains(@class, "i-stars")]/@title',
        re=r'(?:\D*)(\d+(?:\.\d+)?)')
    loader.number_of_reviews_in = MapCompose(int)
    loader.add_xpath(
        'number_of_reviews',
        '//div[contains(@class, "biz-page-header")]//span[contains(@class, "review-count")]/text()',
        re=r'^\D*(\d+)')
    # Info sidebar: key/value text paired in twos, list preserved.
    info_loader = loader.nested_xpath(
        '//div[contains(@class, "sidebar")]//div[@class="ywidget"]/ul[@class="ylist"]/li/div[contains(@class, "short-def-list")]/dl'
    )
    info_loader.info_in = Compose(MapCompose(unicode.strip), group_items(2))
    info_loader.info_out = Identity()
    info_loader.add_xpath(
        'info', './dt[@class="attribute-key"]/text() | ./dd/text()')
    item = loader.load_item()
    menu_url = TakeFirst()(response.xpath(
        '//h3[@class="menu-preview-heading"]/a/@href').extract())
    if menu_url:
        yield scrapy.Request(response.urljoin(menu_url),
                             callback=self.parse_menu,
                             meta={'item': item})
    else:
        yield item
def parse(self, response):
    """Parse a Rentslam listing page, yielding one RentslamItem per listing.

    All data must be extracted using XPATH queries.
    """
    # This path should return a list of blocks of HTML code that contain the
    # information about the listings.
    items = response.xpath("//article[contains(@class,'property-row')]")
    for item in items:
        l = ItemLoader(item=RentslamItem(), response=response)
        # All data must be extracted using XPATH queries.
        image_url = item.xpath('.//img/@src').extract_first()
        url = item.xpath('.//a/@href').extract_first()
        price = item.xpath(
            './/span[contains(@class,"property-row-meta-item-price")]/strong/text()'
        ).extract_first()
        bedrooms = item.xpath(
            './/span[contains(@class,"property-row-meta-item-beds")]/strong/text()'
        ).extract_first()
        size = item.xpath(
            './/span[contains(@class,"property-row-meta-item-area")]/strong/text()'
        ).extract_first()
        address = item.xpath('.//h2/a/text()').extract_first()
        text = item.xpath(
            './/div[@class="property-row-body"]/p/text()').extract_first()
        city = item.xpath('.//div[@class="property-row-location"]/a/text()'
                          ).extract_first()
        # In this example there is no furnishing info, it can be left empty.
        # furnishing = item.xpath('').extract_first()
        # Full url. Only the first image is required.
        l.add_value('ImageUrl', image_url)
        # Full url.
        l.add_value('Url', url)
        # Price must not include currency symbol, dot or comma. Decimals must
        # be filtered out. Example: € 1.348,77 --> 1348
        l.add_value('Price', price, Join(''), re=r'\d+')
        # Number.
        l.add_value('Bedrooms', bedrooms)
        # Size must include only the number. Things like "m2" must be
        # filtered out. Example: 90 m2 --> 90
        l.add_value('Size', size, TakeFirst(), re=r'\d+')
        # The address must contain the street name and the house number (if
        # it is present). It must not contain the city name or the postcode.
        l.add_value('Address', address)
        # This is the description of the listing.
        l.add_value('Text', text)
        # You can copy the email address from the website here.
        l.add_value('ContactEmailAddress', '*****@*****.**')
        # You can copy the phone number from the website here.
        l.add_value('ContactPhoneNumber', '085 - 273 67 30')
        # In this example there is no furnishing info, it can be left empty.
        # l.add_value('Furnishing', furnishing)
        # Name of the city. Sometimes it can have a literal value, like
        # "Amsterdam", if the website only contains listings from Amsterdam.
        l.add_value('City', city)
        yield l.load_item()
def _stripped_text_field():
    """Factory for GdSz6652Item's text field: strip tags and surrounding
    whitespace on input, join fragments on output."""
    return scrapy.Field(
        input_processor=MapCompose(remove_tags, lambda x: x.strip()),
        output_processor=Join(),
    )


class GdSz6652Item(scrapy.Item):
    """P2P-loan listing item (site code gd_sz_6652).

    Most fields share the strip-tags/strip/join processor pipeline; raw
    passthrough fields (`invest_records`, `web_name`, `url`, `web_code`,
    `item_code`) only join, and `a`/`b` are untouched.
    """
    title = _stripped_text_field()
    amount = _stripped_text_field()
    rate = _stripped_text_field()
    period = _stripped_text_field()
    start = _stripped_text_field()
    end = _stripped_text_field()
    invest_records = scrapy.Field(output_processor=Join())
    pay_type = _stripped_text_field()
    loaner_info = _stripped_text_field()
    loan_using = _stripped_text_field()
    loan_info = _stripped_text_field()
    progress = _stripped_text_field()
    code = _stripped_text_field()
    web_name = scrapy.Field(output_processor=Join())
    url = scrapy.Field(output_processor=Join())
    web_code = scrapy.Field(output_processor=Join())
    item_code = scrapy.Field(output_processor=Join())
    a = scrapy.Field()
    b = scrapy.Field()
class LagouJobItem(scrapy.Item):
    """Lagou.com job posting.

    `get_insert_sql` normalises the free-text experience / salary / publish
    time fields into numeric and datetime values and returns the upsert SQL
    plus its parameter tuple (position-coupled to the column list).
    """
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary_min = scrapy.Field()
    salary_max = scrapy.Field()
    job_city = scrapy.Field(input_processor=MapCompose(remove_splash), )
    work_years_min = scrapy.Field(input_processor=MapCompose(remove_splash), )
    work_years_max = scrapy.Field(input_processor=MapCompose(remove_splash), )
    degree_need = scrapy.Field(input_processor=MapCompose(remove_splash), )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field(input_processor=MapCompose(remove_tags,
                                                       handle_jobaddr), )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field(input_processor=Join(","))
    crawl_time = scrapy.Field()
    crawl_update_time = scrapy.Field()

    def get_insert_sql(self):
        """Return (sql, params) for an upsert into `lagou_job`.

        Sentinel values: 0.5 years for fresh graduates, 0 for "no
        requirement", 999 when experience is unparseable, 666 when salary is
        unparseable.

        Fix vs. previous revision: the "经验N年以上" (N+ years) branch read
        `match_obj4` — which is None there — and added the int 100 to a str;
        it now uses `match_obj5` and converts to int first.
        """
        insert_sql = """
            insert into lagou_job(title, url, url_object_id, salary_min,
                salary_max, job_city, work_years_min, work_years_max,
                degree_need, job_type, publish_time, job_advantage, job_desc,
                job_addr, company_name, company_url, tags, crawl_time)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary_min=VALUES(salary_min),
                salary_max=VALUES(salary_max), job_desc=VALUES(job_desc)
        """

        # --- experience: "经验X-Y年" / fresh grad / none / under-N / over-N ---
        match_obj1 = re.match(r"经验(\d+)-(\d+)年", self['work_years_min'])
        match_obj2 = re.match(r"经验应届毕业生", self['work_years_min'])
        match_obj3 = re.match(r"经验不限", self['work_years_min'])
        match_obj4 = re.match(r"经验(\d+)年以下", self['work_years_min'])
        match_obj5 = re.match(r"经验(\d+)年以上", self['work_years_min'])
        if match_obj1:
            self['work_years_min'] = match_obj1.group(1)
            self['work_years_max'] = match_obj1.group(2)
        elif match_obj2:
            self['work_years_min'] = 0.5
            self['work_years_max'] = 0.5
        elif match_obj3:
            self['work_years_min'] = 0
            self['work_years_max'] = 0
        elif match_obj4:
            self['work_years_min'] = 0
            self['work_years_max'] = match_obj4.group(1)
        elif match_obj5:
            # BUG FIX: previously used match_obj4 (None here) and str + int.
            self['work_years_min'] = match_obj5.group(1)
            self['work_years_max'] = int(match_obj5.group(1)) + 100
        else:
            self['work_years_min'] = 999
            self['work_years_max'] = 999

        # --- salary: "Xk-Yk" ---
        match_salary = re.match(r"(\d+)[Kk]-(\d+)[Kk]", self['salary_min'])
        if match_salary:
            self['salary_min'] = match_salary.group(1)
            self['salary_max'] = match_salary.group(2)
        else:
            self['salary_min'] = 666
            self['salary_max'] = 666

        # --- publish time: "HH:MM", "N天前" (N days ago), or "Y-M-D" ---
        match_time1 = re.match(r"(\d+):(\d+).*", self["publish_time"])
        match_time2 = re.match(r"(\d+)天前.*", self["publish_time"])
        match_time3 = re.match(r"(\d+)-(\d+)-(\d+)", self["publish_time"])
        if match_time1:
            today = datetime.datetime.now()
            hour = int(match_time1.group(1))
            minutues = int(match_time1.group(2))
            # renamed from `time` to avoid shadowing the time module
            publish_dt = datetime.datetime(today.year, today.month, today.day,
                                           hour, minutues)
            self["publish_time"] = publish_dt.strftime(SQL_DATETIME_FORMAT)
        elif match_time2:
            days_ago = int(match_time2.group(1))
            today = datetime.datetime.now() - datetime.timedelta(days=days_ago)
            self["publish_time"] = today.strftime(SQL_DATETIME_FORMAT)
        elif match_time3:
            year = int(match_time3.group(1))
            month = int(match_time3.group(2))
            day = int(match_time3.group(3))
            today = datetime.datetime(year, month, day)
            self["publish_time"] = today.strftime(SQL_DATETIME_FORMAT)
        else:
            self["publish_time"] = datetime.datetime.now().strftime(
                SQL_DATETIME_FORMAT)

        params = (
            self["title"],
            self["url"],
            self["url_object_id"],
            self["salary_min"],
            self["salary_max"],
            self["job_city"],
            self["work_years_min"],
            self["work_years_max"],
            self["degree_need"],
            self["job_type"],
            self["publish_time"],
            self["job_advantage"],
            self["job_desc"],
            self["job_addr"],
            self["company_name"],
            self["company_url"],
            self["tags"],
            self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )
        return insert_sql, params