class FolhaSpider(CrawlSpider):
    name = FOLHA_SPIDER_NAME
    allowed_domains = [FOLHA_DOMAIN]
    start_urls = FOLHA_START_URLS

    rules = [
        Rule(
            SgmlLinkExtractor(allow=(FOLHA_PODER), ),
            callback='parse_item',
            follow=True,
        ),
        Rule(
            SgmlLinkExtractor(allow=(FOLHA_PODEREPOLITICA), ),
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        article = ArticleItem()
        article['source'] = 'Folha de S.Paulo'
        article['url'] = response.url

        title = sel.xpath(FOLHA_ARTICLE_TITLE).extract()
        article['title'] = title[0] if title else None

        pub_date = sel.xpath(FOLHA_ARTICLE_PUB_DATE).extract()[0]
        print pub_date, " <<<<<< here"  # debug
        article['pub_date'] = datetime.strptime(pub_date, "%Y-%m-%d %H:%M")

        content = ' '.join(sel.xpath(FOLHA_ARTICLE_CONTENT).extract())
        article['body'] = content if content else None

        links = sel.xpath('//article//a/@href').extract()
        links = list(set(links))
        try:
            links.remove('javascript:;')
        except Exception:
            pass
        article['links'] = links

        return article
def _compile_rule(self, rule_dict):
    extractor = SgmlLinkExtractor2(allow=rule_dict['allow'],
                                   check_url=rule_dict.get('check_url', True))
    rule = Rule(extractor)

    def get_method(method):
        if callable(method):
            return method
        elif isinstance(method, basestring):
            return getattr(self, method, None)
        else:
            return None

    rule.process_links = get_method(rule_dict.get('process_links'))
    # default link type is an empty string in this variant
    rule.link_type = rule_dict.get('link_type', '')
    return rule
class SggonguoSpider(BaseSpider):
    name = "sggongzuo"
    allowed_domains = ["gongzuo.sg"]
    start_urls = ('http://www.gongzuo.sg', )
    rules = (
        Rule(
            LinkExtractor(allow='/\?page=[0-1]'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        return self.parse_item_requests_callback(response,
                                                 '//div[@class="summary"]')

    def populate_job_crawler_item(self, detail_item, job_crawler_item):
        try:
            job_crawler_item.job_title = detail_item.xpath(
                './/div[@class="title"]/a[1]/text()').extract()[0]
            job_crawler_item.job_details_link = detail_item.xpath(
                './/div[@class="title"]/a[1]/@href').extract()[0]
            job_crawler_item.job_country = 'Singapore'
            job_crawler_item.job_location = 'Singapore'
            job_crawler_item.publish_date = re.search(
                r'.*([0-9]{4}-[0-9]{2}-[0-9]{2}).*',
                detail_item.xpath('.//div[@class="attr"]/text()[2]').extract()[0],
                re.M).group(1).strip()
            # Convert to the datetime format
            job_crawler_item.publish_date = datetime.datetime.strptime(
                job_crawler_item.publish_date, '%Y-%m-%d'
            ) if job_crawler_item.publish_date is not None else None
            job_crawler_item.salary = detail_item.xpath(
                './/div[@class="attr"]/text()[4]').extract()[0].replace(
                    ',', '').strip()
            job_crawler_item.source = self.name
            job_crawler_item.crawled_date = datetime.datetime.now()
        except Exception as e:
            print e

    def retrieve_job_details(self, response):
        job_crawler_item = response.meta['item']
        try:
            job_crawler_item.job_desc = response.xpath(
                '/html/head/meta[@name="description"]/@content').extract()[0]
            job_crawler_item.contact = response.xpath(
                '//div[@id="article-body"]/div[@class="attr"]/text()[3]'
            ).extract()[0].replace('\n', '').strip()
        except Exception as e:
            print e
        yield job_crawler_item
def _compile_rule(self, rule_dict):
    extractor = SgmlLinkExtractor2(allow=rule_dict['allow'],
                                   check_url=rule_dict.get('check_url', True))
    rule = Rule(extractor)

    def get_method(method):
        if callable(method):
            return method
        elif isinstance(method, basestring):
            return getattr(self, method, None)
        else:
            return None

    rule.process_links = get_method(rule_dict.get('process_links'))
    # set default link type to leaf
    rule.link_type = rule_dict.get('link_type', LinkType.LEAF)
    return rule
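# A minimal usage sketch (not part of the original code): assuming
# _compile_rule is defined on a CrawlSpider subclass, a declarative list of
# rule dicts could be compiled into scrapy Rule objects before CrawlSpider's
# own __init__ copies self.rules. The base class name `RuleDictSpider`, the
# attribute `rule_dicts`, and the URL patterns below are illustrative
# assumptions only.
class ExampleRuleDictSpider(RuleDictSpider):
    name = 'example_rule_dict'
    rule_dicts = [
        {'allow': r'/category/\d+/page/\d+', 'link_type': LinkType.BRANCH},
        {'allow': r'/info/view/id/\d+'},  # falls back to LinkType.LEAF
    ]

    def __init__(self, *args, **kwargs):
        # compile the dicts into Rule objects so CrawlSpider picks them up
        self.rules = tuple(self._compile_rule(d) for d in self.rule_dicts)
        super(ExampleRuleDictSpider, self).__init__(*args, **kwargs)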
class SingxinSpider(BaseSpider):
    name = "singxin"
    allowed_domains = ["singxin.com"]
    start_urls = ('http://www.singxin.com/category/view/id/47', )
    rules = (
        Rule(
            LinkExtractor(allow='/category/view/id/47/page/[0-1]'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        return self.parse_item_requests_callback(response,
                                                 '//div[@class="listCell"]')

    def populate_job_crawler_item(self, detail_item, job_crawler_item):
        try:
            job_crawler_item.job_title = detail_item.xpath(
                './/a[@class="title"]/text()').extract()[0]
            job_crawler_item.job_details_link = 'http://www.singxin.com' + detail_item.re(
                r'<a.*href="(/info/view/id/[0-9]+)">.*</a>')[0]
            job_crawler_item.job_country = 'Singapore'
            job_crawler_item.job_location = 'Singapore'
            job_crawler_item.contact = detail_item.re(
                r'<a.*href="tel:(.*)">.*</a>')[0]
            job_crawler_item.source = self.name
            job_crawler_item.crawled_date = datetime.datetime.now()
        except Exception as e:
            print e

    def retrieve_job_details(self, response):
        job_crawler_item = response.meta['item']
        try:
            job_crawler_item.job_desc = response.xpath(
                '/html/head/meta[@name="description"]/@content').extract()[0]
            job_crawler_item.publish_date = response.selector.re(
                '<td><i class="icon-calendar icon-small"></i>(.*)</td>'
            )[0].replace(' ', '')
            # Convert to the datetime format
            job_crawler_item.publish_date = datetime.datetime.strptime(
                job_crawler_item.publish_date, '%Y-%m-%d'
            ) if job_crawler_item.publish_date is not None else None
        except Exception as e:
            print e
        yield job_crawler_item
class ShichengBBSSpider(BaseSpider):
    name = "shichengbbs"
    allowed_domains = ["shichengbbs.com"]
    start_urls = ('http://www.shichengbbs.com/category/view/id/47', )
    rules = (
        Rule(
            LinkExtractor(allow='/category/view/id/47/page/[0-2]'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        return self.parse_item_requests_callback(
            response, '//div[@class="listCell row-fluid"]')

    def populate_job_crawler_item(self, detail_item, job_crawler_item):
        try:
            job_crawler_item.job_title = detail_item.xpath(
                './div[1]/a/text()').extract()[0]
            job_crawler_item.job_details_link = 'http://www.shichengbbs.com' + \
                detail_item.re(r'<a.*href="(/info/view/id/[0-9]+)">.*</a>')[0]
            job_crawler_item.publish_date = \
                detail_item.re(r'(.*)<span.*</span> <i class="icon-phone-sign icon-small"></i>')[0].replace('\t', '')
            # Convert to the datetime format
            job_crawler_item.publish_date = self.derieve_date_from_short_date_string(
                job_crawler_item.publish_date
            ) if job_crawler_item.publish_date is not None else None
            job_crawler_item.job_country = 'Singapore'
            job_crawler_item.job_location = 'Singapore'
            job_crawler_item.contact = detail_item.xpath(
                './div[2]/a/text()').extract()[0]
            job_crawler_item.source = self.name
            job_crawler_item.crawled_date = datetime.datetime.now()
        except:
            pass

    def retrieve_job_details(self, response):
        job_crawler_item = response.meta['item']
        try:
            job_crawler_item.job_desc = \
                response.xpath('/html/head/meta[@name="description"]/@content').extract()[0]
        except:
            pass
        yield job_crawler_item
class OGLOBOSpider(CrawlSpider):
    name = OGLOBO_SPIDER_NAME
    allowed_domains = [OGLOBO_DOMAIN]
    start_urls = OGLOBO_START_URLS

    rules = [
        Rule(
            SgmlLinkExtractor(allow=(OGLOBO_URL_PATTERN), ),
            callback='parse_item',
            follow=True,
        ),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        article = ArticleItem()
        article['source'] = 'O Globo'
        article['url'] = response.url

        title = sel.xpath(OGLOBO_ARTICLE_TITLE).extract()
        article['title'] = title[0] if title else None

        pub_date = sel.xpath(OGLOBO_ARTICLE_PUB_DATE).extract()
        pub_date = pub_date[0].replace('T', ' ')
        article['pub_date'] = datetime.strptime(pub_date, "%Y-%m-%d %H:%M")

        content = ' '.join(sel.xpath(OGLOBO_ARTICLE_CONTENT).extract())
        article['body'] = content if content else None

        links = sel.xpath('//article//a/@href').extract()
        links = list(set(links))
        try:
            links.remove('javascript:;')
        except Exception:
            pass
        article['links'] = links

        return article
class restaurantSpider(CrawlSpider):
    name = "restaurantSpider"
    allowed_domains = ["www.yelp.com", ]
    # start_urls = [
    #     "http://www.yelp.com/search?find_desc=chinese+restaurant&find_loc=San+Francisco%2C+CA&ns=1&start=0&sortby=rating&l=g:-122.530517578,37.6859939294,-122.325897217,37.8488325065",
    # ]
    start_urls = [line for line in open("yelpCrawler/seeds/restaurant.txt")]

    rules = [
        Rule(SgmlLinkExtractor(allow=(pattern, ),
                               restrict_xpaths=('//ul[@class="pagination-links"]')),
             follow=True,
             callback='parse_restaurant'),
        # Rule(SgmlLinkExtractor(allow=(r"/search\?.*start=\d+")), follow=True, callback='parse_restaurant'),
    ]

    def parse_restaurant(self, response):
        print(response.url)
        sel = Selector(response)
        result_list = sel.css('div.search-results-content ul.ylist.ylist-bordered.search-results div.natural-search-result')
        # retry with another proxy if the result list came back empty
        if not result_list:
            log.msg("Retrying with " + response.url, level=log.INFO)
            yield Request(url=response.url, dont_filter=True)
        else:
            log.msg("Crawled " + response.url, level=log.INFO)
            for element in result_list:
                item = YelpcrawlerItem()
                item['name'] = clear_html_tag(trim_and_join(
                    element.css('h3.search-result-title a.biz-name').extract()))
                item['rating'] = trim_and_join(
                    element.css('div.rating-large i.star-img').xpath('@title').extract()).split()[0]
                item['review_count'] = trim_and_join(
                    element.css('span.review-count').xpath('text()').extract()).split()[0]
                item['price_range'] = trim_and_join(
                    element.css('div.price-category span.business-attribute.price-range').xpath('text()').extract()).count("$")
                location = element.css('div.secondary-attributes address').xpath('text()').extract()
                item['phone'] = trim_and_join(
                    element.css('div.secondary-attributes span.biz-phone').xpath('text()').extract())
                parse_location(location, item)
                yield item
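# `trim_and_join` and `clear_html_tag` are used above but not defined in this
# snippet. Minimal sketches of the assumed behaviour (join extracted fragments
# into one trimmed string; strip HTML tags with a regex) follow; the originals
# in the yelpCrawler project may differ.
import re

def trim_and_join(fragments, separator=' '):
    """Join a list of extracted strings and trim surrounding whitespace (assumed helper)."""
    return separator.join(fragment.strip() for fragment in fragments).strip()

def clear_html_tag(text):
    """Remove anything that looks like an HTML tag from `text` (assumed helper)."""
    return re.sub(r'<[^>]+>', '', text)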
class TestSpider(DailyDealSpider):
    country = 'us'
    name = 'test_spider'

    main_domain = 'dailysteals.com'
    allowed_domains = [main_domain]
    main_url = 'http://www.dailysteals.com/'

    rules = (
        # Deals
        Rule(SgmlLinkExtractor(
                restrict_xpaths='//h4[contains(@class,"product-title")]',
                process_value=lambda url: ensure_protocol(url)),
             callback='get_item',
             follow=False),
    )

    has_main_offer = False
    decimal_mark = DECIMAL_MARK_PERIOD
    price_currency_xpath = '//div[@itemprop="price"]//text()'

    extractors = {
        F_ID: Extractor(xpath='//a[contains(@class,"btn-cart")]/@data-itemid'),
        F_OFFER: Extractor(xpath='//h1[@itemprop="name"]//text()'),
        F_DISCOUNT: Extractor(xpath='//dl[@class="discount"]//text()'),
        F_SOLD: Extractor(xpath='//div[contains(@class, "coupons-bought")]/text()'),
        F_DESC: Extractor(xpath='//div[@class="merchant-content"]//text()'),
        F_CITY: Extractor(xpath='//div[@class="zone"]//text()'),
        F_M_NAME: Extractor(xpath='//div[@class="side-merchant"]/span/b/text()'),
        F_M_WEBSITE: Extractor(xpath='//div[@class="side-merchant"]//a/@href'),
        F_M_ADDRESS: Extractor(xpath='//div[@class="adress-info"]//text()'),
        # F_M_LAT: Extractor(xpath='//div[@class="adress-info"]/img/@src',
        #                    fn=lambda matches, r, s:
        #                    matches[0].split('%7C')[1].split(',')[0]),
        # F_M_LON: Extractor(xpath='//div[@class="adress-info"]/img/@src',
        #                    fn=lambda matches, r, s:
        #                    matches[0].split('%7C')[1].split(',')[1])
    }
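# `ensure_protocol`, used in process_value above, is not shown in this snippet.
# A minimal sketch of what it is assumed to do (prepend a scheme to
# protocol-relative or bare URLs) follows; the original project's version may
# behave differently.
def ensure_protocol(url, default='http'):
    """Return `url` with a scheme, assuming `default` when none is present."""
    if url.startswith('//'):
        return '{0}:{1}'.format(default, url)
    if '://' not in url:
        return '{0}://{1}'.format(default, url)
    return url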
class SaveMoneyIndia(CrawlSpider):
    name = "savemoney"
    allowed_domains = ["savemoneyindia.com"]
    categories = {
        "http://www.savemoneyindia.com/category/computers-laptops/": "Computer & Laptops",
        "http://www.savemoneyindia.com/category/mobiles/": "Mobiles",
        "http://www.savemoneyindia.com/category/mobile-dth-data-card-recharge/": "Recharge",
        "http://www.savemoneyindia.com/category/clothing-shoes-bags-lifestyle/": "Clothing",
        "http://www.savemoneyindia.com/category/footwear/": "Footwear",
        "http://www.savemoneyindia.com/category/electronics-gadgets/": "Electronics",
    }
    start_urls = categories.keys()
    rules = (
        Rule(SgmlLinkExtractor(
                allow=(r'www.savemoneyindia.com\/category.*\/page\/\d+\/', )),
             callback='parse_start_url',
             follow=True),
    )

    def parse_start_url(self, response):
        items = []
        for selection in response.xpath("//div[contains(@id, 'post-')]"):
            item = ContentItem()
            item['title'] = selection.xpath(
                "h2[@class='entry-title']/a/text()").extract()
            item['link'] = selection.xpath(
                "div[@class='entry']//a/@href").extract()
            item['desc'] = selection.xpath(
                "div[@class='entry']/p/text()").extract()
            item["category"] = key_lookup(self.categories, response.url)
            items.append(item)
        return items
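# `key_lookup` is not defined in this snippet. A minimal sketch of the assumed
# behaviour (return the category label whose URL key prefixes the current
# response URL, which also covers paginated /page/N/ URLs) follows; the real
# helper may differ.
def key_lookup(categories, url):
    """Return the label for the category URL that prefixes `url`, else '' (assumed helper)."""
    for category_url, label in categories.items():
        if url.startswith(category_url):
            return label
    return ''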
class SaveMoneyIndia(CrawlSpider):
    name = "freekamall"
    allowed_domains = ["freekaamaal.com"]  # must match the start URL's domain
    start_urls = ["http://freekaamaal.com/"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'freekaamaal.*\/page\/\d+\/', )),
             callback='parse_start_url',
             follow=True),
    )

    def parse_start_url(self, response):
        items = []
        for selection in response.xpath("//div[@class='contentpagination']"):
            item = ContentItem()
            item['title'] = selection.xpath(
                "div[@class='exp_detail']/a/h2/text()").extract()
            item['link'] = selection.xpath("a[@class='shpnw']/@href").extract()
            item['desc'] = selection.xpath(
                "div[@class='exp_detail']/span/text()").extract()
            item["category"] = ""
            items.append(item)
        return items
class HLSpider(CrawlSpider):
    name = "hl"
    allowed_domains = ["hl.co.uk"]
    start_urls = [
        "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results?is150=true"
        # "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results"
    ]
    rules = [
        Rule(
            SgmlLinkExtractor(
                allow=['/funds/fund-discounts,-prices--and--factsheets/search-results/'],
                deny=['charts$', 'invest$', 'tab=']),
            'parse_fund')
    ]

    def parse_fund(self, response):
        x = HtmlXPathSelector(response)
        fund = HlscraperItem()
        fund['Url'] = response.url
        fund['Name'] = x.select(
            "normalize-space(/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/text())"
        ).extract()
        fund['ExdividendDate'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Ex-dividend date')]]/../td/text())"
        ).extract()
        fund['PaymentDate'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Payment date')]]/../td/text())"
        ).extract()
        fund['RunningYield'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Running yield')]]/../td/text())"
        ).extract()
        fund['HistoricYield'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Historic yield')]]/../td/text())"
        ).extract()
        fund['IncomePaid'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Income paid')]]/../td/text())"
        ).extract()
        fund['TypeOfPayment'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Type of payment')]]/../td/text())"
        ).extract()
        fund['LaunchDate'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Launch date')]]/../td/text())"
        ).extract()
        fund['Sector'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Sector')]]/../td/text())"
        ).extract()
        fund['FundSize'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Fund size')]]/../td/text())"
        ).extract()
        fund['NumberOfHoldings'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Number of holdings')]]/../td/text())"
        ).extract()
        fund['TypeOfUnits'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Type of units')]]/../td/text())"
        ).extract()
        fund['FundType'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Fund type')]]/../td/text())"
        ).extract()
        fund['NetInitialCharge'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Net initial charge')]]/../td/text())"
        ).extract()
        fund['NetAnnualCharge'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Net Annual charge')]]/../td/text())"
        ).extract()
        fund['OtherExpenses'] = x.select(
            "normalize-space(//tr/th[text()[contains(., \"Fund manager's other expenses\")]]/../td/text())"
        ).extract()
        fund['PerformanceFee'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'Performance fee')]]/../td/text())"
        ).extract()
        fund['PlatformFee'] = x.select(
            "normalize-space(//tr/th[text()[contains(., 'HL Platform charge')]]/../td/text())"
        ).extract()
        fund['Wealth150'] = x.select(
            "/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/a/img/@src"
        ).extract()
        return fund
class WarAlbumSpider(CrawlSpider):
    checker = MongoChecker()
    name = 'war'
    description_xpath = '//*[@id="mcont"]/div/div[2]/div[4]/div[{0}]/div[2]/div[1]/text()'
    description_xpath0 = '//*[@id="mcont"]/div/div[2]/div[3]/div[{0}]/div[2]/div[1]/text()'
    image_xpath = '//*[@id="mcont"]/div/div[2]/div[4]/div[{0}]/div[2]/div[2]/div/a/img/@data-src_big'
    image_xpath0 = '//*[@id="mcont"]/div/div[2]/div[3]/div[{0}]/div[2]/div[2]/div/a/img/@data-src_big'
    post_link_xpath0 = '//*[@id="mcont"]/div/div[2]/div[3]/div[{0}]/a/@name'
    post_link_xpath = '//*[@id="mcont"]/div/div[2]/div[4]/div[{0}]/a/@name'
    page_name = 'page{0}.html'
    post_link_prefix = 'http://vk.com/waralbum?w=wall-'
    album_path = 'album'
    photo_name = 'photo{0}.jpg'
    allowed_domains = ['vk.com']
    start_urls = ['https://m.vk.com/waralbum']
    rules = [
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('//a[@class="show_more"]')),
            callback='parse_public',
            follow=True,
        )
    ]
    counter_pages = 1
    counter_posts = 0

    def parse_start_url(self, response):
        hxs = Selector(response)
        # self.save_page(response.body)
        return self.parse_posts(5, hxs, self.description_xpath0,
                                self.image_xpath0, self.post_link_xpath0)

    def parse_public(self, response):
        hxs = Selector(response)
        # self.save_page(response.body)
        self.counter_pages += 1
        return self.parse_posts(10, hxs, self.description_xpath,
                                self.image_xpath, self.post_link_xpath)

    def parse_posts(self, amount, selector, description_xpath, image_xpath,
                    post_link_xpath):
        posts = []
        for i in range(1, amount + 1):
            descr = selector.xpath(description_xpath.format(i)).extract()
            image_tmp_url = selector.xpath(image_xpath.format(i)).extract()
            description = ''
            if len(descr) > 0:
                description = descr[0]
            image_urls = []
            for img in image_tmp_url:
                image_urls.append(img.split('|')[0])
            if len(description) == 0 or len(image_urls) == 0:
                break
            post_link = self.post_link_prefix + selector.xpath(
                post_link_xpath.format(i)).extract()[0].split('-')[1]
            if self.checker.check(post_link):
                raise CloseSpider('Shutdown. New posts: {0}'.format(
                    self.counter_posts))
            local_images = []
            for url in image_urls:
                photo_file = self.photo_name.format(uuid.uuid4())
                urllib.urlretrieve(url, self.album_path + '/' + photo_file)
                local_images.append(photo_file)
            post = WaralbumPost()
            post['img_links'] = image_urls
            post['description'] = description
            post['post_link'] = post_link
            post['local_images'] = local_images
            posts.append(post)
            self.counter_posts += 1
            print description
            print image_urls
            print post_link
        return posts

    def save_page(self, content):
        with open(self.page_name.format(self.counter_pages), 'wb') as f:
            f.write(content)
class LivingSocialSpider(CrawlSpider):
    name = "boxOffice"
    allowed_domains = ["boxofficeindia.com"]
    login_page = "http://www.boxofficeindia.com/Years/years_detail/2012"
    start_urls = [
        "http://www.boxofficeindia.com/Years/years_detail/2012"  # login_page
    ]
    rules = (
        # Rule(SgmlLinkExtractor(allow=('/Boxoffice',)), follow=True),
        # Rule(SgmlLinkExtractor(allow=('/Years/years_detail/2014',))),
        Rule(SgmlLinkExtractor(allow=('movie_detail')),
             callback='myparse',
             follow=True),
    )
    mov_fields = {
        'title': './/div[@class="title4"]/a/text()',
        'rel_date': '//div[@id="detailed"]//span/ul/li[1]//td[2]/b/text()',
        'genre': '//div[@id="detailed"]//span/ul/li[2]//td[2]/a/b/text()',
        'run_time': '//div[@id="detailed"]//span/ul/li[3]//td[2]/b/text()',
        'budget': '//div[@id="detailed"]//span/ul/li[4]//td[2]/b/text()',
        'screens': '//div[@id="detailed"]//span/ul/li[5]//td[2]/b/text()',
        'footfalls': '//div[@id="detailed"]//span/ul/li[6]//td[2]/b/text()',
        'dis_share': '//div[@id="detailed"]//span/ul/li[7]//td[2]/b/text()',
        'total_gross': '//div[@id="detailed"]//span/ul/li[8]//td[2]/b/text()',
        'total_nett_gross': '//div[@id="detailed"]//span/ul/li[9]//td[2]/b/text()',
        # 'link': './/div[@class="details"]//td/b/text()'
        'link': './/div[@id="detailed"]//span/ul/li[9]//td[2]/b/text()'
    }

    def start_requests(self):
        return self.init_request()

    def init_request(self):
        print 'init_request'
        return [Request(url=self.login_page, callback=self.login)]

    def login(self, response):
        print 'login'
        return FormRequest.from_response(
            response,
            formnumber=1,
            formdata={
                'loginUname': '*****@*****.**',
                'loginUpass': '******'
            },
            callback=self.check_login_response)

    def check_login_response(self, response):
        print "login response"
        if "Logout" in response.body:
            print "login succeeded"
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
            # return Request("http://www.boxofficeindia.com/Years/years_detail/2014", callback=self.Red)
        else:
            # dump the failed login page for inspection
            self.file = open('dump2.html', 'wb')
            self.file.write(response.body)
            print "login failed"
            return

    # def parse(self, response):
    #     sel = HtmlXPathSelector(response)
    #     l = sel.select('//div[@class="images"]/a')
    #     for i in l:
    #         j = i.select('.//a')
    #         for iter in j:
    #             self.myparse(Request(iter.select(

    def myparse(self, response):
        print "myParse"
        selector = HtmlXPathSelector(response)
        # l = selector.select(self.deals_list_xpath)
        l = selector.select('//div[@id="detailed"]')
        ll = l.select('.//div[@class="title4"]/a/text()').extract()
        open(ll[0].strip() + '.html', 'wb').write(response.body)
        print ll[0].strip()
        for deal in l:
            # loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
            loader = XPathItemLoader(MoviesClass(), selector=deal)
            loader.default_input_processor = MapCompose(unicode.strip)
            loader.default_output_processor = Join()
            loader.default_output_processor = TakeFirst()
            for field, xpath in self.mov_fields.iteritems():
                loader.add_xpath(field, xpath)
                x = deal.select(field).extract()
            yield loader.load_item()
class SgxinSpider(BaseSpider):
    name = "sgxin"
    allowed_domains = ["sgxin.com"]
    start_urls = (
        'http://www.sgxin.com/viewcat_job1.html',
        'http://www.sgxin.com/viewcat_job2.html',
        'http://www.sgxin.com/viewcat_job3.html',
        'http://www.sgxin.com/viewcat_job4.html',
        'http://www.sgxin.com/viewcat_job5.html',
        'http://www.sgxin.com/viewcat_job6.html',
        'http://www.sgxin.com/viewcat_job7.html',
        'http://www.sgxin.com/viewcat_job8.html',
        'http://www.sgxin.com/viewcat_job9.html',
        'http://www.sgxin.com/viewcat_job10.html',
    )
    rules = (
        Rule(
            LinkExtractor(allow='index\.php\?ct=job.*&md=browse&page=[0-1]&'),
            callback='parse_item'),
    )

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        requests = []
        for job_item in response.xpath('//tr'):
            job_crawler_item = JobItem()
            for index, detail_item in enumerate(job_item.xpath('./td')):
                self.populate_job_crawler_item(index, detail_item,
                                               job_crawler_item)
                if index == 4:
                    if self.should_load_details(job_crawler_item):
                        requests.append(
                            Request(url=job_crawler_item.job_details_link,
                                    callback=self.retrieve_job_details,
                                    meta={'item': job_crawler_item},
                                    dont_filter=True))
        return requests

    def populate_job_crawler_item(self, index, detail_item, job_crawler_item):
        if index == 0:
            self.populate_job_title(detail_item, job_crawler_item)
        elif index == 1:
            self.populate_salary(detail_item, job_crawler_item)
        elif index == 2:
            self.populate_employer_name(detail_item, job_crawler_item)
        elif index == 3:
            self.populate_job_location(detail_item, job_crawler_item)
        elif index == 4:
            self.populate_publish_date(detail_item, job_crawler_item)
        else:
            pass

        self.populate_job_country(detail_item, job_crawler_item)
        job_crawler_item.source = self.name
        job_crawler_item.crawled_date = datetime.datetime.now()

    def populate_job_title(self, detail_item, job_crawler_item):
        job_crawler_item.job_title = detail_item.re(r'<a.*>(.*)</a>')[0]
        job_crawler_item.job_details_link = 'http://www.sgxin.com/' + detail_item.re(
            r'<a.*href="(.*)">.*</a>')[0]

    def populate_salary(self, detail_item, job_crawler_item):
        job_crawler_item.salary = detail_item.xpath('./text()').extract()[0]

    def populate_employer_name(self, detail_item, job_crawler_item):
        job_crawler_item.employer_name = detail_item.xpath(
            './text()').extract()[0]

    def populate_job_location(self, detail_item, job_crawler_item):
        job_crawler_item.job_location = detail_item.xpath(
            './text()').extract()[0]

    def populate_job_country(self, detail_item, job_crawler_item):
        job_crawler_item.job_country = 'Singapore'

    def populate_publish_date(self, detail_item, job_crawler_item):
        job_crawler_item.publish_date = detail_item.xpath(
            './text()').extract()[0]
        # Convert to the datetime format
        # job_crawler_item.publish_date = datetime.datetime.strptime(datetime.datetime.now().strftime('%Y') + '-' + job_crawler_item.publish_date, '%Y-%m-%d') if job_crawler_item.publish_date is not None else None
        job_crawler_item.publish_date = self.derieve_date_from_short_date_string(
            job_crawler_item.publish_date
        ) if job_crawler_item.publish_date is not None else None

    def retrieve_job_details(self, response):
        job_crawler_item = response.meta['item']
        try:
            # slice [3:-4] strips the surrounding <p></p>
            job_crawler_item.job_desc = \
                response.xpath('//blockquote/p').extract()[0][3:-4].replace('<br>', '\n').replace('<br/>', '\n')
            job_crawler_item.contact = response.xpath(
                '//*[@id="content"]/div/div[2]/span[9]/text()').extract()[0]
        except:
            pass
        yield job_crawler_item
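# `derieve_date_from_short_date_string` is called here (and in the shichengbbs
# spider above) but not defined in these snippets. Based on the commented-out
# conversion in populate_publish_date, a minimal sketch of the assumed
# behaviour -- prefix the current year onto a short 'MM-DD' style string --
# might look like this; the real helper may handle more formats.
def derieve_date_from_short_date_string(self, short_date_string):
    """Turn a short 'MM-DD' string into a datetime in the current year (assumed format)."""
    try:
        return datetime.datetime.strptime(
            datetime.datetime.now().strftime('%Y') + '-' + short_date_string.strip(),
            '%Y-%m-%d')
    except ValueError:
        # unparseable short date: leave the publish date unset
        return None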