class KetoSizeMe(spiders.CrawlSpider):
    name = 'keto-size-me'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketosizeme.com']
    start_urls = ['https://ketosizeme.com/category/ketogenic-diet-recipes/']

    rules = [
        # Extract links for finding additional pages within the recipe index,
        # e.g. https://ketosizeme.com/category/ketogenic-diet-recipes/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://ketosizeme.com/category/ketogenic-diet-recipes/page/\d+/')),

        # Extract links for recipes.
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://ketosizeme.com/.+/$',
                restrict_xpaths='//main'),
            callback=callback_handler.process_callback,
            follow=False),
    ]
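# All of the recipe spiders in this file share three project-level helpers that
# are not shown here: _get_download_root(), persist.ContentSaver, and
# CallbackHandler. The sketch below only illustrates the assumed interface (the
# names appear in the spiders above and below, but the bodies here are
# assumptions, not the project's real implementation): process_callback is a
# Scrapy callback that hands each crawled response to a saver rooted at a
# local download directory.

import os
import tempfile


def _get_download_root():
    # Assumed: crawl output goes under an environment-configured directory,
    # falling back to the system temp directory.
    return os.environ.get('DOWNLOAD_ROOT', tempfile.gettempdir())


class CallbackHandler(object):
    """Hypothetical sketch: forwards crawled pages to a content saver."""

    def __init__(self, content_saver):
        self._content_saver = content_saver

    def process_callback(self, response):
        # Assumed behavior: persist the raw response body keyed by its URL.
        self._content_saver.save(response.url, response.body)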
class RuledMeSpider(spiders.CrawlSpider):
    name = 'ruled-me'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ruled.me']
    start_urls = ['https://www.ruled.me/keto-recipes/']

    rules = [
        # Extract links for food category pages,
        # e.g. https://www.ruled.me/keto-recipes/breakfast/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.ruled.me/keto-recipes/\w+(\-\w+)*/$',
                restrict_xpaths='//div[@class="r-list"]')),

        # Extract links for finding additional pages within food category pages,
        # e.g. https://www.ruled.me/keto-recipes/dinner/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.ruled.me/keto-recipes/\w+(\-\w+)*/page/\d+/')),

        # Extract links for the actual recipes,
        # e.g. https://www.ruled.me/easy-keto-cordon-bleu/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.ruled.me/\w+(\-\w+)*/$',
                restrict_xpaths='//div[@id="content"]'),
            callback=callback_handler.process_callback,
            follow=False)
    ]
class QueenBs(spiders.CrawlSpider):
    name = 'queen-bs'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['queenbsincredibleedibles.com']
    start_urls = ['http://queenbsincredibleedibles.com/category/keto/page/1/']

    rules = [
        # Extract links for finding additional keto recipe pages,
        # e.g. http://queenbsincredibleedibles.com/category/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'http://queenbsincredibleedibles.com/category/keto/page/\d+/')),

        # Extract links for recipes,
        # e.g. http://queenbsincredibleedibles.com/creamy-coconut-kale-sausage-soup/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'http://queenbsincredibleedibles.com/.*/$',
                deny=r'(category\/)|(ive-fallen-in-love-with-keto)'),
            callback=callback_handler.process_callback,
            follow=False)
    ]
class KetogasmSpider(spiders.CrawlSpider):
    name = 'ketogasm'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketogasm.com']

    _url_format = ('https://ketogasm.com/recipe-index/?'
                   'fwp_recipes_filters=recipe&'
                   'fwp_paged=%d')
    start_urls = [
        (_url_format % 1),
        (_url_format % 2),
        (_url_format % 3),
        (_url_format % 4),
    ]

    rules = [
        # Extract links for recipes.
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://ketogasm.com/.*/$',
                restrict_xpaths='//div[@id="recipes-grid"]'),
            callback=callback_handler.process_callback,
            follow=False)
    ]
def __init__(self, *args, **kwargs):
    self.rules = (
        spiders.Rule(
            SameBaseDomainLinkExtractor(allowed_domains=self.allowed_domains),
            callback=self._parse_contents,
            follow=True),
    )

    logging.getLogger('scrapy.core.engine').setLevel(logging.INFO)
    logging.getLogger('scrapy.downloadermiddlewares.redirect').setLevel(
        logging.INFO)
    logging.getLogger('scrapy.spidermiddlewares.depth').setLevel(logging.INFO)

    # We must set up self.rules before calling super, since super calls
    # _compile_rules().
    super(AllStudiosScraper, self).__init__(*args, **kwargs)
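# AllStudiosScraper's rule above relies on SameBaseDomainLinkExtractor, which
# is project-specific and not shown here. A minimal sketch of what such an
# extractor could look like (the class body below is an assumption, not the
# project's implementation): a LinkExtractor that keeps only links whose
# domain is one of the spider's allowed_domains.

from scrapy.linkextractors import LinkExtractor
from scrapy.utils.url import url_is_from_any_domain


class SameBaseDomainLinkExtractor(LinkExtractor):
    """Hypothetical: only keeps links that stay on the allowed domains."""

    def __init__(self, allowed_domains, *args, **kwargs):
        self._same_base_domains = allowed_domains
        super(SameBaseDomainLinkExtractor, self).__init__(*args, **kwargs)

    def extract_links(self, response):
        links = super(SameBaseDomainLinkExtractor, self).extract_links(response)
        return [
            link for link in links
            if url_is_from_any_domain(link.url, self._same_base_domains)
        ]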
class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    old_releases = tuple([
        "/%s" % old_release for old_release in [
            'austin', 'bexar', 'cactus', 'diablo', 'essex', 'folsom',
            'grizzly', 'havana', 'icehouse', 'juno', 'kilo', 'liberty',
            'mitaka'
        ]
    ])

    rules = [
        spiders.Rule(
            LinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ],
                deny=[r'/trunk/', r'/draft/', r'/api/', r'/juno/',
                      r'/icehouse/']),
            follow=True,
            callback='parse_item')
    ]

    def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = ['http://%s' % domain]
        for url in urls.split(','):
            if not url:
                continue
            self.start_urls.append(url)

    def parse_item(self, response):
        item = SitemapItem()
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        if path.startswith(self.old_releases):
            # weekly changefrequency and lower priority for old files
            item['priority'] = '0.5'
            item['changefreq'] = 'weekly'
        else:
            # daily changefrequency and highest priority for current files
            item['priority'] = '1.0'
            item['changefreq'] = 'daily'

        if 'Last-Modified' in response.headers:
            timestamp = response.headers['Last-Modified']
        else:
            timestamp = response.headers['Date']
        lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
class Ketovale(spiders.CrawlSpider):
    name = 'ketovale'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketovale.com']
    start_urls = ['https://www.ketovale.com/category/recipes/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.ketovale.com/category/recipes/page/3/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.ketovale.com/category/recipes/page/\d+/')),

        # Extract links for recipes.
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.ketovale.com/recipe/.*/$',
                restrict_xpaths='//h2[@class="entry-title"]'),
            callback=callback_handler.process_callback,
            follow=False),
    ]
class SugarFreeMom(spiders.CrawlSpider):
    name = 'sugar-free-mom'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['sugarfreemom.com']
    start_urls = ['https://www.sugarfreemom.com/recipes/category/diet/keto/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.sugarfreemom.com/recipes/category/diet/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'sugarfreemom.com/recipes/category/diet/keto/page/\d+/')),

        # Extract links for recipes.
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'sugarfreemom.com/recipes/[^\/]+/$',
                restrict_xpaths='//main'),
            callback=callback_handler.process_callback,
            follow=False),
    ]
class GreekGoesKetoSpider(spiders.CrawlSpider):
    name = 'greek-goes-keto'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['greekgoesketo.com']
    start_urls = ['https://www.greekgoesketo.com/category/recipes/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.greekgoesketo.com/category/recipes/page/1/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://(.+\.)greekgoesketo.com/category/recipes/page/\d+/')),

        # Extract links for recipes.
        spiders.Rule(
            linkextractors.LinkExtractor(restrict_css='main article'),
            callback=callback_handler.process_callback,
            follow=False),
    ]
class HeyKetoMamaSpider(spiders.CrawlSpider):
    name = 'hey-keto-mama'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['heyketomama.com']
    start_urls = ['https://www.heyketomama.com/category/recipes/page/1/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.heyketomama.com/category/recipes/page/6/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.heyketomama.com/category/recipes/page/\d+/')),

        # Extract links for recipes,
        # e.g. https://www.heyketomama.com/ten-minute-keto-nachos/
        spiders.Rule(
            linkextractors.LinkExtractor(
                restrict_xpaths='//div[@class="entry-content"]'),
            callback=callback_handler.process_callback,
            follow=False),
    ]
class WholesomeYum(spiders.CrawlSpider):
    name = 'wholesome-yum'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['wholesomeyum.com']
    start_urls = ['https://www.wholesomeyum.com/tag/keto/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.wholesomeyum.com/tag/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'wholesomeyum.com/tag/keto/page/\d+/')),

        # Extract links for recipes.
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=[
                    r'wholesomeyum.com/[^\/]+/$',
                    r'wholesomeyum.com/recipes/[^\/]+/$'
                ],
                restrict_xpaths='//main'),
            callback=callback_handler.process_callback,
            follow=False),
    ]
class LowCarbYum(spiders.CrawlSpider):
    name = 'low-carb-yum'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['lowcarbyum.com']
    start_urls = ['https://lowcarbyum.com/recipes/']

    rules = [
        # Extract links for food category pages,
        # e.g. https://lowcarbyum.com/category/desserts/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://lowcarbyum.com/category/',
                deny=r'https://lowcarbyum.com/category/((reviews)|(articles))')),

        # Extract links for recipes.
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://lowcarbyum.com/.+/$',
                restrict_xpaths='//header[@class="entry-header"]'),
            callback=callback_handler.process_callback,
            follow=False)
    ]
class GreekGoesKetoSpider(spiders.CrawlSpider):
    name = 'greek-goes-keto'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['greekgoesketo.com']
    start_urls = ['https://greekgoesketo.com/category/recipes/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://greekgoesketo.com/category/recipes/page/1/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://greekgoesketo.com/category/recipes/page/\d+/')),

        # Extract links for recipes (date-based permalinks of the form
        # https://greekgoesketo.com/<year>/<month>/<day>/<recipe-title>/).
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://greekgoesketo.com/\d{4}/\d{2}/\d{2}/.+/',
                restrict_xpaths='//div[@class="content-block"]'),
            callback=callback_handler.process_callback,
            follow=False),
    ]
class YourFriendsJ(spiders.CrawlSpider):
    name = 'your-friends-j'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['yourfriendsj.com']
    start_urls = ['http://yourfriendsj.com/recipe-library/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. http://yourfriendsj.com/recipe-library/?paged=2
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'yourfriendsj.com/recipe-library/\?paged=\d+')),

        # Extract links for recipes,
        # e.g. http://yourfriendsj.com/recipes/easy-guacamole-recipe/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'http://yourfriendsj.com/recipes/[^\/]*/$',
                restrict_xpaths='//article'),
            callback=callback_handler.process_callback,
            follow=False)
    ]
class SkinnyTaste(spiders.CrawlSpider):
    name = 'skinny-taste'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['skinnytaste.com']
    start_urls = ['https://www.skinnytaste.com/recipes/keto/']

    rules = [
        # Extract links for finding additional recipe pages,
        # e.g. https://www.skinnytaste.com/recipes/keto/page/2/
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'skinnytaste.com/recipes/keto/page/\d+/')),

        # Extract links for recipes.
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=[
                    r'skinnytaste.com/[^\/]+/$',
                ],
                restrict_xpaths='//div[@class="archives"]'),
            callback=callback_handler.process_callback,
            follow=False),
    ]
class StoreSpider(ss.CrawlSpider):
    name = "store"

    start_urls = [
        'https://www.microsoft.com/en-in/store/top-free/apps/pc',
        'https://www.microsoft.com/en-in/store/top-free/games/mobile',
        'https://www.microsoft.com/en-in/store/top-free/games/pc',
        'https://www.microsoft.com/en-in/store/top-free/games/xbox',
        'https://www.microsoft.com/en-in/store/top-free/apps/mobile'
    ]

    # Follow "next page" links; deny URLs ending in "-1" (the first page).
    rules = (
        ss.Rule(
            LinkExtractor(
                allow=(),
                deny=(".*-1"),
                restrict_xpaths=("//a[contains(@aria-label,'next page')]")),
            callback='parse_item',
            follow=True),
    )

    custom_settings = {
        'ITEM_PIPELINES': {
            'appstore.pipelines.AppstorePipeline': 300
        }
    }

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        selected = Selector(response=response).xpath(
            '//div[contains(@class, "c-group f-wrap-items context-list-page")]')
        sections = selected.xpath(
            "//section[contains(@class,'m-product-placement-item f-size-medium context-app')]")
        for section in sections:
            soup = BeautifulSoup(section.extract(), 'html.parser')
            try:
                item = AppstoreItem()
                item['name'] = soup.h3.text
                item['rating'] = soup.find('span', {'itemprop': 'ratingValue'}).text
                item['url'] = urllib.parse.urljoin(response.url,
                                                   soup.find('a')['href'])
                yield item
            except (AttributeError, TypeError):
                # Skip placements that are missing a name, rating, or link.
                pass
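# StoreSpider yields AppstoreItem objects and routes them through
# appstore.pipelines.AppstorePipeline; neither is shown here. A minimal sketch
# of the item, covering just the three fields parse_item fills in (the real
# appstore/items.py may define more):

import scrapy


class AppstoreItem(scrapy.Item):
    name = scrapy.Field()
    rating = scrapy.Field()
    url = scrapy.Field()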
class KetovangelistKitchen(spiders.CrawlSpider):
    name = 'ketovangelist-kitchen'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketovangelistkitchen.com']

    # Organize start URLs in descending order of category strength (e.g.
    # muffins should be categorized as "snack", not "eggs").
    start_urls = [
        'http://www.ketovangelistkitchen.com/indexes/recipes/appetizers/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/desserts/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/beverages/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/sides/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/snack/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/soup/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/sauces-dressings/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/casseroles/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/fat-bombs/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/dairy-free/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/kid-friendly/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/baked-goods/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/beef/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/chicken-turkey/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/chocolate/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/fish/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/pork/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/vegetables/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/nuts/',
        'http://www.ketovangelistkitchen.com/indexes/recipes/eggs/',
    ]

    rules = [
        # Extract links for recipes.
        spiders.Rule(
            linkextractors.LinkExtractor(
                restrict_xpaths='//div[@class="entry-content"]'),
            callback=callback_handler.process_callback,
            follow=False)
    ]
class DietDoctorSpider(spiders.CrawlSpider):
    name = 'diet-doctor'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['dietdoctor.com']

    # TODO(mtlynch): Make this more flexible. It's currently limited to only 40
    # pages, but it should figure out which pages are present. I've added Rules
    # for the Previous/Next links, but they don't seem to work.
    _url_prefix = ('https://www.dietdoctor.com/low-carb/recipes'
                   '?s=&st=recipe&lowcarb%5B%5D=keto&sp=')
    start_urls = [_url_prefix + str(i) for i in range(1, 40)]

    rules = [
        # Extract links for recipes,
        # e.g. /recipes/green-onion-no-chile-chicken-enchiladas
        spiders.Rule(
            linkextractors.LinkExtractor(
                allow=r'https://www.dietdoctor.com/recipes/'),
            callback=callback_handler.process_callback,
            follow=False),
    ]
class KetoConnectSpider(spiders.CrawlSpider):
    name = 'ketoconnect'

    callback_handler = CallbackHandler(
        content_saver=persist.ContentSaver(_get_download_root()))

    allowed_domains = ['ketoconnect.net']
    start_urls = [
        'https://www.ketoconnect.net/main-dishes/',
        'https://www.ketoconnect.net/side-dishes/',
        'https://www.ketoconnect.net/breakfasts/',
        'https://www.ketoconnect.net/snacks/',
        'https://www.ketoconnect.net/desserts/',
        'https://www.ketoconnect.net/beverages/'
    ]

    rules = [
        # Extract links for the actual recipes,
        # e.g. https://www.ketoconnect.net/recipe/spicy-cilantro-dressing/
        spiders.Rule(
            linkextractors.LinkExtractor(restrict_xpaths='//article'),
            callback=callback_handler.process_callback,
            follow=False),
    ]
class CrawlJobSpider(sp.CrawlSpider):
    name = "Crawl_Job"
    # Scrapy expects bare domain names here (the original `allow_domains`
    # attribute holding a full URL was ignored by Scrapy).
    allowed_domains = ['jobs.51job.com']
    start_urls = ['https://jobs.51job.com/all/']

    rules = (
        # Follow listing pages, e.g. https://jobs.51job.com/all/p2
        sp.Rule(LinkExtractor(allow=(r'https://jobs.51job.com/all/p\d+'))),
        # Parse individual job postings.
        sp.Rule(LinkExtractor(allow=(r'https://jobs.51job.com/.*/\d+.*',)),
                callback='parse_Item'),
    )

    def parse_Item(self, response):
        Job_item = CrawljobItem()
        Job_item['url'] = response.url
        Job_item['job_name'] = re.sub(
            r"\(职位编号.*\)", '',
            response.css("div.cn h1::attr(title)").extract()[0])

        # Normalize the quoted salary to CNY per month.
        money = response.css("div.cn strong::text").extract()
        Job_item['salary'] = 0.0
        Job_item['Low_salary'] = 0.0
        Job_item['High_salary'] = 0.0
        Job_item['average_salary'] = 0.0
        factormo = 1.0
        factordate = 1.0
        if money:
            Job_item['salary'] = money
            text = money[0]
            # Unit of the figure: thousand, ten-thousand, or yuan.
            if "千" in text:
                factormo = 1000.0
                text = text.replace("千", "")
            elif "万" in text:
                factormo = 10000.0
                text = text.replace("万", "")
            elif "元" in text:
                factormo = 1.0
                text = text.replace("元", "")
            # Pay period: per month, year, day, or hour.
            if "月" in text:
                factordate = 1.0
                text = text.replace("月", "")
            elif "年" in text:
                factordate = 1.0 / 12.0
                text = text.replace("年", "")
            elif "天" in text:
                factordate = 31.0
                text = text.replace("天", "")
            elif "小时" in text:
                factordate = 8.0 * 31.0
                text = text.replace("小时", "")
            text = text.replace("/", "")
            mo = text.split('-')
            if len(mo) == 2:
                a = float(mo[0]) * factormo * factordate
                b = float(mo[1]) * factormo * factordate
                Job_item['Low_salary'] = a
                Job_item['High_salary'] = b
                Job_item['average_salary'] = (a + b) / 2.0
            elif len(mo) == 1:
                a = float(mo[0]) * factormo * factordate
                Job_item['Low_salary'] = a
                Job_item['High_salary'] = a
                Job_item['average_salary'] = a

        # The info line holds address / experience / education / headcount /
        # posting date (and sometimes a language requirement).
        info = response.css("div.cn p[class='msg ltype']::text").extract()
        numlist = len(info)
        company_address = info[0].replace('\n', '').replace('\r', '') \
            .replace('\t', '').replace('\xa0', '')
        Job_item['company_address'] = company_address.split("-")[0]
        Job_item['work_experience'] = info[1].replace('\xa0', '')
        Job_item['work_language'] = ''
        if numlist == 5:
            Job_item['education'] = info[2].replace('\xa0', '')
            Job_item['need_numbers'] = info[3].replace('\xa0', '')
            Job_item['release_time'] = info[4].replace('\xa0', '') \
                .replace('\t', '').replace("发布", '')
            Job_item['work_language'] = '普通话精通'
        elif numlist == 6 or numlist == 7:
            Job_item['education'] = info[2].replace('\xa0', '')
            Job_item['need_numbers'] = info[3].replace('\xa0', '')
            Job_item['release_time'] = info[4].replace('\xa0', '') \
                .replace('\t', '').replace("发布", '')
            Job_item['work_language'] = info[5].replace('\xa0', '').replace('\t', '')
        elif numlist == 4:
            Job_item['education'] = 'None'
            Job_item['need_numbers'] = info[2].replace('\xa0', '')
            Job_item['release_time'] = info[3].replace('\xa0', '') \
                .replace('\t', '').replace("发布", '')
            Job_item['work_language'] = '普通话精通'
        else:
            print(numlist)

        company_name = response.css("div.com_msg > a > p::text").extract()
        if company_name is not None:
            Job_item['company_name'] = company_name

        company_type = response.css(
            "div:nth-child(1) > div.com_tag > p:nth-child(1)::text").extract()
        company_size = response.css(
            "div:nth-child(1) > div.com_tag > p:nth-child(2)::text").extract()
        company_business = response.css(
            "div:nth-child(1) > div.com_tag > p:nth-child(3)::attr(title)").extract()
        Job_item['company_type'] = company_type if company_type else "None"
        Job_item['company_size'] = company_size if company_size else "None"
        Job_item['company_business'] = company_business if company_business else "None"

        job_detail = response.css("div.tCompany_main > div:nth-child(1)") \
            .xpath("string(div)").extract()[0] \
            .replace('\t', '').replace('\n', '').replace('\r', ' ')
        Job_item['job_detail'] = job_detail if job_detail else "None"

        job_catacategory = response.css(
            "div.tCompany_main > div:nth-child(1) > div > div.mt10") \
            .xpath("string(p)").extract()[0] \
            .replace('\t', '').replace('\n', '') \
            .replace("职能类别:", "").replace("\r", " ")
        Job_item['job_catacategory'] = job_catacategory if job_catacategory else "None"

        company_detail = response.css("div.tCompany_main > div:last-child") \
            .xpath("string(div)").extract()[0] \
            .replace('\t', '').replace('\n', '')
        Job_item['company_detail'] = company_detail if company_detail else "None"

        return Job_item
class CrunchbaseSpider(spiders.CrawlSpider):
    name = "crunchbase"

    # TODO: find out if pages with 416 status code are re-crawled or not!
    # handle_httpstatus_list = [416]

    def start_requests(self):
        urls = []
        with open('urls.txt', 'rb') as urls_file:
            # Change encoding if necessary
            urls = [
                line.strip()
                for line in urls_file.read().decode('utf16').splitlines()
                if line.strip()
            ]
        for url in urls:
            yield self.make_requests_from_url(url)

    rules = (
        # Crawl and parse person
        spiders.Rule(LinkExtractor(allow=r'/person/.*',
                                   deny=r'/person/.*[/\.]'),
                     callback='parse_person',
                     follow=True),
        # Crawl organization
        spiders.Rule(LinkExtractor(allow=r'/organization/.*',
                                   deny=r'/organization/.*[/\.]'),
                     callback='parse_organization',
                     follow=True),
        # Crawl acquisitions table
        spiders.Rule(LinkExtractor(allow=r'/acquisitions$',
                                   deny=r'/app/search',
                                   restrict_css='.acquisitions'),
                     callback='parse_acquisitions'),
        # Crawl employees
        spiders.Rule(LinkExtractor(allow=r'/people$',
                                   deny=r'/app/search',
                                   restrict_css='.people'),
                     callback='parse_employees'),
        # Crawl competitors
        spiders.Rule(LinkExtractor(allow=r'/competitors$',
                                   restrict_css='.competitors'),
                     callback='parse_competitors'),
        # Crawl partners
        spiders.Rule(LinkExtractor(allow=r'/partners$',
                                   restrict_css='.partners'),
                     callback='parse_partners'),
        # Crawl advisors
        spiders.Rule(LinkExtractor(allow=r'/advisors$',
                                   restrict_css='.advisors'),
                     callback='parse_advisors'),
    )

    def parse_start_url(self, response):
        if response.url.find('/person/') >= 0:
            self.parse_person(response)
        elif response.url.find('/organization/') >= 0:
            self.parse_organization(response)
        else:
            raise Exception('Start url is neither person nor organization')

    # NOTE: there might be field-specific processors under scraper/items.py

    def parse_person(self, response):
        loader = ItemLoader(item=Person(), response=response)
        loader.default_input_processor = processors.MapCompose(
            w3lib.html.remove_tags)
        loader.default_output_processor = processors.TakeFirst()

        loader.add_xpath('name', '//*[@id="profile_header_heading"]/a/text()')
        loader.add_value('url', response.url)
        loader.add_xpath(
            'primary_role', '//*[@id="info-card-overview-content"]/div/dl/div/dd')

        # Fields expected: born, gender, location, website
        overview = response.xpath(
            '//*[@id="info-card-overview-content"]/div/dl/dt/text()')
        overview_loader = loader.nested_xpath(
            '//*[@id="info-card-overview-content"]/div/dl')
        for i in range(len(overview)):
            key = overview[i].extract()
            key = key[:key.find(':')].lower()
            try:
                overview_loader.add_xpath(key, 'dd[{}]/text()'.format(i + 1))
            except KeyError as e:
                # Ignore if key is not in the Item's field
                pass

        loader.add_xpath('facebook',
                         '(//a[contains(@class,"facebook")])[1]/@href')
        loader.add_xpath('twitter',
                         '(//a[contains(@class,"twitter")])[1]/@href')
        loader.add_xpath('linkedin',
                         '(//a[contains(@class,"linkedin")])[1]/@href')
        loader.add_xpath('description', '//*[@id="description"]/span/div')
        loader.add_css('current_jobs', '.current_job')
        loader.add_css('past_jobs', '.past_job')
        loader.nested_css('.advisory_roles').add_xpath('board_advisors',
                                                       './/ul/li')
        loader.nested_css('table.investors').add_xpath(
            'investments', './/tr[not(@class="thead")]')
        loader.nested_css('.education').add_xpath('education', './/ul/li')
        return loader.load_item()

    def parse_organization(self, response):
        loader = ItemLoader(item=Organization(), response=response)
        loader.default_input_processor = processors.MapCompose(
            w3lib.html.remove_tags)
        loader.default_output_processor = processors.TakeFirst()

        loader.add_xpath('name', '//*[@id="profile_header_heading"]/a/text()')
        loader.add_value('url', response.url)
        # loader.add_value('ipo_stock', None)  # TODO!
        # TODO: supposed to get person url for founders!

        # Fields expected: headquarters, description, founders, categories,
        # website, founded (date), and aliases
        keys = response.css('div.definition-list').xpath('dt/text()')
        values = response.css('div.definition-list').xpath('dd')
        for i in range(len(keys)):
            key = keys[i].extract()
            key = key[:key.find(':')].lower()
            try:
                loader.add_value(key, values[i].extract())
            except KeyError as e:
                # Ignore if key is not in the Item's field
                pass

        loader.add_xpath('facebook',
                         '(//a[contains(@class,"facebook")])[1]/@href')
        loader.add_xpath('twitter',
                         '(//a[contains(@class,"twitter")])[1]/@href')
        loader.add_xpath('linkedin',
                         '(//a[contains(@class,"linkedin")])[1]/@href')
        yield loader.load_item()

        for item in self.parse_acquisitions(response):
            yield item
        for item in self.parse_employees(response):
            yield item
        for item in self.parse_competitors(response):
            yield item
        for item in self.parse_partners(response):
            yield item
        for item in self.parse_advisors(response):
            yield item

    def parse_acquisitions(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        acq_selectors = response.css('div.acquisitions').xpath('.//tr[not(th)]')
        for sel in acq_selectors:
            loader = ItemLoader(item=Acquisition(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('focal_company_url', company_url)
            loader.add_xpath('date', 'td[1]/text()')
            loader.add_xpath('acquired_url', 'td[2]/a/@href')
            yield loader.load_item()

    def parse_employees(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        employee_selector = response.css('div.people').xpath('.//ul/li')
        for sel in employee_selector:
            loader = ItemLoader(item=Employee(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('company_url', company_url)
            loader.add_xpath('person_url', './/h4/a/@href')
            loader.add_xpath('title', './/h5/text()')
            yield loader.load_item()

    def parse_competitors(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        comp_selectors = response.css('div.competitors').xpath('.//ul/li//h4/a')
        for sel in comp_selectors:
            loader = ItemLoader(item=Competitor(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('focal_company_url', company_url)
            loader.add_xpath('competitor_url', '@href')
            yield loader.load_item()

    def parse_partners(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        partner_selectors = response.css('div.partners').xpath('.//ul/li//h4/a')
        for sel in partner_selectors:
            loader = ItemLoader(item=Partner(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('focal_company_url', company_url)
            loader.add_xpath('partner_url', '@href')
            yield loader.load_item()

    def parse_advisors(self, response):
        company_url = response.xpath(
            '//*[@id="profile_header_heading"]/a/@href').extract_first()
        employee_selector = response.css('div.advisors').xpath('.//ul/li')
        for sel in employee_selector:
            loader = ItemLoader(item=BoardMember(), selector=sel)
            loader.default_input_processor = processors.MapCompose(
                w3lib.html.remove_tags)
            loader.default_output_processor = processors.TakeFirst()
            loader.add_value('company_url', company_url)
            loader.add_xpath('person_url', './/h4/a/@href')
            loader.add_xpath('title', './/h5/text()')
            yield loader.load_item()
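# CrunchbaseSpider assumes Item classes (Person, Organization, Acquisition,
# Employee, Competitor, Partner, BoardMember) defined in scraper/items.py,
# possibly with field-specific processors. As a sketch, the Person item needs
# at least the fields that parse_person populates; the field names below are
# taken from the add_* calls above, while everything else about items.py is an
# assumption.

import scrapy


class Person(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    primary_role = scrapy.Field()
    # Overview keys scraped from the info card: born, gender, location, website.
    born = scrapy.Field()
    gender = scrapy.Field()
    location = scrapy.Field()
    website = scrapy.Field()
    facebook = scrapy.Field()
    twitter = scrapy.Field()
    linkedin = scrapy.Field()
    description = scrapy.Field()
    current_jobs = scrapy.Field()
    past_jobs = scrapy.Field()
    board_advisors = scrapy.Field()
    investments = scrapy.Field()
    education = scrapy.Field()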
class SitemapSpider(spiders.CrawlSpider):
    name = 'sitemap'

    MAINT_SERIES = ['newton', 'ocata', 'pike']
    MAINT_RELEASES_PAT = re.compile('^/(' + '|'.join(MAINT_SERIES) + ')/')
    LATEST_PAT = re.compile('^/latest/')

    rules = [
        spiders.Rule(
            LinkExtractor(
                allow=[
                    r'.*\.html',
                    r'.*\.pdf',
                    r'.*\.xml',
                    r'.*\.txt',
                    r'.*/',
                ],
                deny=[
                    r'/trunk/', r'/draft/', r'/austin/', r'/bexar/',
                    r'/cactus/', r'/diablo/', r'/essex/', r'/folsom/',
                    r'/grizzly/', r'/havana/', r'/icehouse/', r'/juno/',
                    r'/kilo/', r'/liberty/', r'/mitaka/'
                ]),
            follow=True,
            callback='parse_item')
    ]

    def __init__(self, domain='docs.openstack.org', urls='', *args, **kwargs):
        super(SitemapSpider, self).__init__(*args, **kwargs)
        self.domain = domain
        self.allowed_domains = [domain]
        self.start_urls = ['http://%s' % domain]
        for url in urls.split(','):
            if not url:
                continue
            self.start_urls.append(url)

    def parse_item(self, response):
        item = SitemapItem()
        item['loc'] = response.url

        path = urlparse.urlsplit(response.url).path
        if self.MAINT_RELEASES_PAT.match(path):
            # weekly changefrequency and highest prio for maintained release
            item['priority'] = '1.0'
            item['changefreq'] = 'weekly'
        elif self.LATEST_PAT.match(path):
            # daily changefrequency and normal priority for current files
            item['priority'] = '0.5'
            item['changefreq'] = 'daily'
        else:
            # These are unversioned documents:
            # daily changefrequency and highest priority for current files
            item['priority'] = '1.0'
            item['changefreq'] = 'daily'

        if 'Last-Modified' in response.headers:
            timestamp = response.headers['Last-Modified']
        else:
            timestamp = response.headers['Date']
        lastmod = time.strptime(timestamp, "%a, %d %b %Y %H:%M:%S %Z")
        item['lastmod'] = time.strftime("%Y-%m-%dT%H:%M:%S%z", lastmod)
        return item
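# Both SitemapSpider variants take their target via spider arguments (domain,
# urls), so they can be launched with
# `scrapy crawl sitemap -a domain=docs.openstack.org` or driven from a script.
# A minimal sketch of the scripted form; the FEEDS setting and output filename
# are assumptions for illustration (FEEDS requires Scrapy >= 2.1), not part of
# the original project:

if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        'FEEDS': {'sitemap_items.json': {'format': 'json'}},
    })
    process.crawl(SitemapSpider, domain='docs.openstack.org')
    process.start()  # Blocks until the crawl finishes.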