def parse_item(self, response):
    item = Item()
    l = ItemLoader(item=item, response=response)
    for a in self.config["attributes"]:
        item.fields[a["name"]] = scrapy.Field()
        processors = []
        if "processors" in a:
            for p in a["processors"]:
                if p == "join":
                    processors.append(Join())
                elif p == "strip":
                    processors.append(MapCompose(str.strip))
        kwargs = {}
        if "regex" in a:
            kwargs["re"] = a["regex"]
        l.add_css(a["name"], a["selector"], *processors, **kwargs)
    item.fields["url"] = scrapy.Field()
    l.add_value("url", response.url)
    return l.load_item()
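# --- Illustration (not part of the original example) -----------------------
# A minimal sketch of the configuration that parse_item() above appears to
# expect. Only the key names ("attributes", "name", "selector", "processors",
# "regex") are taken from the code; every value below is a made-up placeholder.
config = {
    "attributes": [
        {
            "name": "title",
            "selector": "h1.product-title::text",
            "processors": ["strip", "join"],   # mapped to MapCompose(str.strip) / Join()
        },
        {
            "name": "price",
            "selector": "span.price::text",
            "regex": r"\d+\.\d+",              # forwarded to add_css() as re=...
        },
    ]
}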
def generate_item(self, data, refer_id):
    """Convert the incoming dict-type ``data`` into an item."""
    item = Item()
    for key, value in data.items():
        item.fields[key] = Field()
        item[key] = value
    return self.padding_item(item, refer_id)
def process_item(self, item: Item, spider: Spider) -> Item:
    """
    Save the whole html page to a text file.

    Parameters
    ----------
    item: Item.
        The scraped item, ie the full web page + meta data.
    spider: Spider.
        The spider, one per document type.

    Returns
    -------
    out: Item.
        The input item, unscathed.
    """
    __provider = ''.join(item.get('provider', ['none']))
    __text = ''.join(item.get('text', ['']))
    __file_path = os.path.join(
        self._path,
        getattr(spider, 'name', 'default'),
        __provider + '.html')
    with open(__file_path, 'w') as __file:
        __file.write(__text)
    return item
def parse_art(self, response):
    """
    parse_art

    This function extracts the data relevant to an art work.
    ('url', 'title', 'image', 'height', 'width', 'description') will be
    single valued. ('artist', 'path') can be a list.
    """
    item = Item()
    item_loader = ItemLoader(item=item, response=response)
    items_list = ('url', 'title', 'image', 'height', 'width', 'description')
    for name in items_list:
        item.fields[name] = Field(output_processor=TakeFirst())
    item.fields['artist'] = Field()
    item.fields['path'] = Field()
    item_loader.add_value('url', response.meta['url'])
    item_loader.add_xpath('artist', '//*[@id="content"]/h2/text()')
    item_loader.add_xpath('title', '//*[@id="content"]/h1/text()')
    item_loader.add_xpath('image', '//*[@id="body"]/img/@src',
                          MapCompose(lambda x: urljoin(self.base_url, x)))
    item_loader.add_xpath(
        'height', '//*[@id="content"]/dl/dd[3]/text()',
        MapCompose(lambda x: self.extract_physical_dimension(x, type='height')))
    item_loader.add_xpath(
        'width', '//*[@id="content"]/dl/dd[3]/text()',
        MapCompose(lambda x: self.extract_physical_dimension(x, type='width')))
    item_loader.add_xpath('description', '//*[@id="content"]/div/p/text()')
    item_loader.add_value('path', response.meta['browse_path'])
    return item_loader.load_item()
def parse_review(self, response):
    # use a loader variable name that does not shadow the Item class
    loader = ItemLoader(Review(), response)
    loader.add_xpath('titulo', '//h1/text()')
    loader.add_xpath(
        'calificacion',
        '//span[@class="side-wrapper side-wrapper hexagon-content"]/text()'
    )
    yield loader.load_item()
def parse(self, response):
    item = Item()
    l = ItemLoader(item=item, response=response)
    for name, xpath in response.meta['fields'].items():
        if xpath:
            item.fields[name] = Field()
            l.add_xpath(name, xpath)
    return l.load_item()
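# --- Illustration (not part of the original example) -----------------------
# The parse() callback above reads a {field name: XPath} mapping from
# response.meta['fields'] and declares one Item field per entry. A request
# might carry that mapping like this; the URL and XPaths are hypothetical.
import scrapy

fields = {
    'title': '//h1/text()',
    'price': '//span[@class="price"]/text()',
}
request = scrapy.Request('https://example.com/product/1',
                         meta={'fields': fields})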
def parse(self, response):
    item = Item()
    l = ItemLoader(item=item, response=response)
    for name, xpath in response.meta['fields'].iteritems():
        if xpath:
            # dynamically add a new field
            item.fields[name] = Field()
            l.add_xpath(name, xpath, MapCompose(unicode.strip, unicode.title))
    return l.load_item()
def parse(self, response):
    item = Item()
    l = ItemLoader(item=item, response=response)
    for name, xpath in response.meta['fields'].iteritems():
        if xpath:
            item.fields[name] = Field()
            l.add_xpath(name, xpath)
    return l.load_item()
def __init__(self, items, spider):
    Item.__init__(self)
    for k, v in items.items():
        self[k] = v
    self[SPIDER] = {
        'name': spider.name,
        'site': spider.conf_name,
        'ctime': int(time.time()),
    }
def copy_item(from_item, to_item=None):
    if to_item is None:
        to_item = Item()
    if hasattr(from_item, 'fields'):
        to_item.fields = from_item.fields
    for key, value in from_item.items():
        if key not in to_item:
            to_item.fields[key] = {}
        to_item[key] = value
    return to_item
def __init__(self, items, spider):
    Item.__init__(self)
    for k, v in items.items():
        self[k] = v
    self[SPIDER] = {
        'name': spider.name,
        'cfg': spider.config_file,
        'ctime': int(time.time()),
        'site': spider.config.get('name'),
        'media_type': spider.config.get(MEDIA_TYPE),
    }
def parse_content(self, response):
    item = Item()
    I = StoreLoader(item=item, response=response)
    for name, xpath in response.meta['field'].iteritems():
        if xpath:
            # dynamically create an item field
            item.fields[name] = Field()
            I.add_xpath(name, xpath)
        else:
            print 'Please add the corresponding matching rule!'
    yield I.load_item()
def parse_item(self, response):
    # response holds the job-description page
    item = Item()  # manually declaring an item, which has ordered dict format
    item_container = ItemLoader(item=item, response=response)
    # the first row of the csv file is treated as the field name,
    # the following rows as xpaths
    for name, xpath in response.meta['fields'].items():
        if xpath:
            # initializing the item fields manually
            item.fields[name] = Field()
            # adding an xpath to the item loader to obtain the final result
            item_container.add_xpath(name, xpath)
    # populating the item with the final response
    return item_container.load_item()
def test_process_spider_output_stats_legacy(self):
    # testing that a subclass not handling stats works at runtime
    # (i.e. that trying to update stats does not trigger an exception)
    class LegacyDeltaFetchSubClass(self.mwcls):

        def __init__(self, dir, reset=False, *args, **kwargs):
            super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
            self.something = True

    self._create_test_db()
    mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
    mw.spider_opened(self.spider)
    response = mock.Mock()
    response.request = Request('http://url', meta={'deltafetch_key': 'key'})
    result = []
    self.assertEqual(
        list(mw.process_spider_output(response, result, self.spider)), [])
    self.assertEqual(self.stats.get_stats(), {})
    result = [
        Request('http://url', meta={'deltafetch_key': 'key'}),
        Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
    ]

    # stats should not be updated
    self.assertEqual(
        list(mw.process_spider_output(response, result, self.spider)),
        [result[0]])
    self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)

    result = [Item(), "not a base item"]
    self.assertEqual(
        list(mw.process_spider_output(response, result, self.spider)),
        result)
    self.assertEqual(self.stats.get_value('deltafetch/stored'), None)
def test_hs_ext_item_scraped(hs_ext):
    hs_ext._write_item = mock.Mock()
    item = Item()
    spider = Spider('test')
    hs_ext.item_scraped(item, spider)
    assert hs_ext._write_item.call_count == 1
    assert hs_ext._write_item.call_args[0] == ({'_type': 'Item'}, )
def check_buzzwords(self, response):
    self.__class__.crawl_count += 1
    crawl_count = self.__class__.crawl_count

    wordlist = [
        "Creating",
        "Deploy",
        "COZMO",
    ]

    url = response.url
    # default to bytes so .decode() also works when the header is missing
    contenttype = response.headers.get("content-type", b"").decode('utf-8').lower()
    data = response.body.decode('utf-8')

    for word in wordlist:
        substrings = find_all_substrings(data, word)
        for pos in substrings:
            ok = False
            if not ok:
                self.__class__.words_found += 1
                print(word + ";" + url + "")
    return Item()
def test_missmatched_wildcard(self):
    v1 = 1
    v2 = 2
    item = Item()
    args = ([(self.makeitem({'t': v1}), ), (self.makeitem({'t': v2}), )], )
    feed(item, 'a', args, 'a__')
    self.assertEqual(isinstance(item['a'], list), True)
    self.assertEqual(len(item['a']), 2)
    self.assertEqual(
        isinstance(item['a'][0], tuple) and isinstance(item['a'][1], tuple),
        True)
    self.assertEqual(
        isinstance(item['a'][0][0], Item) and isinstance(item['a'][1][0], Item),
        True)
    self.assertEqual(item['a'][0][0]['t'], v1)
    self.assertEqual(item['a'][1][0]['t'], v2)
    feed(item, 'b__c', args, 'a__')
    self.assertEqual(isinstance(item['b'], list), True)
    self.assertEqual(len(item['b']), 2)
    self.assertEqual(
        isinstance(item['b'][0], Item) and isinstance(item['b'][1], Item),
        True)
    self.assertEqual(
        isinstance(item['b'][0]['c'], Item) and isinstance(item['b'][1]['c'], Item),
        True)
    self.assertEqual(item['b'][0]['c']['t'], v1)
    self.assertEqual(item['b'][1]['c']['t'], v2)
def check_buzzwords(self, response):
    self.__class__.crawl_count += 1
    crawl_count = self.__class__.crawl_count

    ##### Change the words ##########
    wordlist = [
        "phone",
        "hotel",
        "reservation",
        "booked",
    ]

    url = response.url
    # default to bytes so .decode() also works when the header is missing
    contenttype = response.headers.get("content-type", b"").decode('utf-8').lower()
    data = response.body.decode('utf-8')

    for word in wordlist:
        substrings = find_all_substrings(data, word)
        for pos in substrings:
            ok = False
            if not ok:
                self.__class__.words_found += 1
                print(word + ";" + url + ";")
    return Item()
def parse_item(self, response, loop, fields):
    hxs = HtmlXPathSelector(response)
    self.macro.update({'URL': response.url})

    for e in hxs.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=e)

        for k, v in fields.iteritems():
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contain "value" or "xpath"'.format(k),
                        level=log.WARNING)
                continue

            val = get_v_x(self.macro.expand(v_x),
                          utils.convert_type(v.get('parse', {})),
                          re=v.get('regex'))
            if not val and 'default' in v:
                val = self.macro.expand(v.get('default'))

            qry = v.get('filter', {})
            if utils.filter_data(qry, val):
                loader.add_value(k, val)
            else:
                break
        else:
            yield loader.load_item()
def test_isinstance_check(self):
    class SubclassedBaseItem(BaseItem):
        pass

    class SubclassedItem(Item):
        pass

    self.assertTrue(isinstance(BaseItem(), BaseItem))
    self.assertTrue(isinstance(SubclassedBaseItem(), BaseItem))
    self.assertTrue(isinstance(Item(), BaseItem))
    self.assertTrue(isinstance(SubclassedItem(), BaseItem))
    # make sure internal checks using the private _BaseItem class succeed
    self.assertTrue(isinstance(BaseItem(), _BaseItem))
    self.assertTrue(isinstance(SubclassedBaseItem(), _BaseItem))
    self.assertTrue(isinstance(Item(), _BaseItem))
    self.assertTrue(isinstance(SubclassedItem(), _BaseItem))
def parse(self, response): """ 根据返回的 response 进行数据解析 :param response: scrapy 框架返回的响应 """ item = Item() item_loader = ItemLoader(item=item, selector=response) for field in self.model_xpath: item.fields[field] = Field() if 'model_url' in field: item_loader.add_value(field, response.url) else: item_loader.add_xpath(field, self.model_xpath[field]) item = self.format_item(item_loader.load_item()) yield item
def parse_item(self, response):
    sel = Selector(response)
    items = []
    lists = sel.xpath('//tt[@class="i-emp"]').extract()
    for com in lists:
        item = Item()
        item['title'] = com
        items.append(item)
    return items
def test_no_deprecation_warning(self):
    """
    Make sure deprecation warnings are NOT logged whenever
    BaseItem subclasses are used.
    """
    class SubclassedItem(Item):
        pass

    with catch_warnings(record=True) as warnings:
        Item()
        SubclassedItem()
        _BaseItem()
        self.assertFalse(isinstance("foo", _BaseItem))
        self.assertFalse(isinstance("foo", Item))
        self.assertFalse(isinstance("foo", SubclassedItem))
        self.assertTrue(isinstance(_BaseItem(), _BaseItem))
        self.assertTrue(isinstance(Item(), Item))
        self.assertTrue(isinstance(SubclassedItem(), SubclassedItem))
        self.assertEqual(len(warnings), 0)
def build_item(self, page_element, parse_config, base_url):
    item = Item()
    item.fields['url'] = Field()
    item['url'] = base_url

    for key, xpath in parse_config.iteritems():
        if key not in item.fields:
            item.fields[key] = Field()
        # values prefixed with '!' are stored verbatim, not evaluated as xpaths
        if xpath.startswith('!'):
            item[key] = xpath[1:]
            continue
        if xpath:
            text = ''.join(page_element.xpath(xpath)).strip()
        else:
            text = ''
        if 'image' in key or 'url' in key:
            text = urlparse.urljoin(base_url, text)
        item[key] = text
    return item
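# --- Illustration (not part of the original example) -----------------------
# A possible parse_config for build_item() above. Only the conventions come
# from the code: a value starting with '!' is stored verbatim, any other
# non-empty value is treated as an XPath, and keys containing 'image' or
# 'url' get joined against base_url. Field names and XPaths are made up.
parse_config = {
    'title': '//h1/text()',
    'image': '//img[@id="cover"]/@src',
    'source': '!example-catalogue',
}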
def parse_node(self, response, node):
    log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))

    item = Item()
    item['id'] = node.select('@id').extract()
    item['name'] = node.select('name').extract()
    item['description'] = node.select('description').extract()
    return item
def test_iterate_spider_output(self):
    i = Item()
    r = Request('http://scrapytest.org')
    o = object()
    self.assertEqual(list(iterate_spider_output(i)), [i])
    self.assertEqual(list(iterate_spider_output(r)), [r])
    self.assertEqual(list(iterate_spider_output(o)), [o])
    self.assertEqual(list(iterate_spider_output([r, i, o])), [r, i, o])
def parse(self, response):
    # lxml.html has no fromhtml(); fromstring() is the parsing entry point
    root = lxml.html.fromstring(response.text)
    items = []
    item = Item()
    item['url'] = response.url
    items.append(item)
    return items
def __init__(self, item=None):
    if item is None:
        Item.__init__(self)
        self['cid'] = 0
        self['name'] = ""
        self['caseCode'] = ""
        self['cardNum'] = ""
        self['businessEntity'] = ""
        self['courtName'] = ""
        self['areaName'] = ""
        self['gistId'] = ""
        self['regDate'] = ""
        self['gistUnit'] = ""
        self['duty'] = ""
        self['performance'] = ""
        self['disruptTypeName'] = ""
        self['publishDate'] = ""
    else:
        Item.__init__(self, item)
def parse_item(self, response):
    self.log('Hi, this is an item page! %s' % response.url)

    hxs = HtmlXPathSelector(response)
    item = Item()
    item['id'] = hxs.select('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
    item['name'] = hxs.select('//td[@id="item_name"]/text()').extract()
    item['description'] = hxs.select('//td[@id="item_description"]/text()').extract()
    return item
def find_words(self, response):
    url = response.url
    contenttype = response.headers.get("content-type", b"").decode("utf-8").lower()
    data = response.body.decode("utf-8")
    # iterate over a copy so removing matched words does not skip entries
    for w in list(self.words):
        if w in data:
            print(w + ":" + url)
            self.words.remove(w)
    return Item()
def test_drop_item_as_single_value(self):
    value = 1
    item = Item()
    output_pack = ((value, ), )
    feed(item, 'a__b', output_pack, 'e__f')
    self.assertEqual(isinstance(item['a'], Item), value)
    self.assertEqual(item['a']['b'], value)
    feed(item, 'a__c', ((DropItem(), ), ), 'e__g')
    self.assertEqual(item['a'], None)
def test_dictitem_deprecation_warning(self):
    """Make sure the DictItem deprecation warning is not issued for Item."""
    with catch_warnings(record=True) as warnings:
        item = Item()
        self.assertEqual(len(warnings), 0)

        class SubclassedItem(Item):
            pass

        subclassed_item = SubclassedItem()
        self.assertEqual(len(warnings), 0)
def __init__(self, item=None):
    if item is None:
        Item.__init__(self)
        self["cid"] = 0
        self["name"] = ""
        self["caseCode"] = ""
        self["cardNum"] = ""
        self["businessEntity"] = ""
        self["courtName"] = ""
        self["areaName"] = ""
        self["gistId"] = ""
        self["regDate"] = ""
        self["gistUnit"] = ""
        self["duty"] = ""
        self["performance"] = ""
        self["disruptTypeName"] = ""
        self["publishDate"] = ""
    else:
        Item.__init__(self, item)
def test_isinstance_check(self):
    class SubclassedBaseItem(BaseItem):
        pass

    class SubclassedItem(Item):
        pass

    with catch_warnings():
        filterwarnings("ignore", category=ScrapyDeprecationWarning)
        self.assertTrue(isinstance(BaseItem(), BaseItem))
        self.assertTrue(isinstance(SubclassedBaseItem(), BaseItem))
        self.assertTrue(isinstance(Item(), BaseItem))
        self.assertTrue(isinstance(SubclassedItem(), BaseItem))
        # make sure internal checks using the private _BaseItem class succeed
        self.assertTrue(isinstance(BaseItem(), _BaseItem))
        self.assertTrue(isinstance(SubclassedBaseItem(), _BaseItem))
        self.assertTrue(isinstance(Item(), _BaseItem))
        self.assertTrue(isinstance(SubclassedItem(), _BaseItem))
def __init__(self, item=None):
    if item is None:
        Item.__init__(self)
        self["cid"] = 0
        self["name"] = ""
        self["caseCode"] = ""
        self["age"] = ""
        self["sex"] = ""
        # self['focusNumber'] = ""
        self["cardNum"] = ""
        self["courtName"] = ""
        self["areaName"] = ""
        self["partyTypeName"] = ""
        self["gistId"] = ""
        self["regDate"] = ""
        self["gistUnit"] = ""
        self["duty"] = ""
        self["performance"] = ""
        self["disruptTypeName"] = ""
        self["publishDate"] = ""
    else:
        Item.__init__(self, item)
def __init__(self, item=None):
    if item is None:
        Item.__init__(self)
        self['cid'] = 0
        self['name'] = ""
        self['caseCode'] = ""
        self['age'] = ""
        self['sex'] = ""
        # self['focusNumber'] = ""
        self['cardNum'] = ""
        self['courtName'] = ""
        self['areaName'] = ""
        self['partyTypeName'] = ""
        self['gistId'] = ""
        self['regDate'] = ""
        self['gistUnit'] = ""
        self['duty'] = ""
        self['performance'] = ""
        self['disruptTypeName'] = ""
        self['publishDate'] = ""
    else:
        Item.__init__(self, item)
def __init__(self, **kargs):
    kargs.update(kind='bangumi')
    Item.__init__(self, **kargs)
def __init__(self, items, spider, **kwargs):
    Item.__init__(self)
    for k, v in items.items():
        self[k] = v
class FocraSpider(Spider):
    name = 'focras'
    '''
    To access scrapy's core API. Basically can modify anything in the 'crawler'.
    '''
    @classmethod
    def from_crawler(cls, crawler, **kwargs):
        print "focras - from crawler"
        spider = cls(stats=crawler.stats, settings=crawler.settings, **kwargs)
        crawler.signals.connect(spider.stopped, signals.engine_stopped)
        crawler.signals.connect(spider.idle, signals.spider_idle)
        return spider

    def __init__(self, stats=None, settings=None, **kwargs):
        super(FocraSpider, self).__init__(**kwargs)
        try:
            self.start_time = time.time()
            print 'focras init(' + self.cname + ') kwargs seeds ' + kwargs.get('seeds')
            print 'focras init(' + self.cname + ') kwargs template ' + self.template
            self.queue = Queue.Queue()
            self.queue_counter = 0
            self.queue_reload_counter = 0
            # to save the state of the pagination
            self.next_page_link = None
            self.end_of_data = False
            self.template = json.loads(self.template, object_pairs_hook=collections.OrderedDict)
            self.item = Item()
            self.pager = HTMLParser().unescape(self.pager)
            self.base_url = kwargs.get('seeds').split(',')
            self.crawled_pages = 0
            self.status = None
            self.lcam = None
            # non-chain crawlers don't have a queue, check for the pager only
            # chain crawler urls do not start with http
            if self.base_url[0].startswith('http'):
                # for request_url of chain crawler
                self.parentname = None
                if self.runtype == 'resume' and self.pager != 'null':
                    db = client['FocraDB']
                    collection = db['crawler']
                    cursor_focra = collection.find_one({'_id': self.cname})
                    self.base_url = [cursor_focra.get('next_page_link')]
                    self.crawled_pages = cursor_focra.get('crawled_pages')
                    self.start_time = self.start_time - cursor_focra.get('time_executed')
                    client.close()
                    print self.cname + " - Resume page is: " + self.base_url[0]
                    self.start_urls = self.base_url
                else:
                    print self.cname + " - Start page is: " + self.base_url[0]
                    self.start_urls = self.base_url
            else:
                # chain crawler
                # get parent and field info from seeds
                self.parentname = self.base_url.pop()
                self.fieldname = self.base_url.pop()
                # connect using the parent name and get the first 100 of the field name
                self.crawler_db = settings['CRAWLER_DB']
                db = client[self.crawler_db]
                collection = db[self.parentname]
                if self.runtype == 'resume':
                    db_focra = client['FocraDB']
                    cursor_focra = db_focra['crawler'].find_one({'_id': self.cname})
                    self.queue_counter = cursor_focra.get('queue_counter')
                    self.next_page_link = cursor_focra.get('next_page_link')
                    self.crawled_pages = cursor_focra.get('crawled_pages')
                    self.start_time = self.start_time - cursor_focra.get('time_executed')
                    print self.cname + " - Loading Queue from " + str(self.queue_counter)
                    cursor = collection.find({}, {self.fieldname: 1}).skip(self.queue_counter).limit(LINK_NUMBER)
                    self.queue_reload_counter = self.queue_reload_counter + LINK_NUMBER + self.queue_counter
                else:
                    cursor = collection.find({}, {self.fieldname: 1}).limit(LINK_NUMBER)
                    # set the queue reload counter
                    self.queue_reload_counter += LINK_NUMBER
                client.close()
                if cursor.count() <= self.queue_reload_counter:
                    print self.cname + '- No more links to load'
                    self.end_of_data = True
                # put it into the queue
                for link in cursor:
                    if link.get(self.fieldname):
                        soup = BeautifulSoup(link.get(self.fieldname))
                        # to see the links added to the queue
                        # print soup.a['href']
                        self.queue.put(soup.a['href'])
                # if resuming
                if self.next_page_link:
                    self.base_url = [self.next_page_link]
                    print self.cname + " - Resume page is: " + self.base_url[0]
                    self.start_urls = self.base_url
                else:
                    self.base_url = [self.queue.get()]
                    if self.queue_counter == 0:
                        self.queue_counter += 1
                        print self.cname + " - Start page is: " + self.base_url[0]
                    else:
                        print self.cname + " - Resume page is: " + self.base_url[0]
                    self.start_urls = self.base_url
        except Exception as error:
            print error

    # interrupted state, crawler status determined by views.py
    # it is stopped or paused
    def stopped(self):
        try:
            if self.runtype != 'complete':
                print self.cname + " - Stopped"
                db = client['FocraDB']
                collection = db['crawler']
                # chain crawler queue from parent crawler
                if self.queue_counter != 0:
                    collection.update({"_id": self.cname},
                                      {"$set": {'queue_counter': self.queue_counter,
                                                'crawled_pages': self.crawled_pages,
                                                'time_executed': time.time() - self.start_time}})
                    print self.cname + " - Saved queue counter is: " + str(self.queue_counter)
                # main or chained crawler pager state
                if self.pager != 'null' and self.next_page_link:
                    collection.update({"_id": self.cname},
                                      {"$set": {'next_page_link': self.next_page_link,
                                                'crawled_pages': self.crawled_pages,
                                                'time_executed': time.time() - self.start_time}})
                    print self.cname + " - Saved Page link is: " + str(self.next_page_link)
                client.close()
        except Exception as err:
            print err

    # closed gracefully, crawler status complete
    def idle(self):
        try:
            # crawl completed
            if self.status == 'running':
                db = client['FocraDB']
                collection = db['crawler']
                collection.update({"_id": self.cname},
                                  {"$set": {'crawlerAddr': '',
                                            'crawlerStatus': 'completed',
                                            'crawled_pages': self.crawled_pages,
                                            'time_executed': time.time() - self.start_time}})
                print self.cname + " - Crawl completed, closing gracefully"
                self.runtype = 'complete'
                client.close()
        except Exception as err:
            print err

    def parse(self, response):
        try:
            self.crawled_pages += 1
            db = client['FocraDB']
            db['crawler'].update({"_id": self.cname},
                                 {"$set": {'crawled_pages': self.crawled_pages,
                                           'time_executed': time.time() - self.start_time}})
            print self.cname + " - Parsing items"
            body = BeautifulSoup(response.body)
            for tag in body.find_all('a', href=True):
                if 'http' not in tag['href']:
                    tag['href'] = urljoin(self.base_url[0], tag['href'])
            for tag in body.find_all('img', src=True):
                if 'http' not in tag['src']:
                    tag['src'] = urljoin(self.base_url[0], tag['src'])
            for t in body.find_all('tbody'):
                t.unwrap()
            response = response.replace(body=body.prettify(encoding='ascii'))
            dynamicItemLoader = ItemLoader(item=self.item, response=response)
            if self.parentname is not None:
                self.item.clear()
                self.item.fields['request_url'] = Field()
                dynamicItemLoader.add_value("request_url", response.url)
            '''
            new codes
            '''
            r = None
            d = {}
            for k, v in self.template.iteritems():
                d[k] = v.split('/')
            lca = None
            if self.lcam:
                lca = self.lcam
            else:
                lca = self.longest_common_ancestor(d)
                self.lcam = lca
            print lca
            if lca:
                r = response.xpath(lca).extract()
            if r:
                if len(r) <= 1:
                    for key, value in self.template.iteritems():
                        self.item.fields[key] = Field()
                        dynamicItemLoader.add_xpath(key, value)
                else:
                    for i in range(len(r)):
                        # data region
                        # print r[i].encode('ascii', 'ignore')
                        sel = Selector(text=r[i])
                        for key, value in self.template.iteritems():
                            self.item.fields[key] = Field()
                            # print self.get_xpath_tail(lca, value)
                            x = sel.xpath(self.get_xpath_tail(lca, value)).extract()
                            x = ''.join(x)
                            if x.startswith('<a') or x.startswith('<img'):
                                dynamicItemLoader.add_value(key, x)
                            else:
                                sb = ""
                                for string in BeautifulSoup(x).stripped_strings:
                                    sb += "\n" + string
                                dynamicItemLoader.add_value(key, sb)
            else:
                for key, value in self.template.iteritems():
                    # print value
                    self.item.fields[key] = Field()
                    dynamicItemLoader.add_xpath(key, value)
            print "yielded dynamic loader"
            yield dynamicItemLoader.load_item()

            # after scraping the page, check status to see whether we should stop
            self.status = db['crawler'].find_one({"_id": self.cname}).get('crawlerStatus')
            if self.status == 'stopped' or self.status == 'paused':
                raise CloseSpider('stopped')

            # check for pagination
            if self.pager != 'null':
                next_link = None
                # if the pager is in html format
                if bool(BeautifulSoup(self.pager, "html.parser").find()):
                    # remove the \r for 'end of line' diff
                    self.pager = self.pager.replace('\r', '')
                    a_tags = response.xpath('//a').extract()
                    for tag in a_tags:
                        if self.pager in tag:
                            tag = BeautifulSoup(tag)
                            next_link = tag.a.get('href')
                            break
                # if the pager is in text format
                else:
                    if response.xpath('//a[text()[normalize-space()="' + self.pager + '"]]/@href').extract():
                        next_link = response.xpath('//a[text()[normalize-space()="' + self.pager + '"]]/@href').extract()[0]
                if next_link:
                    self.next_page_link = next_link
                    print self.cname + ' - Next page is: ' + self.next_page_link
                    print "yielded request top"
                    yield Request(self.next_page_link, callback=self.parse, dont_filter=True)
                else:
                    # chained crawler WITH pagination
                    # check for more links from the parent column
                    if not self.queue.empty():
                        k = self.queue.get()
                        print "yielded request middle ---" + k
                        yield Request(k, callback=self.parse, dont_filter=True)
                        self.queue_counter += 1
                        if self.queue.qsize() <= LINK_NUMBER and self.end_of_data == False:
                            self.check_queue()
            else:
                # chained crawler WITHOUT pagination
                # check for more links from the parent column
                if not self.queue.empty():
                    l = self.queue.get()
                    print "yielded request btm ---" + l
                    yield Request(l, callback=self.parse, dont_filter=True)
                    self.queue_counter += 1
                    if self.queue.qsize() <= LINK_NUMBER and self.end_of_data == False:
                        self.check_queue()
        except Exception as err:
            print err

    def check_queue(self):
        try:
            print self.cname + '- Reload counter ' + str(self.queue_reload_counter)
            print self.cname + '- Queue less than ' + str(LINK_NUMBER) + ', querying for more links'
            db = client[self.crawler_db]
            collection = db[self.parentname]
            cursor = collection.find({}, {self.fieldname: 1}).skip(self.queue_reload_counter).limit(LINK_NUMBER)
            client.close()
            self.queue_reload_counter += LINK_NUMBER
            # cursor count returns the total row count
            if cursor.count() <= self.queue_reload_counter:
                print self.cname + '- No more links to load'
                self.end_of_data = True
            # put it into the queue
            for link in cursor:
                if link.get(self.fieldname):
                    soup = BeautifulSoup(link.get(self.fieldname))
                    # uncomment below to see queue links
                    # print soup.a['href']
                    self.queue.put(soup.a['href'])
        except Exception as err:
            print err

    '''
    find the lowest common ancestor
    '''
    def longest_common_ancestor(self, d):
        if len(d) < 1:
            return None
        p = None
        for l in d.values():
            if p is None or len(l) < p:
                p = len(l)
        diff_index = None
        for i in range(p):
            check = None
            for v in d.itervalues():
                if check is None or check == v[i]:
                    check = v[i]
                elif check != v[i]:
                    diff_index = i
                    break
            if diff_index:
                break
        if diff_index:
            # return None if the root node is '/body', which is 2
            # return None if the root node is '/html', which is 1
            # return None if the root node is '/', which is 0
            if diff_index < 3:
                return None
            sb = ""
            for i in range(diff_index):
                if i != 0:
                    sb += "/" + d.values()[0][i]
            return sb
        return None

    def get_xpath_tail(self, lca, value):
        last = lca.split("/")
        return '//' + re.sub('[^A-Za-z]+', '', last[len(last) - 1]) + value.replace(lca, "", 1)
def __init__(self, items, ext=None):
    Item.__init__(self)
    for k, v in items.items():
        self[k] = v
    self['ext'] = ext
def __init__(self, stats=None, settings=None, **kwargs):
    super(FocraSpider, self).__init__(**kwargs)
    try:
        self.start_time = time.time()
        print 'focras init(' + self.cname + ') kwargs seeds ' + kwargs.get('seeds')
        print 'focras init(' + self.cname + ') kwargs template ' + self.template
        self.queue = Queue.Queue()
        self.queue_counter = 0
        self.queue_reload_counter = 0
        # to save the state of the pagination
        self.next_page_link = None
        self.end_of_data = False
        self.template = json.loads(self.template, object_pairs_hook=collections.OrderedDict)
        self.item = Item()
        self.pager = HTMLParser().unescape(self.pager)
        self.base_url = kwargs.get('seeds').split(',')
        self.crawled_pages = 0
        self.status = None
        self.lcam = None
        # non-chain crawlers don't have a queue, check for the pager only
        # chain crawler urls do not start with http
        if self.base_url[0].startswith('http'):
            # for request_url of chain crawler
            self.parentname = None
            if self.runtype == 'resume' and self.pager != 'null':
                db = client['FocraDB']
                collection = db['crawler']
                cursor_focra = collection.find_one({'_id': self.cname})
                self.base_url = [cursor_focra.get('next_page_link')]
                self.crawled_pages = cursor_focra.get('crawled_pages')
                self.start_time = self.start_time - cursor_focra.get('time_executed')
                client.close()
                print self.cname + " - Resume page is: " + self.base_url[0]
                self.start_urls = self.base_url
            else:
                print self.cname + " - Start page is: " + self.base_url[0]
                self.start_urls = self.base_url
        else:
            # chain crawler
            # get parent and field info from seeds
            self.parentname = self.base_url.pop()
            self.fieldname = self.base_url.pop()
            # connect using the parent name and get the first 100 of the field name
            self.crawler_db = settings['CRAWLER_DB']
            db = client[self.crawler_db]
            collection = db[self.parentname]
            if self.runtype == 'resume':
                db_focra = client['FocraDB']
                cursor_focra = db_focra['crawler'].find_one({'_id': self.cname})
                self.queue_counter = cursor_focra.get('queue_counter')
                self.next_page_link = cursor_focra.get('next_page_link')
                self.crawled_pages = cursor_focra.get('crawled_pages')
                self.start_time = self.start_time - cursor_focra.get('time_executed')
                print self.cname + " - Loading Queue from " + str(self.queue_counter)
                cursor = collection.find({}, {self.fieldname: 1}).skip(self.queue_counter).limit(LINK_NUMBER)
                self.queue_reload_counter = self.queue_reload_counter + LINK_NUMBER + self.queue_counter
            else:
                cursor = collection.find({}, {self.fieldname: 1}).limit(LINK_NUMBER)
                # set the queue reload counter
                self.queue_reload_counter += LINK_NUMBER
            client.close()
            if cursor.count() <= self.queue_reload_counter:
                print self.cname + '- No more links to load'
                self.end_of_data = True
            # put it into the queue
            for link in cursor:
                if link.get(self.fieldname):
                    soup = BeautifulSoup(link.get(self.fieldname))
                    # to see the links added to the queue
                    # print soup.a['href']
                    self.queue.put(soup.a['href'])
            # if resuming
            if self.next_page_link:
                self.base_url = [self.next_page_link]
                print self.cname + " - Resume page is: " + self.base_url[0]
                self.start_urls = self.base_url
            else:
                self.base_url = [self.queue.get()]
                if self.queue_counter == 0:
                    self.queue_counter += 1
                    print self.cname + " - Start page is: " + self.base_url[0]
                else:
                    print self.cname + " - Resume page is: " + self.base_url[0]
                self.start_urls = self.base_url
    except Exception as error:
        print error
def __init__(self, other=None):
    Item.__init__(self)
def __init__(self, **kargs):
    kargs.update(kind='user')
    Item.__init__(self, **kargs)
def __init__(self):
    Item.__init__(self)
    # initialize the item's fields (the original assigned unused local variables)
    self['cityid'] = 0
    self['url'] = ""
    self['name'] = ""
    self['parentcityid'] = 0