Example #1
    def parse_item(self, response):
        item = Item()
        l = ItemLoader(item=item, response=response)
        
        for a in self.config["attributes"]:
            item.fields[a["name"]] = scrapy.Field()
            
            processors = []
            if "processors" in a:
                for p in a["processors"]:
                    if p == "join":
                        processors.append(Join())
                    elif p == "strip":
                        processors.append(MapCompose(str.strip))

            kwargs = {}
            if "regex" in a:
                kwargs["re"] = a["regex"]

            l.add_css(a["name"], a["selector"], *processors, **kwargs)
        
        item.fields["url"] = scrapy.Field()
        l.add_value("url", response.url)

        return l.load_item()
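
The loop above is driven entirely by the spider's configuration. A minimal sketch of what such a config could look like, assuming only the keys the code reads ("attributes", "name", "selector", "processors", "regex"); the field names, selectors, and regex are purely illustrative:

config = {
    "attributes": [
        {"name": "title", "selector": "h1.product-title::text", "processors": ["strip"]},
        {"name": "tags", "selector": "ul.tags li::text", "processors": ["join"]},
        {"name": "price", "selector": "span.price::text", "regex": r"([\d.,]+)"},
    ]
}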
Example #2
 def generate_item(self, data, refer_id):
     """将传入的字典类型的data数据转换成item"""
     item = Item()
     for key, value in data.items():
         item.fields[key] = Field()
         item[key] = value
     return self.padding_item(item, refer_id)
Example #3
    def process_item(
            self,
            item: Item,
            spider: Spider) -> Item:
        """
        Save the whole html page to a text file.

        Parameters
        ----------
        item: Item.
            The scraped item, i.e. the full web page plus metadata.
        spider: Spider.
            The spider, one per document type.

        Returns
        -------
        out: Item.
            The input item, unscathed.
        """
        __provider = ''.join(item.get(
            'provider',
            ['none']))
        __text = ''.join(item.get(
            'text',
            ['']))
        __file_path = os.path.join(
            self._path,
            getattr(spider, 'name', 'default'),
            __provider + '.html')

        with open(__file_path, 'w') as __file:
            __file.write(__text)

        return item
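
Because this is an item pipeline, it only runs once it is enabled in the project settings. A minimal sketch, assuming the class above is named HtmlDumpPipeline and lives in myproject/pipelines.py (both names are hypothetical); the priority value is arbitrary:

# settings.py
ITEM_PIPELINES = {
    "myproject.pipelines.HtmlDumpPipeline": 300,
}

Note that open(..., 'w') does not create directories, so the <self._path>/<spider name> folder has to exist before the pipeline writes to it.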
Example #4
    def parse_art(self, response):
        """
        part_art
            This function will extract data relevant for a art work.
            ('url', 'title', 'image', 'height', 'width', 'description') will be single valued.
            ('artist', 'path') can be a list.
        """
        item = Item()
        item_loader = ItemLoader(item=item, response=response)
        items_list = ('url', 'title', 'image', 'height', 'width',
                      'description')
        for name in items_list:
            item.fields[name] = Field(output_processor=TakeFirst())
        item.fields['artist'] = Field()
        item.fields['path'] = Field()

        item_loader.add_value('url', response.meta['url'])
        item_loader.add_xpath('artist', '//*[@id="content"]/h2/text()')
        item_loader.add_xpath('title', '//*[@id="content"]/h1/text()')
        item_loader.add_xpath('image', '//*[@id="body"]/img/@src',
                              MapCompose(lambda x: urljoin(self.base_url, x)))
        item_loader.add_xpath(
            'height', '//*[@id="content"]/dl/dd[3]/text()',
            MapCompose(
                lambda x: self.extract_physical_dimension(x, type='height')))
        item_loader.add_xpath(
            'width', '//*[@id="content"]/dl/dd[3]/text()',
            MapCompose(
                lambda x: self.extract_physical_dimension(x, type='width')))
        item_loader.add_xpath('description', '//*[@id="content"]/div/p/text()')
        item_loader.add_value('path', response.meta['browse_path'])
        return item_loader.load_item()
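
extract_physical_dimension is a helper of the original spider whose implementation is not shown. A minimal sketch of what it might do, assuming the third <dd> cell holds text such as "50 x 70 cm" with the height listed first (both the format and the ordering are assumptions):

import re

def extract_physical_dimension(self, text, type='height'):
    # assumed format: "<height> x <width> cm"; the real project's parsing rules may differ
    match = re.search(r'([\d.,]+)\s*x\s*([\d.,]+)', text)
    if match is None:
        return None
    return match.group(1) if type == 'height' else match.group(2)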
Example #5
 def parse_review(self, response):
     Item = ItemLoader(Review(), response)
     Item.add_xpath('titulo', '//h1/text()')
     Item.add_xpath(
         'calificacion',
         '//span[@class="side-wrapper side-wrapper hexagon-content"]/text()'
     )
     yield Item.load_item()
Example #6
 def parse(self, response):
     item = Item()
     l = ItemLoader(item=item, response=response)
     for name, xpath in response.meta['fields'].items():
         if xpath:
             item.fields[name] = Field()
             l.add_xpath(name, xpath)
     return l.load_item()
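
The field map arrives through response.meta, so the request that reaches this callback has to carry it. A minimal sketch of how such a request might be built; the URL and XPaths are illustrative, not from the original project:

import scrapy

# a start_requests() method on the same spider (sketch)
def start_requests(self):
    fields = {
        "title": "//h1/text()",
        "author": '//span[@class="author"]/text()',
    }
    yield scrapy.Request("http://example.com/article/1",
                         meta={"fields": fields},
                         callback=self.parse)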
Example #7
 def parse(self, response):
     item = Item()
     l = ItemLoader(item=item, response=response)
     for name, xpath in response.meta['fields'].iteritems():
         if xpath:
             # dynamically add a new field
             item.fields[name] = Field()
             l.add_xpath(name, xpath, MapCompose(unicode.strip, unicode.title))
     return l.load_item()
Example #8
    def parse(self, response):
        item = Item()
        l = ItemLoader(item=item, response=response)
        for name, xpath in response.meta['fields'].iteritems():
            if xpath:
                item.fields[name] = Field()
                l.add_xpath(name, xpath)

        return l.load_item()
Example #9
    def __init__(self, items, spider):
        Item.__init__(self)
        for k, v in items.items():
            self[k] = v

        self[SPIDER] = {
            'name': spider.name,
            'site': spider.conf_name,
            'ctime': int(time.time()),
        }
Example #10
def copy_item(from_item, to_item=None):
    if to_item is None:
        to_item = Item()
    if hasattr(from_item, 'fields'):
        to_item.fields = from_item.fields
    for key, value in from_item.items():
        if key not in to_item:
            to_item.fields[key] = {}
        to_item[key] = value
    return to_item
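
A short usage sketch for copy_item; the field name and value are illustrative. It copies both the dynamically declared fields and the stored values onto a fresh Item:

src = Item()
src.fields['title'] = Field()
src['title'] = 'Mona Lisa'

dst = copy_item(src)
assert dst['title'] == 'Mona Lisa'
assert 'title' in dst.fields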
Example #11
    def __init__(self, items, spider):
        Item.__init__(self)
        for k, v in items.items():
            self[k] = v

        self[SPIDER] = {'name': spider.name,
                        'cfg': spider.config_file,
                        'ctime': int(time.time()),
                        'site': spider.config.get('name'),
                        'media_type': spider.config.get(MEDIA_TYPE)}
Example #12
 def parse_content(self, response):
     item = Item()
     I = StoreLoader(item=item, response=response)
     for name, xpath in response.meta['field'].iteritems():
         if xpath:
             # dynamically create a new item field
             item.fields[name] = Field()
             I.add_xpath(name, xpath)
         else:
             print 'Please add a matching rule for this field!'
     yield I.load_item()
Example #13
    def parse_item(self, response):
        # this callback receives the response of the job-description URL
        item = Item()
        # manually declaring an item, which has an ordered-dict format
        item_container = ItemLoader(item=item, response=response)

        for name, xpath in response.meta['fields'].items():
            # the first row of the CSV file is treated as field names and the following rows as XPaths
            if xpath:
                # initializing the item fields manually
                item.fields[name] = Field()
                # adding an XPath to the ItemLoader to obtain the final result
                item_container.add_xpath(name, xpath)
        # populating the item, which holds the final response data
        return item_container.load_item()
Example #14
    def test_process_spider_output_stats_legacy(self):
        # testing the subclass not handling stats works at runtime
        # (i.e. that trying to update stats does not trigger exception)
        class LegacyDeltaFetchSubClass(self.mwcls):
            def __init__(self, dir, reset=False, *args, **kwargs):
                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir,
                                                               reset=reset)
                self.something = True

        self._create_test_db()
        mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        result = []
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)), [])
        self.assertEqual(self.stats.get_stats(), {})
        result = [
            Request('http://url', meta={'deltafetch_key': 'key'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]

        # stats should not be updated
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)),
            [result[0]])
        self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)

        result = [Item(), "not a base item"]
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)),
            result)
        self.assertEqual(self.stats.get_value('deltafetch/stored'), None)
Example #15
def test_hs_ext_item_scraped(hs_ext):
    hs_ext._write_item = mock.Mock()
    item = Item()
    spider = Spider('test')
    hs_ext.item_scraped(item, spider)
    assert hs_ext._write_item.call_count == 1
    assert hs_ext._write_item.call_args[0] == ({'_type': 'Item'}, )
Example #16
    def check_buzzwords(self, response):

        self.__class__.crawl_count += 1

        crawl_count = self.__class__.crawl_count

        wordlist = [
            "Creating",
            "Deploy",
            "COZMO",
        ]

        url = response.url
        contenttype = response.headers.get("content-type",
                                           "").decode('utf-8').lower()
        data = response.body.decode('utf-8')

        for word in wordlist:
            substrings = find_all_substrings(data, word)
            for pos in substrings:
                ok = False
                if not ok:
                    self.__class__.words_found += 1
                    print(word + ";" + url + "")
        return Item()
Example #17
    def test_missmatched_wildcard(self):
        v1 = 1
        v2 = 2
        item = Item()
        args = ([(self.makeitem({'t': v1}), ), (self.makeitem({'t': v2}), )], )
        feed(item, 'a', args, 'a__')
        self.assertEqual(isinstance(item['a'], list), True)
        self.assertEqual(len(item['a']), 2)
        self.assertEqual(
            isinstance(item['a'][0], tuple)
            and isinstance(item['a'][1], tuple), True)
        self.assertEqual(
            isinstance(item['a'][0][0], Item)
            and isinstance(item['a'][1][0], Item), True)
        self.assertEqual(item['a'][0][0]['t'], v1)
        self.assertEqual(item['a'][1][0]['t'], v2)

        feed(item, 'b__c', args, 'a__')
        self.assertEqual(isinstance(item['b'], list), True)
        self.assertEqual(len(item['b']), 2)
        self.assertEqual(
            isinstance(item['b'][0], Item) and isinstance(item['b'][1], Item),
            True)
        self.assertEqual(
            isinstance(item['b'][0]['c'], Item)
            and isinstance(item['b'][1]['c'], Item), True)
        self.assertEqual(item['b'][0]['c']['t'], v1)
        self.assertEqual(item['b'][1]['c']['t'], v2)
Example #18
    def check_buzzwords(self, response):

        self.__class__.crawl_count += 1

        crawl_count = self.__class__.crawl_count

        ##### Change the words  ##########
        wordlist = [
            "phone",
            "hotel",
            "reservation",
            "booked",
        ]

        url = response.url
        contenttype = response.headers.get("content-type",
                                           "").decode('utf-8').lower()
        data = response.body.decode('utf-8')

        for word in wordlist:
            substrings = find_all_substrings(data, word)
            for pos in substrings:
                ok = False
                if not ok:
                    self.__class__.words_found += 1
                    print(word + ";" + url + ";")

        return Item()
Example #19
    def parse_item(self, response, loop, fields):
        hxs = HtmlXPathSelector(response)
        self.macro.update({'URL': response.url})

        for e in hxs.select(loop or '(//*)[1]'):
            loader = XPathItemLoader(item=Item(), selector=e)

            for k, v in fields.iteritems():
                if 'value' in v:
                    get_v_x = loader.get_value
                    v_x = v.get('value')
                elif 'xpath' in v:
                    get_v_x = loader.get_xpath
                    v_x = v.get('xpath')
                else:
                    log.msg(u'field [{}] should contain "value" or "xpath"'.
                            format(k),
                            level=log.WARNING)
                    continue

                val = get_v_x(self.macro.expand(v_x),
                              utils.convert_type(v.get('parse', {})),
                              re=v.get('regex'))

                if not val and 'default' in v:
                    val = self.macro.expand(v.get('default'))

                qry = v.get('filter', {})
                if utils.filter_data(qry, val):
                    loader.add_value(k, val)
                else:
                    break
            else:
                yield loader.load_item()
Example #20
    def test_isinstance_check(self):
        class SubclassedBaseItem(BaseItem):
            pass

        class SubclassedItem(Item):
            pass

        self.assertTrue(isinstance(BaseItem(), BaseItem))
        self.assertTrue(isinstance(SubclassedBaseItem(), BaseItem))
        self.assertTrue(isinstance(Item(), BaseItem))
        self.assertTrue(isinstance(SubclassedItem(), BaseItem))

        # make sure internal checks using private _BaseItem class succeed
        self.assertTrue(isinstance(BaseItem(), _BaseItem))
        self.assertTrue(isinstance(SubclassedBaseItem(), _BaseItem))
        self.assertTrue(isinstance(Item(), _BaseItem))
        self.assertTrue(isinstance(SubclassedItem(), _BaseItem))
Example #21
    def parse(self, response):
        """
        Parse the data from the returned response.
        :param response: the response returned by the Scrapy framework
        """
        item = Item()
        item_loader = ItemLoader(item=item, selector=response)
        for field in self.model_xpath:
            item.fields[field] = Field()
            if 'model_url' in field:
                item_loader.add_value(field, response.url)
            else:
                item_loader.add_xpath(field, self.model_xpath[field])

        item = self.format_item(item_loader.load_item())

        yield item
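
A hypothetical model_xpath mapping for the loop above: any field whose name contains 'model_url' is filled from response.url, everything else from its XPath. The field names and XPaths are illustrative only:

model_xpath = {
    'model_url': '',                                 # taken from response.url, not an XPath
    'model_name': '//h1/text()',
    'model_price': '//span[@class="price"]/text()',
}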
Example #22
 def parse_item(self, response):
   sel = Selector(response)
   items = []
   lists = sel.xpath('//tt[@class="i-emp"]').extract()
   for com in lists:
     item = Item()
     item['title'] = com
     items.append(item)
   return (items)
Example #23
    def test_no_deprecation_warning(self):
        """
        Make sure deprecation warnings are NOT logged whenever BaseItem subclasses are used.
        """
        class SubclassedItem(Item):
            pass

        with catch_warnings(record=True) as warnings:
            Item()
            SubclassedItem()
            _BaseItem()
            self.assertFalse(isinstance("foo", _BaseItem))
            self.assertFalse(isinstance("foo", Item))
            self.assertFalse(isinstance("foo", SubclassedItem))
            self.assertTrue(isinstance(_BaseItem(), _BaseItem))
            self.assertTrue(isinstance(Item(), Item))
            self.assertTrue(isinstance(SubclassedItem(), SubclassedItem))
            self.assertEqual(len(warnings), 0)
Example #24
 def build_item(self, page_element, parse_config, base_url):
     item = Item()
     item.fields['url'] = Field()
     item['url'] = base_url
     for key, xpath in parse_config.iteritems():
         if not item.fields.has_key(key):
             item.fields[key] = Field()
         if xpath.startswith('!'):
             item[key] = xpath[1:]
             continue
         if xpath:
             text = ''.join(page_element.xpath(xpath)).strip()
         else:
             text = ''
         if 'image' in key or 'url' in key:
             text = urlparse.urljoin(base_url, text)
         item[key] = text
     return item
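
A hypothetical parse_config for build_item: a value starting with '!' is stored literally (minus the '!'), and keys containing 'image' or 'url' are resolved against base_url. Keys and XPaths are illustrative only:

parse_config = {
    'title': '//h1/text()',
    'image': '//img[@class="cover"]/@src',  # joined with base_url because the key contains 'image'
    'source': '!manually-set-value',        # leading '!' stores the remaining text verbatim
}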
Example #25
    def parse_node(self, response, node):
        log.msg('Hi, this is a <%s> node!: %s' %
                (self.itertag, ''.join(node.extract())))

        item = Item()
        item['id'] = node.select('@id').extract()
        item['name'] = node.select('name').extract()
        item['description'] = node.select('description').extract()
        return item
Example #26
    def test_iterate_spider_output(self):
        i = Item()
        r = Request('http://scrapytest.org')
        o = object()

        self.assertEqual(list(iterate_spider_output(i)), [i])
        self.assertEqual(list(iterate_spider_output(r)), [r])
        self.assertEqual(list(iterate_spider_output(o)), [o])
        self.assertEqual(list(iterate_spider_output([r, i, o])), [r, i, o])
Example #27
    def parse(self, response):
        root = lxml.html.fromstring(response.text)  # lxml.html has no fromhtml(); fromstring() parses the page HTML

        items = []

        item = Item()
        item['url'] = response.url  # store the URL string rather than the Response object
        items.append(item)
        
        return items
Example #28
 def __init__(self, item=None):
     if item is None:
         Item.__init__(self)
         self['cid'] = 0
         self['name'] = ""
         self['caseCode'] = ""
         self['cardNum'] = ""
         self['businessEntity'] = ""
         self['courtName'] = ""
         self['areaName'] = ""
         self['gistId'] = ""
         self['regDate'] = ""
         self['gistUnit'] = ""
         self['duty'] = ""
         self['performance'] = ""
         self['disruptTypeName'] = ""
         self['publishDate'] = ""
     else:
         Item.__init__(self, item)
Example #29
    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)

        hxs = HtmlXPathSelector(response)
        item = Item()
        item['id'] = hxs.select('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = hxs.select('//td[@id="item_name"]/text()').extract()
        item['description'] = hxs.select(
            '//td[@id="item_description"]/text()').extract()
        return item
Example #30
 def find_words(self, response):
     url = response.url
     contenttype = response.headers.get("content-type",
                                        "").decode("utf-8").lower()
     data = response.body.decode("utf-8")
     # iterate over a copy: removing from self.words while looping over it directly would skip entries
     for w in list(self.words):
         if w in data:
             print(w + ":" + url)
             self.words.remove(w)
     return Item()
Example #31
    def test_drop_item_as_single_value(self):
        value = 1
        item = Item()
        output_pack = ((value, ), )
        feed(item, 'a__b', output_pack, 'e__f')
        self.assertEqual(isinstance(item['a'], Item), value)
        self.assertEqual(item['a']['b'], value)

        feed(item, 'a__c', ((DropItem(), ), ), 'e__g')
        self.assertEqual(item['a'], None)
Example #32
 def test_dictitem_deprecation_warning(self):
     """Make sure the DictItem deprecation warning is not issued for
     Item"""
     with catch_warnings(record=True) as warnings:
         item = Item()
         self.assertEqual(len(warnings), 0)
         class SubclassedItem(Item):
             pass
         subclassed_item = SubclassedItem()
         self.assertEqual(len(warnings), 0)
Example #33
 def __init__(self, item=None):
     if item is None:
         Item.__init__(self)
         self["cid"] = 0
         self["name"] = ""
         self["caseCode"] = ""
         self["cardNum"] = ""
         self["businessEntity"] = ""
         self["courtName"] = ""
         self["areaName"] = ""
         self["gistId"] = ""
         self["regDate"] = ""
         self["gistUnit"] = ""
         self["duty"] = ""
         self["performance"] = ""
         self["disruptTypeName"] = ""
         self["publishDate"] = ""
     else:
         Item.__init__(self, item)
Example #34
    def test_isinstance_check(self):
        class SubclassedBaseItem(BaseItem):
            pass

        class SubclassedItem(Item):
            pass

        with catch_warnings():
            filterwarnings("ignore", category=ScrapyDeprecationWarning)
            self.assertTrue(isinstance(BaseItem(), BaseItem))
            self.assertTrue(isinstance(SubclassedBaseItem(), BaseItem))
            self.assertTrue(isinstance(Item(), BaseItem))
            self.assertTrue(isinstance(SubclassedItem(), BaseItem))

            # make sure internal checks using private _BaseItem class succeed
            self.assertTrue(isinstance(BaseItem(), _BaseItem))
            self.assertTrue(isinstance(SubclassedBaseItem(), _BaseItem))
            self.assertTrue(isinstance(Item(), _BaseItem))
            self.assertTrue(isinstance(SubclassedItem(), _BaseItem))
Example #35
 def __init__(self, item=None):
     if item is None:
         Item.__init__(self)
         self["cid"] = 0
         self["name"] = ""
         self["caseCode"] = ""
         self["age"] = ""
         self["sex"] = ""
         # self['focusNumber'] = ""
         self["cardNum"] = ""
         self["courtName"] = ""
         self["areaName"] = ""
         self["partyTypeName"] = ""
         self["gistId"] = ""
         self["regDate"] = ""
         self["gistUnit"] = ""
         self["duty"] = ""
         self["performance"] = ""
         self["disruptTypeName"] = ""
         self["publishDate"] = ""
     else:
         Item.__init__(self, item)
Example #36
 def __init__(self, item=None):
     if item is None:
         Item.__init__(self)
         self['cid'] = 0
         self['name'] = ""
         self['caseCode'] = ""
         self['age'] = ""
         self['sex'] = ""
         # self['focusNumber'] = ""
         self['cardNum'] = ""
         self['courtName'] = ""
         self['areaName'] = ""
         self['partyTypeName'] = ""
         self['gistId'] = ""
         self['regDate'] = ""
         self['gistUnit'] = ""
         self['duty'] = ""
         self['performance'] = ""
         self['disruptTypeName'] = ""
         self['publishDate'] = ""
     else:
         Item.__init__(self, item)
Example #37
 def __init__(self, **kargs):
     kargs.update(kind='bangumi')
     Item.__init__(self, **kargs)
Example #38
 def __init__(self, items, spider, **kwargs):
     Item.__init__(self)
     for k, v in items.items():
         self[k] = v
Example #39
class FocraSpider(Spider):
	name = 'focras'
	'''
	To access scrapy's core API. basically can modify anything in the 'crawler'
	'''
	@classmethod
	def from_crawler(cls, crawler, **kwargs):
		print "focras - from crawler"
		spider = cls(stats=crawler.stats, settings=crawler.settings, **kwargs)
		crawler.signals.connect(spider.stopped, signals.engine_stopped)
		crawler.signals.connect(spider.idle, signals.spider_idle)
		return spider
	
	def __init__(self, stats=None, settings=None, **kwargs):
		super(FocraSpider, self).__init__(**kwargs)
		try:
			self.start_time = time.time()
			print 'focras init(' + self.cname + ') kwargs seeds ' + kwargs.get('seeds')
			print 'focras init(' + self.cname + ') kwargs template '+ self.template
			self.queue = Queue.Queue()
			self.queue_counter = 0
			self.queue_reload_counter = 0
			# to save the state of the pagination
			self.next_page_link = None
			self.end_of_data = False
			self.template = json.loads(self.template, object_pairs_hook=collections.OrderedDict)
			self.item = Item()
			self.pager = HTMLParser().unescape(self.pager)
			self.base_url = kwargs.get('seeds').split(',')
			self.crawled_pages = 0
			self.status = None
			self.lcam = None
			
			# non chain crawler dont have a queue, check for pager only
			# chain crawler url does not start with http
			if self.base_url[0].startswith('http'):
				# for request_url of chain crawler
				self.parentname = None
				if self.runtype == 'resume' and self.pager != 'null':
					db = client['FocraDB']
					collection = db['crawler']
					cursor_focra = collection.find_one({'_id':self.cname})
					self.base_url = [cursor_focra.get('next_page_link')]
					self.crawled_pages = cursor_focra.get('crawled_pages')
					self.start_time = self.start_time - cursor_focra.get('time_executed')
					client.close()
					print self.cname + " - Resume page is: " + self.base_url[0]
					self.start_urls = self.base_url
				else:
					print self.cname + " - Start page is: " + self.base_url[0]
					self.start_urls = self.base_url
			else:
				# chain crawler
				# get parent and field info from seeds
				self.parentname = self.base_url.pop()
				self.fieldname = self.base_url.pop()
				# connect using parent name and get first 100 of the field name
				self.crawler_db = settings['CRAWLER_DB']
				db = client[self.crawler_db]
				collection = db[self.parentname]
				if self.runtype == 'resume':
					db_focra = client['FocraDB']
					cursor_focra = db_focra['crawler'].find_one({'_id': self.cname})
					self.queue_counter = cursor_focra.get('queue_counter')
					self.next_page_link = cursor_focra.get('next_page_link')
					self.crawled_pages = cursor_focra.get('crawled_pages')
					self.start_time = self.start_time - cursor_focra.get('time_executed')
					print self.cname + " - Loading Queue from " + str(self.queue_counter)
					cursor = collection.find({}, {self.fieldname: 1}).skip(self.queue_counter).limit(LINK_NUMBER)
					self.queue_reload_counter = self.queue_reload_counter + LINK_NUMBER + self.queue_counter
				else:
					cursor = collection.find({}, {self.fieldname: 1}).limit(LINK_NUMBER)
					# set the queue reload counter
					self.queue_reload_counter += LINK_NUMBER
				client.close()
				
				if cursor.count() <= self.queue_reload_counter:
					print self.cname + '- No more links to load'
					self.end_of_data = True
						
				# put it into queue
				for link in cursor:
					if link.get(self.fieldname):
						soup = BeautifulSoup(link.get(self.fieldname))
						# to see the links added to queue
						#print soup.a['href']
						self.queue.put(soup.a['href'])
				
				# if resume
				if self.next_page_link:
					self.base_url = [self.next_page_link]
					print self.cname + " - Resume page is: " + self.base_url[0]
					self.start_urls = self.base_url
				else:
					self.base_url = [self.queue.get()]
					if self.queue_counter == 0:
						self.queue_counter += 1
						print self.cname + " - Start page is: " + self.base_url[0]
					else:
						print self.cname + " - Resume page is: " + self.base_url[0]
					self.start_urls = self.base_url
		except Exception as error:
			print error
	
	# interrupted state, crawler status determined by views.py
	# it is stopped or paused
	def stopped(self):
		try:
			if self.runtype != 'complete':
				print self.cname + " - Stopped"
				db = client['FocraDB']
				collection = db['crawler']
				# chain crawler queue from parent crawler
				if self.queue_counter != 0:
					collection.update({"_id": self.cname}, {"$set":{'queue_counter': self.queue_counter, 
																 	'crawled_pages': self.crawled_pages,
																 	'time_executed': time.time() - self.start_time}})
					print self.cname + " - Saved queue counter is: " + str(self.queue_counter)
				# main or chained crawler pager state
				if self.pager != 'null' and self.next_page_link:
					collection.update({"_id": self.cname}, {"$set":{'next_page_link': self.next_page_link,
																 	'crawled_pages': self.crawled_pages,
																 	'time_executed': time.time() - self.start_time}})
					print self.cname + " - Saved Page link is: " + str(self.next_page_link)
				client.close()
		except Exception as err:
			print err
	
	# closed gracefully, crawler status complete
	def idle(self):
		try:
			# crawl completed
			if self.status == 'running':
				db = client['FocraDB']
				collection = db['crawler']
				collection.update({"_id": self.cname}, {"$set":{'crawlerAddr': '',
																'crawlerStatus': 'completed',
																'crawled_pages': self.crawled_pages,
																'time_executed': time.time() - self.start_time}})
				print self.cname + " - Crawl completed, closing gracefully"
				self.runtype = 'complete'
				client.close()
		except Exception as err:
			print err
			
	def parse(self, response):		
		try:
			self.crawled_pages += 1
			db = client['FocraDB']
			db['crawler'].update({"_id": self.cname}, {"$set":{'crawled_pages': self.crawled_pages,
																'time_executed': time.time()-self.start_time}})
			print self.cname + " - Parsing items"
			body = BeautifulSoup(response.body)
			
			for tag in body.find_all('a', href=True):
				if 'http' not in tag['href']:
					tag['href'] = urljoin(self.base_url[0], tag['href'])
			for tag in body.find_all('img', src=True):
				if 'http' not in tag['src']:
					tag['src'] = urljoin(self.base_url[0], tag['src'])
			for t in body.find_all('tbody'):
				t.unwrap()
			
			response = response.replace(body=body.prettify(encoding='ascii'))
			
			dynamicItemLoader = ItemLoader(item=self.item, response=response)

			if self.parentname is not None:
				self.item.clear()
				self.item.fields['request_url'] = Field()
				dynamicItemLoader.add_value("request_url", response.url)

			'''
			new codes
			'''
			r = None
			d = {}
			for k, v in self.template.iteritems():
				d[k] = v.split('/')

			lca = None
			if self.lcam:
				lca = self.lcam
			else:
				lca = self.longest_common_ancestor(d)
				self.lcam = lca
				print lca
			
			if lca:
				r = response.xpath(lca).extract()				
				if r:
					if len(r) <= 1:
						for key, value in self.template.iteritems():
							self.item.fields[key] = Field()
							dynamicItemLoader.add_xpath(key, value)
					else:
						for i in range(len(r)):	
							# data region
							#print r[i].encode('ascii', 'ignore')
							sel = Selector(text=r[i])
							
							for key, value in self.template.iteritems():
								
								self.item.fields[key] = Field()
								
								#print self.get_xpath_tail(lca, value)
								
								x = sel.xpath(self.get_xpath_tail(lca, value)).extract()
								
								x = ''.join(x)
								if x.startswith('<a') or x.startswith('<img'):
									dynamicItemLoader.add_value(key, x)
								else:
									sb = ""
									for string in BeautifulSoup(x).stripped_strings:
										sb += "\n" + string
									dynamicItemLoader.add_value(key, sb)
								
			else:
				for key, value in self.template.iteritems():
					#print value
					self.item.fields[key] = Field()
					dynamicItemLoader.add_xpath(key, value)
			
			print "yielded dynamic loader"
			yield dynamicItemLoader.load_item()
			
			# after scraping the page, check status to see whether we should stop
			self.status = db['crawler'].find_one({"_id":self.cname}).get('crawlerStatus')
			if self.status == 'stopped' or self.status == 'paused':
				raise CloseSpider('stopped')
			
			# check for pagination
			if self.pager != 'null':
				next_link = None
				# if the pager is in html format
				if bool(BeautifulSoup(self.pager, "html.parser").find()):
					# remove the \r for 'end of line' diff
					self.pager = self.pager.replace('\r', '')
					a_tags = response.xpath('//a').extract()
					for tag in a_tags:
						if self.pager in tag:
							tag = BeautifulSoup(tag)
							next_link = tag.a.get('href')
							break
				# if the pager is in text format
				else:
					if response.xpath('//a[text()[normalize-space()="'+ self.pager +'"]]/@href').extract():
						next_link = response.xpath('//a[text()[normalize-space()="'+ self.pager +'"]]/@href').extract()[0]
					
				
				if next_link:
					self.next_page_link = next_link
					print self.cname + ' - Next page is: ' + self.next_page_link
					print "yielded request top"
					yield Request(self.next_page_link, callback=self.parse, dont_filter=True)
					
				else:
					# chained crawler WITH pagination
					# check for more links from parent column
					if not self.queue.empty():
						k = self.queue.get()
						print "yielded request middle ---"+k
						yield Request(k, callback=self.parse, dont_filter=True)
						self.queue_counter += 1
						if self.queue.qsize() <= LINK_NUMBER and self.end_of_data == False:
							self.check_queue()
			else:
				# chained crawler WITHOUT pagination
				# check for more links from parent column
				if not self.queue.empty():
					l = self.queue.get()
					print "yielded request btm ---"+l
					yield Request(l, callback=self.parse, dont_filter=True)
					self.queue_counter += 1
					if self.queue.qsize() <= LINK_NUMBER and self.end_of_data == False:
						self.check_queue()
						
		except Exception as err:
			print err

	def check_queue(self):
		try:
			print self.cname + '- Reload counter ' + str(self.queue_reload_counter)
			print self.cname + '- Queue less than ' + str(LINK_NUMBER) + ', querying for more links'
			db = client[self.crawler_db]
			collection = db[self.parentname]
			cursor = collection.find({}, {self.fieldname: 1}).skip(self.queue_reload_counter).limit(LINK_NUMBER)
			client.close()
			self.queue_reload_counter += LINK_NUMBER
			# cursor count returns the total row
			if cursor.count() <= self.queue_reload_counter:
				print self.cname + '- No more links to load'
				self.end_of_data = True
			# put it into queue
			for link in cursor:
				if link.get(self.fieldname):
					soup = BeautifulSoup(link.get(self.fieldname))
					# uncomment below to see queue links
					#print soup.a['href']
					self.queue.put(soup.a['href'])
		except Exception as err:
			print err	
	
	'''
	find the lowest common ancestor
	'''
	def longest_common_ancestor(self, d):
		
		if len(d) < 1:
			return None
		
		p = None
		for l in d.values():
			if p is None or len(l) < p:
				p = len(l)
	
		diff_index = None
		
		for i in range(p):
			check = None
			for v in d.itervalues():
				if check is None or check == v[i]:
					check = v[i]
				elif check != v[i]:
					diff_index = i
					break
			if diff_index:
				break
					
		if diff_index:
			# return None if root note is '/body' which is 2
			# return None if root note is '/html' which is 1
			# return None if root note is '/'  which is 0
			if diff_index < 3:
				return None
			sb = ""
			for i in range(diff_index):
				if i != 0:	
					sb += "/" + d.values()[0][i]
			return sb
		
		return None
	
	def get_xpath_tail(self, lca, value):
		last = lca.split("/")
		return '//' + re.sub('[^A-Za-z]+', '', last[len(last)-1]) + value.replace(lca, "", 1)
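
A worked example of get_xpath_tail with assumed inputs, showing how it strips the common-ancestor prefix from a field XPath and re-roots the remainder with '//' so it can be applied inside each extracted data region:

# assumed values, for illustration only
lca = '/html/body/div'
value = '/html/body/div/span[2]/text()'
# lca.split('/')[-1] is 'div'; non-letter characters are stripped from it,
# the lca prefix is removed from value, and the pieces are joined:
# get_xpath_tail(lca, value) -> '//div/span[2]/text()'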
Example #40
    def __init__(self, items, ext=None):
        Item.__init__(self)
        for k, v in items.items():
            self[k] = v

        self['ext'] = ext
Example #41
	def __init__(self, stats=None, settings=None, **kwargs):
		super(FocraSpider, self).__init__(**kwargs)
		try:
			self.start_time = time.time()
			print 'focras init(' + self.cname + ') kwargs seeds ' + kwargs.get('seeds')
			print 'focras init(' + self.cname + ') kwargs template '+ self.template
			self.queue = Queue.Queue()
			self.queue_counter = 0
			self.queue_reload_counter = 0
			# to save the state of the pagination
			self.next_page_link = None
			self.end_of_data = False
			self.template = json.loads(self.template, object_pairs_hook=collections.OrderedDict)
			self.item = Item()
			self.pager = HTMLParser().unescape(self.pager)
			self.base_url = kwargs.get('seeds').split(',')
			self.crawled_pages = 0
			self.status = None
			self.lcam = None
			
			# non chain crawler dont have a queue, check for pager only
			# chain crawler url does not start with http
			if self.base_url[0].startswith('http'):
				# for request_url of chain crawler
				self.parentname = None
				if self.runtype == 'resume' and self.pager != 'null':
					db = client['FocraDB']
					collection = db['crawler']
					cursor_focra = collection.find_one({'_id':self.cname})
					self.base_url = [cursor_focra.get('next_page_link')]
					self.crawled_pages = cursor_focra.get('crawled_pages')
					self.start_time = self.start_time - cursor_focra.get('time_executed')
					client.close()
					print self.cname + " - Resume page is: " + self.base_url[0]
					self.start_urls = self.base_url
				else:
					print self.cname + " - Start page is: " + self.base_url[0]
					self.start_urls = self.base_url
			else:
				# chain crawler
				# get parent and field info from seeds
				self.parentname = self.base_url.pop()
				self.fieldname = self.base_url.pop()
				# connect using parent name and get first 100 of the field name
				self.crawler_db = settings['CRAWLER_DB']
				db = client[self.crawler_db]
				collection = db[self.parentname]
				if self.runtype == 'resume':
					db_focra = client['FocraDB']
					cursor_focra = db_focra['crawler'].find_one({'_id': self.cname})
					self.queue_counter = cursor_focra.get('queue_counter')
					self.next_page_link = cursor_focra.get('next_page_link')
					self.crawled_pages = cursor_focra.get('crawled_pages')
					self.start_time = self.start_time - cursor_focra.get('time_executed')
					print self.cname + " - Loading Queue from " + str(self.queue_counter)
					cursor = collection.find({}, {self.fieldname: 1}).skip(self.queue_counter).limit(LINK_NUMBER)
					self.queue_reload_counter = self.queue_reload_counter + LINK_NUMBER + self.queue_counter
				else:
					cursor = collection.find({}, {self.fieldname: 1}).limit(LINK_NUMBER)
					# set the queue reload counter
					self.queue_reload_counter += LINK_NUMBER
				client.close()
				
				if cursor.count() <= self.queue_reload_counter:
					print self.cname + '- No more links to load'
					self.end_of_data = True
						
				# put it into queue
				for link in cursor:
					if link.get(self.fieldname):
						soup = BeautifulSoup(link.get(self.fieldname))
						# to see the links added to queue
						#print soup.a['href']
						self.queue.put(soup.a['href'])
				
				# if resume
				if self.next_page_link:
					self.base_url = [self.next_page_link]
					print self.cname + " - Resume page is: " + self.base_url[0]
					self.start_urls = self.base_url
				else:
					self.base_url = [self.queue.get()]
					if self.queue_counter == 0:
						self.queue_counter += 1
						print self.cname + " - Start page is: " + self.base_url[0]
					else:
						print self.cname + " - Resume page is: " + self.base_url[0]
					self.start_urls = self.base_url
		except Exception as error:
			print error
Example #42
 def __init__(self, other=None):
     Item.__init__(self)
Example #43
 def __init__(self, **kargs):
     kargs.update(kind='user')
     Item.__init__(self, **kargs)
Example #44
 def __init__(self):
     Item.__init__(self)
     # assign through self[...] so the defaults are actually stored on the item;
     # bare names would only create unused local variables
     self['cityid'] = 0
     self['url'] = ""
     self['name'] = ""
     self['parentcityid'] = 0