Ejemplo n.º 1
0
	def _task_match(self):
		"""
		Run the "match" task.

		Every entry of ``self.task.data`` is handed to ``self._match_item``
		by a ``Distributor`` created with ``workers`` workers. A result whose
		``"meta.action"`` is not ``None`` is appended to ``self.task.result``.

		When ``self.task.meta["test"]`` is ``True`` each result is also
		compared with the item's ``"meta.refitem"`` through
		``self._test_update``, the reference item is stored on the result,
		and ``self._test_finalize`` runs before the task is completed.
		"""
		self.logger.info("starting matcher...")

		meta = self.task.meta

		# number of workers, a single one unless the task says otherwise
		workers = int(meta["workers"]) if "workers" in meta else 1

		# test mode is switched on only by the literal value True
		self.test = "test" in meta and meta["test"] is True

		# additional stats
		self.task.stats["test"]["guessed"] = 0

		def oncomplete():
			# finalize the test run before marking the task as completed
			if self.test:
				self._test_finalize()
			self._complete()

		# NOTE(review): this initializes ``task.results`` while the worker
		# below appends to ``task.result`` - looks like a typo; confirm which
		# attribute the task object really exposes before changing it.
		self.task.results = []

		@defer.inlineCallbacks
		def matchitem(item, work):
			try:
				result = yield self._match_item(item["data"])

				if self.test:
					# compare with the reference match and update test stats,
					# then keep the reference next to the result
					refitem = item["meta.refitem"]
					self._test_update(result, refitem)
					result["meta.refitem"] = refitem

				# only results that carry an action are reported
				if result["meta.action"] is not None:
					self.task.result.append(result)

			except Exception as e:
				# a failing item is logged and skipped so the task keeps going
				data = Template("item(${id}): $url").substitute(
					id=replace_turkish_chars(item["data"].merchantItemId),
					url=item["data"].merchantItemUrl,
				)
				self._log_exception(e, data=data)

			# the item counts as processed whether or not matching succeeded
			self._progress()

			# next item
			work.next()

		# initialize progress tracker with the number of items to process
		self._progress(len(self.task.data))

		# distribute the task
		d = Distributor(matchitem, oncomplete=oncomplete, workers=workers)
		d.run()
		d.adddata(self.task.data)
		d.complete()  # complete when all data is processed
Ejemplo n.º 2
0
    def _task_get(self):
        """
        Run the "get" task.

        Collects the distinct merchant ids referenced by ``self.task.data``,
        resolves each one with ``self._get_merchant`` and, for every merchant
        that was found, loads its items with ``self._get_merchant_items``.
        For each loaded item whose ``merchantItemId`` is requested by the
        task data of that merchant, a result entry is appended to
        ``self.task.result``.

        Merchants are processed by a ``Distributor`` created with ``workers``
        workers; ``self._complete`` is passed as its completion callback.
        """
        self.logger.info("starting crawler...")

        meta = self.task.meta

        # get number of workers
        workers = int(meta["workers"]) if "workers" in meta else 1

        # distinct merchant ids we're looking for
        merchantids = set(it.merchant["merchantId"] for it in self.task.data)

        # resolve the merchants, skipping ids that cannot be resolved
        merchants = []
        for merchantid in merchantids:
            merchant = self._get_merchant(merchantid)
            if merchant is not None:
                merchants.append(merchant)

        # worker to check the items of one merchant
        @defer.inlineCallbacks
        def checkitems(merchant, work):
            try:
                # item ids requested for this merchant; kept in a set because
                # it is probed once per item of the merchant feed
                wanted = set(
                    it.merchantItemId
                    for it in self.task.data
                    if it.merchant["merchantId"] == merchant.merchantId
                )

                # get items for merchant
                items = yield self._get_merchant_items(merchant)
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                # keep only the items the task asked for
                items = [item for item in items if item.merchantItemId in wanted]

                # process items
                for item in items:
                    # TEMP - do not check
                    # check page
                    # res=yield self._check_item(item)
                    res = True

                    # set task result
                    if res:
                        self.task.result.append(
                            {"data": item.merchantItemUrl, "meta.merchantid": merchant.merchantId, "meta.xmlitem": item}
                        )
                    else:
                        msg = Template("item(${id}) url could not verified: $url").substitute(
                            id=str(item.merchantItemId), url=item.merchantItemUrl
                        )
                        self._log_error(msg)

                    # update progress
                    self._progress()

            except Exception as e:
                # a failing merchant is logged and skipped
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                )
                self._log_exception(e, data)

            # next item
            work.next()

        # initialize progress tracker with the number of requested items
        self._progress(len(self.task.data))

        # start task distributor
        d = Distributor(checkitems, oncomplete=self._complete, workers=workers)
        d.run()

        # process each merchant whose own id is among the requested ids
        d.adddata([merchant for merchant in merchants if merchant.merchantId in merchantids])

        # when done processing all data
        d.complete()
Ejemplo n.º 3
0
    def _task_crawl(self):
        """
        Run the "crawl" task.

        Loads merchants with ``self._get_merchants``, passing whether
        ``meta["merchants.all"]`` is ``True``, and narrows the list with the
        optional ``self.task.meta`` entries:

        * ``merchants.id.alt`` or, failing that, ``merchants.id``: keep only
          the merchant whose id matches (compared as strings);
        * otherwise ``merchants.index`` (skip that many merchants) and
          ``merchants.range`` (keep at most that many).

        For every remaining merchant the items are loaded, optionally sliced
        with ``merchants.items.index`` / ``merchants.items.range``, and one
        result entry per item is appended to ``self.task.result``.

        Progress is tracked in units of ``steps`` per merchant: each item
        advances the tracker by ``steps / len(items)`` (computed before the
        item slicing) and whatever is left of the merchant's ``steps`` is
        reported once the merchant is done.
        """
        self.logger.info("starting crawler...")

        meta = self.task.meta

        # get number of workers
        workers = int(meta["workers"]) if "workers" in meta else 1

        # get list of merchants according to task
        allmerchants = "merchants.all" in meta and meta["merchants.all"] is True
        merchants = self._get_merchants(allmerchants)

        # get range to operate on
        if "merchants.id.alt" in meta:
            wanted = str(meta["merchants.id.alt"])
            merchants = [m for m in merchants if str(m.merchantId) == wanted]
        elif "merchants.id" in meta:
            wanted = str(meta["merchants.id"])
            merchants = [m for m in merchants if str(m.merchantId) == wanted]
        else:
            if "merchants.index" in meta:
                merchants = merchants[int(meta["merchants.index"]):]
            if "merchants.range" in meta:
                merchants = merchants[: int(meta["merchants.range"])]

        # progress steps for each merchant
        steps = 10000

        # check items
        @defer.inlineCallbacks
        def checkitems(merchant, work):
            # progress units of this merchant that are not reported yet
            counter = steps
            try:
                # get items for merchant; a failed load is handled like a
                # missing feed. Only Exception is caught so that
                # GeneratorExit / KeyboardInterrupt / SystemExit still
                # propagate through the generator.
                try:
                    items = yield self._get_merchant_items(merchant)
                except Exception:
                    items = None
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                # progress counter size
                stepby = steps if len(items) == 0 else float(steps) / len(items)

                # get range to operate on
                if "merchants.items.index" in meta:
                    items = items[int(meta["merchants.items.index"]):]
                if "merchants.items.range" in meta:
                    items = items[: int(meta["merchants.items.range"])]

                # check items
                for item in items:
                    # TEMP - do not check
                    # check page
                    # res=yield self._check_item(item)
                    res = True

                    # set task result
                    if res:
                        self.task.result.append(
                            {"data": item.merchantItemUrl, "meta.merchantid": merchant.merchantId, "meta.xmlitem": item}
                        )
                    else:
                        msg = Template("item(${id}) url could not verified: $url").substitute(
                            id=str(item.merchantItemId), url=item.merchantItemUrl
                        )
                        self._log_error(msg)

                    self._progress(stepby=stepby)
                    counter = counter - stepby

            except Exception as e:
                # log the failing merchant and mark the task as failed
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                )
                self._log_exception(e, data=data)
                self._fail()

            # report whatever is left of this merchant's progress units
            self._progress(stepby=counter)

            # next item
            work.next()

        # initialize progress tracker
        self._progress(len(merchants) * steps)

        # start task distributor
        d = Distributor(checkitems, oncomplete=self._complete, workers=workers)
        d.run()

        # process each merchant
        d.adddata(merchants)

        # when done processing all data
        d.complete()
Ejemplo n.º 4
0
    def _task_sample(self):
        """
        Run the "sample" task.

        Loads the merchants with ``self._get_merchants`` (optionally only
        the one matching ``meta["merchants.id"]``), shuffles each merchant's
        items and verifies them with ``self._check_item`` until
        ``meta["sample.size"]`` items (100 by default) have been accepted.
        Accepted items are appended to ``self.task.result``.

        NOTE(review): the body contains ``yield``, so this is a generator
        function; presumably it is wrapped by ``defer.inlineCallbacks``
        outside this view - confirm.

        NOTE(review): every merchant gets its own ``Distributor`` with
        ``self._complete`` as completion callback and re-initializes the
        progress tracker - confirm this is intended when more than one
        merchant is sampled.
        """
        self.logger.info("starting crawler 'sample' task...")

        meta = self.task.meta

        # get number of workers
        workers = int(meta["workers"]) if "workers" in meta else 1

        # get list of merchants
        merchants = self._get_merchants()

        # apply filters
        if "merchants.id" in meta:
            wanted = str(meta["merchants.id"]).strip()
            merchants = [m for m in merchants if str(m.merchantId).strip() == wanted]

        # number of accepted items still missing; shared by all workers
        self.count = int(meta["sample.size"]) if "sample.size" in meta else 100

        def make_checkitem(merchant):
            # Build the worker for one merchant. Binding ``merchant`` here
            # (instead of closing over the loop variable) keeps results
            # attributed to the right merchant even if the loop below has
            # already moved on when the worker runs.
            @defer.inlineCallbacks
            def checkitem(item, work):
                try:
                    # check page
                    res = yield self._check_item(item)

                    # check if task was completed while we were waiting to get the item
                    if work.isactive():
                        if res:
                            # set task result
                            self.task.result.append(
                                {
                                    "data": item.merchantItemUrl,
                                    "meta.merchantid": merchant.merchantId,
                                    "meta.xmlitem": item,
                                }
                            )

                            # update progress
                            self._progress()

                            # done?
                            self.count = self.count - 1
                            if self.count < 1:
                                work.complete()

                        else:
                            msg = Template("item(${id}) url could not verified: $url").substitute(
                                id=str(item.merchantItemId), url=item.merchantItemUrl
                            )
                            self._log_error(msg)

                except Exception as e:
                    # the message names an item, so report the item's id
                    data = Template("exception while processing item(${id}): $url").substitute(
                        id=str(item.merchantItemId), url=item.merchantItemUrl
                    )
                    self._log_exception(e, data=data)

                # next item
                work.next()

            return checkitem

        # process each merchant
        for merchant in merchants:
            try:
                # get items for merchant
                items = yield self._get_merchant_items(merchant)
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                # shuffle the items so the sample is random
                shuffle(items)

                # initialize progress tracker with the items still missing
                self._progress(self.count)

                # distribute the task
                d = Distributor(make_checkitem(merchant), oncomplete=self._complete, workers=workers)
                d.run()
                d.adddata(items)
                d.complete()  # when done processing all data

            except Exception as e:
                # log the failing merchant and mark the task as failed
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                )
                self._log_exception(e, data=data)
                self._fail()
Ejemplo n.º 5
0
	def _task_update(self):
		"""
		Run the "update" task.

		Groups ``self.task.data`` by merchant id and hands each group to a
		``Distributor`` worker. For one group the worker first passes the
		entries whose ``"meta.action"`` is ``"clean"`` to ``self._clean`` and
		then the ``"insert"`` / ``"update"`` entries to ``self._update``,
		both in slices of ``self._update_size`` entries. A failing slice is
		logged and skipped; the progress tracker advances once per slice
		either way.

		Per-action counters are kept in
		``self.task.stats["data"]["actions"]``.
		"""
		self.logger.info("api call...")

		# updater stats
		self.task.stats["data"]["actions"] = {"total": 0, "insert": 0, "update": 0, "clean": 0}

		# get number of workers
		meta = self.task.meta
		workers = int(meta["workers"]) if "workers" in meta else 1

		def chunkcount(length):
			# number of slices of at most _update_size entries for `length` entries
			return int(math.ceil(float(length) / self._update_size))

		def oncomplete():
			# mark as completed
			self._complete()

		@defer.inlineCallbacks
		def updateitems(items, work):
			# clean operations are sent first, inserts/updates afterwards
			phases = (
				(self._clean, ("clean",)),
				(self._update, ("insert", "update")),
			)
			for send, wanted in phases:
				data = [item["data"] for item in items if item["meta.action"] in wanted]
				size = self._update_size
				for index in range(chunkcount(len(data))):
					try:
						yield send(data[index * size:(index + 1) * size])
					except Exception as e:
						# a failing slice is logged and skipped
						self._log_exception(e, data=None)

					# one progress step per slice, successful or not
					self._progress()

			# next item
			work.next()

		# partition data by merchants and record stats on the way
		merchants = {}
		for item in self.task.data:
			mid = item["data"].merchant["merchantId"]
			merchants.setdefault(mid, []).append(item)

			counters = self.task.stats["data"]["actions"]
			counters["total"] += 1
			counters[item["meta.action"]] += 1

		# figure out progress: one step per slice the workers will send
		size = 0
		for group in merchants.values():
			cleanops = sum(1 for item in group if item["meta.action"] == "clean")
			size += chunkcount(cleanops) + chunkcount(len(group) - cleanops)

		# initialize progress tracker
		self._progress(size)

		# distribute the task; a real list is passed, as the other tasks do
		# (dict.values() is only a view on Python 3)
		d = Distributor(updateitems, oncomplete=oncomplete, workers=workers)
		d.run()
		d.adddata(list(merchants.values()))
		d.complete()  # complete when all data is processed