Example #1
class SandboxUpdater(Updater):
	"""
	Updater intended for development and tests. It does not update any data in Cimri Service.

	Operations and options supported by the module:

	"update" operation: simulates the requested update directives

		task.data	: for every item to be processed, "data" must be the MerchantItem holding that
				  item's information (L{cimri.api.cimriservice.merchantitem.MerchantItem})

				  in addition, the following must be supplied for every item:

				  "meta.action"		: must contain "update" or "insert"; selects the
							  operation to be performed on the item in the backend.

		task.meta	: -

		task.result	: for every item that was processed, "data" is the updated MerchantItem for
				  that item (L{cimri.api.cimriservice.merchantitem.MerchantItem})

				  in addition, the following are provided for every item:

				  "meta.result"		: either "success" or "fail"

				  "meta.error"		: information about the error, if one occurred
							  during the operation
	"""

	def __init__(self):
		"""
		Register the operations supported by this module.
		"""
		#initialize parents
		# NOTE(review): no parent initializer call is visible here even though the comment
		# above announces one - confirm against the base class.

		#supported operations
		self.ops={"update"	:self._task_update}

	def _task_update(self):
		"""
		Run the "update" operation: every input item is reported back as a success
		without contacting any backend.
		"""
		self.logger.info("api call...")

		#set result
		# NOTE(review): the class documentation speaks of task.result / "meta.result", while
		# this writes self.result with a "result" key - confirm which one consumers read.
		self.result=[{"data":item["data"], "result":"success"} for item in self.task.data]
Example #2
class DictionaryChunker:
    """
    Dictionary based chunker built on the Aho-Corasick algorithm.
    """

    def __init__(self, dictionary):
        """
        Build the keyword tree from a dictionary of terms.

        @type  dictionary: list
        @param dictionary: dictionary of terms to be used by the chunker
        """
        self.logger = Logger(self.__class__.__name__)

        # build dictionary
        self.logger.info("generating dictionary...")
        self.tree = ahocorasick.KeywordTree()
        for item in dictionary:
            # blank entries are skipped
            if item.strip() != "":
                # NOTE(review): the body of this branch was missing; adding the term to the
                # tree is the reconstruction that makes find_all() usable - confirm.
                self.tree.add(item)
        # a KeywordTree has to be finalized with make() before it can be searched
        # NOTE(review): behaviour of make() on a tree without any term is not shown here - confirm.
        self.tree.make()
        self.logger.info("finished generating dictionary...")

    def find_all(self, text):
        """
        Search a text for every term of the configured dictionary.

        @type  text: string
        @param text: text in which the terms are searched

        @rtype: list
        @return: the dictionary terms that were found (overlapping matches included)
        """
        # each match is used as a (start, end) pair to slice the matched term out of the text
        return [text[match[0] : match[1]] for match in self.tree.findall(text, allow_overlaps=1)]
Example #3
class XMLScrapper(Scrapper,Web):
	"""
	XMLScrapper formats merchant items taken from a merchant XML as if they had been produced by a
	scraper module, following the rules of the system's processing flow. It is not a scraper in the
	real sense.

	The module supports the following operations:

	"scrap" operation :

		task.data	: for every item in the task data, "data" must be the URL of that item. the
				  following must also be present for every item:

				  "meta.merchantid"	: merchant ID of the item

				  "meta.xmlitem"	: MerchantItem object holding the item information

		task.result	: for every item, "data" will be a MerchantItem holding that item's information.
	"""

	def __init__(self):
		"""
		Register the operations supported by this module.
		"""
		#initialize parents
		# NOTE(review): no parent initializer call is visible here even though the comment
		# above announces one - confirm against the base classes.

		#supported operations
		self.ops={"scrap"		:self._task_scrap}

	def _task_scrap(self):
		"""
		Run the "scrap" job: the "meta.xmlitem" value of every input item becomes the
		"data" of the corresponding result item.
		"""
		self.logger.info("api call...")

		#translate data	
		self.task.result=[ {"data":item["meta.xmlitem"]} for item in self.task.data ]

		#mark as completed
		# NOTE(review): no statement follows the comment above in this view - confirm how the
		# task is marked as completed.
Example #4
class ProductXMLCrawler(ProductCrawler, Web):
    """
    ProductXMLCrawler is not a real crawler in the sense of finding product information by walking
    web sites. It takes the product information from merchant XMLs and runs at the same level of
    the processing flow as a regular web crawler of the system. To the rest of the system it looks
    like a real crawler.

    Operations and options supported by the module:

    "crawl" operation: scans one or more merchant XMLs and finds the merchant items they contain

        task.data   : -

        task.meta   :
                      "merchants.id"            : cimri-service merchant ID searched among the
                                                  active merchants. if a value is given for this
                                                  option, merchants.index and merchants.range
                                                  are ignored

                      "merchants.id.alt"        : cimri-service merchant ID searched among all
                                                  merchants. if a value is given for this option,
                                                  merchants.id, merchants.index and
                                                  merchants.range are ignored

                      "merchants.index"         : index of the first merchant, among the active
                                                  merchants, to be used for the crawl operation.
                                                  0 is used if no value is given

                      "merchants.range"         : number of merchants, among the active
                                                  merchants, to be used for the crawl operation.
                                                  if no value is given, all merchants starting
                                                  from the merchant at merchants.index are used

                      "merchants.items.index"   : index of the first item to be processed among
                                                  the items of a merchant used for the operation.
                                                  0 is used if no value is given.

                      "merchants.items.range"   : number of items to be processed for a merchant
                                                  used for the operation. if no value is given,
                                                  all items starting from the item at
                                                  merchants.items.index are processed.

                      "merchants.all"           : if this option is present, all merchants are
                                                  considered for the operation, not only the
                                                  active ones.

                      "cache.read"              : if this option is present, the operation data
                                                  is read from the cache. if the option has a
                                                  value, that section of the cache is used; if it
                                                  has no value, the general cache is used.

                      "cache.write"             : if this option is present, the operation
                                                  results are written to the cache. if the option
                                                  has a value, that section of the cache is used;
                                                  if it has no value, the general cache is used.

        task.result : for every item found by the operation, "data" will be the URL of that item.

                      in addition, the following meta data is included in the results for every
                      item:

                      "meta.merchantid"         : merchant ID of the item that was found

                      "meta.xmlitem"            : MerchantItem holding the information of the
                                                  item that was found

    "sample" operation: scans a merchant XML and finds a random number of merchant items.
                        used to create sample data for various tests.

        task.data   : -

        task.meta   :
                      "merchants.id"            : cimri-service merchant ID searched among the
                                                  active merchants. if a value is given for this
                                                  option, merchants.index and merchants.range
                                                  are ignored
                      "sample.size"             : number of items that should be found
                      "cache.read"              : if this option is present, the operation data
                                                  is read from the cache. if the option has a
                                                  value, that section of the cache is used; if it
                                                  has no value, the general cache is used.

                      "cache.write"             : if this option is present, the operation
                                                  results are written to the cache. if the option
                                                  has a value, that section of the cache is used;
                                                  if it has no value, the general cache is used.

        task.result : for every item found by the operation, "data" will be the URL of that item.

                      in addition, the following meta data is included in the results for every
                      item:

                      "meta.merchantid"         : merchant ID of the item that was found

                      "meta.xmlitem"            : MerchantItem holding the information of the
                                                  item that was found

    "get" operation: finds the MerchantItem information of the specified merchant items in the
                     merchant XMLs

        task.data   : for every item to be found, "data" must be a MerchantItem object holding
                      the merchantId and merchantItemId of that item
                      (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})

        task.meta   :
                      "cache.read"              : if this option is present, the operation data
                                                  is read from the cache. if the option has a
                                                  value, that section of the cache is used; if it
                                                  has no value, the general cache is used.

                      "cache.write"             : if this option is present, the operation
                                                  results are written to the cache. if the option
                                                  has a value, that section of the cache is used;
                                                  if it has no value, the general cache is used.

        task.result : for every item found by the operation, "data" will be the URL of that item.

                      in addition, the following meta data is included in the results for every
                      item:

                      "meta.merchantid"         : merchant ID of the item that was found

                      "meta.xmlitem"            : MerchantItem holding the information of the
                                                  item that was found

    "discover" operation: not used.
    """


    def __init__(self):
        """
        Create the module logger and register the operations supported by this module.
        """
        # NOTE(review): no parent initializer call is visible in this method - confirm whether
        # the base classes need one.
        self.logger = Logger(self.__class__.__name__)

        # supported operations: operation name -> handler method
        self.ops = {
            "discover": self._task_discover,
            "crawl": self._task_crawl,
            "sample": self._task_sample,
            "get": self._task_get,
        }

    def _task_discover(self):
        """
        Handler of the "discover" operation, which is not used by this module; it does nothing.
        """


    def _task_crawl(self):
        # NOTE(review): the next line is this method's docstring text ("runs the 'crawl' job");
        # no triple-quote delimiters surround it in this view.
		"crawl" isini calistirir

        self.logger.info("starting crawler...")

        # number of workers: task.meta["workers"] when present, otherwise 1
        workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

        # get the list of merchants according to the task parameters (range, status, ...);
        # every merchant is requested only when task.meta["merchants.all"] is exactly True
        allmerchants = "merchants.all" in self.task.meta and self.task.meta["merchants.all"] is True
        merchants = self._get_merchants(allmerchants)

        # narrow the merchant list to operate on: by "merchants.id.alt", else by "merchants.id",
        # and by "merchants.index" / "merchants.range" slicing
        # NOTE(review): both list comprehensions below are left unclosed and the first one has no
        # element expression - lines appear to be missing from this view.
        if "merchants.id.alt" in self.task.meta:
            merchants = [
                for merchant in merchants
                if str(merchant.merchantId) == str(self.task.meta["merchants.id.alt"])
        elif "merchants.id" in self.task.meta:
            merchants = [
                merchant for merchant in merchants if str(merchant.merchantId) == str(self.task.meta["merchants.id"])
            if "merchants.index" in self.task.meta:
                merchants = merchants[int(self.task.meta["merchants.index"]) :]
            if "merchants.range" in self.task.meta:
                merchants = merchants[: int(self.task.meta["merchants.range"])]

                # progress steps for each merchant
        steps = 10000

        # worker handed to the Distributor below; it receives one merchant
        def checkitems(merchant, work):
                # reset progress counter
                counter = steps

                # get items for merchant
                # NOTE(review): the two deeper-indented assignments below look like the remains of
                # a try/except whose keywords are not present in this view - confirm.
                    items = yield self._get_merchant_items(merchant)
                    items = None
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                    # progress increment per item (the whole step budget when there are no items)
                stepby = steps if len(items) == 0 else float(steps) / len(items)

                # get range of items to operate on
                if "merchants.items.index" in self.task.meta:
                    items = items[int(self.task.meta["merchants.items.index"]) :]
                if "merchants.items.range" in self.task.meta:
                    items = items[: int(self.task.meta["merchants.items.range"])]

                    # check items
                for item in items:
                    # TEMP - do not check
                    # check page
                    # res=yield self._check_item(item)
                    res = True

                    # set task result
                    # NOTE(review): the dict below is a bare expression and the Template call after
                    # it is not closed; the statements that used them are not visible here.
                    if res:
                            {"data": item.merchantItemUrl, "meta.merchantid": merchant.merchantId, "meta.xmlitem": item}
                        msg = Template("item(${id}) url could not verified: $url").substitute(
                            id=str(item.merchantItemId), url=item.merchantItemUrl

                    counter = counter - stepby

            # NOTE(review): no matching `try:` for this handler is visible in this view.
            except Exception as e:
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                self._log_exception(e, data=data)

                # update progress

            # next item

            # initialize progress tracker

        # total progress is one step budget per merchant
        self._progress(len(merchants) * steps)

        # start task distributor
        d = Distributor(checkitems, oncomplete=self._complete, workers=workers)

        # process each merchant

        # when done processing all data
        # NOTE(review): no statements follow the two comments above in this view.

    def _task_sample(self):
        # NOTE(review): the next line is this method's docstring text ("runs the 'sample' job");
        # no triple-quote delimiters surround it in this view.
		"sample" isini calistirir

        self.logger.info("starting crawler 'sample' task...")

        # number of workers: task.meta["workers"] when present, otherwise 1
        workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

        # get list of merchants (called without arguments, i.e. with all=False)
        merchants = self._get_merchants()

        # apply filters: keep the merchant whose id matches "merchants.id" (compared as stripped strings)
        # NOTE(review): the comprehension below has no element expression and is not closed -
        # lines appear to be missing from this view.
        if "merchants.id" in self.task.meta:
            merchants = [
                for merchant in merchants
                if str(merchant.merchantId).strip() == str(self.task.meta["merchants.id"]).strip()
        # number of items still to be found: "sample.size" when present, otherwise 100
        self.count = int(self.task.meta["sample.size"]) if "sample.size" in self.task.meta else 100

        # process each merchant
        # NOTE(review): the loop body is indented two levels and an `except` further down sits one
        # level in, so a `try:` appears to be missing here - confirm.
        for merchant in merchants:
                # get items for merchant
                items = yield self._get_merchant_items(merchant)
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                    # shuffle the items
                    # NOTE(review): no shuffle statement follows this comment in this view.

                # worker handed to the Distributor below; it receives one item
                def checkitem(item, work):
                        # check page
                        res = yield self._check_item(item)

                        # check if task was completed while we were waiting to get the item
                        if work.isactive():
                            if res:
                                # set task result
                                # NOTE(review): the three key/value lines below are not enclosed in
                                # any visible dict or call.
                                        "data": item.merchantItemUrl,
                                        "meta.merchantid": merchant.merchantId,
                                        "meta.xmlitem": item,

                                # update progress

                                # done?
                                self.count = self.count - 1
                                # NOTE(review): this `if` has no body in this view.
                                if self.count < 1:

                                msg = Template("item(${id}) url could not verified: $url").substitute(
                                    id=str(item.merchantItemId), url=item.merchantItemUrl

                    # NOTE(review): no matching `try:` for this handler is visible in this view.
                    except Exception as e:
                        data = Template("exception while processing item(${id}): $url").substitute(
                            id=str(merchant.merchantId), url=item.merchantItemUrl
                        self._log_exception(e, data=data)

                        # next item

                    # initialize progress tracker


                # distribute task
                d = Distributor(checkitem, oncomplete=self._complete, workers=workers)
                d.complete()  # when done processing all data

            # NOTE(review): no matching `try:` for this handler is visible in this view.
            except Exception as e:
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                self._log_exception(e, data=data)

    def _task_get(self):
        # NOTE(review): the next line is this method's docstring text ("runs the 'get' job");
        # no triple-quote delimiters surround it in this view.
		"get" isini calistirir

        self.logger.info("starting crawler...")

        # number of workers: task.meta["workers"] when present, otherwise 1
        workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

        # distinct merchant ids referenced by the requested items
        merchantids = list(set([it.merchant["merchantId"] for it in self.task.data]))

        # get list of merchants according to task
        merchants = []
        for id in merchantids:
            merchant = self._get_merchant(id)
            # NOTE(review): this `if` has no body in this view, so nothing visible fills `merchants`.
            if merchant is not None:

                # worker to check items

        # worker handed to the Distributor below; it receives one merchant
        def checkitems(merchant, work):
                # item ids requested for this merchant
                ids = [it.merchantItemId for it in self.task.data if it.merchant["merchantId"] == merchant.merchantId]

                # get items for merchant
                items = yield self._get_merchant_items(merchant)
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                    # keep only the merchant items whose id was requested
                items = [item for item in items if item.merchantItemId in ids]

                # process items
                for item in items:
                    # TEMP - do not check
                    # check page
                    # res=yield self._check_item(item)
                    res = True

                    # set task result
                    # NOTE(review): the dict below is a bare expression and the Template call after
                    # it is not closed; the statements that used them are not visible here.
                    if res:
                            {"data": item.merchantItemUrl, "meta.merchantid": merchant.merchantId, "meta.xmlitem": item}
                        msg = Template("item(${id}) url could not verified: $url").substitute(
                            id=str(item.merchantItemId), url=item.merchantItemUrl

                        # update progress

            # NOTE(review): no matching `try:` for this handler is visible in this view.
            except Exception as e:
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                self._log_exception(e, data)

                # next item

            # initialize progress tracker


        # start task distributor
        d = Distributor(checkitems, oncomplete=self._complete, workers=workers)

        # hand the merchants whose id was requested to the distributor
        d.adddata([merchant for merchant in merchants if merchant.merchantId in merchantids])

        # when done processing all data
        # NOTE(review): no statement follows the comment above in this view.

    def _get_merchants(self, all=False):
        """
        Fetch the list of merchants from Cimri Service.

        @type all:  bool
        @param all: if True every merchant is fetched, otherwise only the active merchants

        @rtype: list (L{cimri.api.cimriservice.data.merchant.MerchantInfo})
        @return: list of merchants; an empty list when the service returned nothing
        """
        self.logger.info("getting merchants...")

        # fetch either every merchant or only the active ones; the two calls are alternatives,
        # otherwise the active-only result would always replace the full list
        api = MerchantsAPI()
        if all is True:
            merchants = api.get_merchants()
        else:
            merchants = api.get_merchants(status=MerchantInfo.STATUS_ACTIVE)
        if merchants is None:
            self._log_error("error getting cimri service merchant list")
            self.logger.warn("did not get any merchants from cimri-service")
            return []

        self.logger.info("number of merchants retrieved: " + str(len(merchants)))

        # get only the active ones
        # merchants=[merchant for merchant in merchants if merchant.status==MerchantInfo.STATUS_ACTIVE]

        # the local filter above is disabled, so this logs the same count as the previous line
        self.logger.info("number of active merchants found: " + str(len(merchants)))

        return merchants

    def _get_merchant(self, id):
        """
        Fetch a single merchant from Cimri Service.

        @type id: str
        @param id: merchant ID

        @rtype: L{cimri.api.cimriservice.data.merchant.MerchantInfo}
        @return: the MerchantInfo object, or None when the service returned nothing
        """
        self.logger.info("getting merchant..." + str(id))

        # ask the merchants API for this merchant
        api = MerchantsAPI()
        merchant = api.get_merchant(id)
        if merchant is None:
            # the failure is recorded on the task and logged; the caller gets None
            self._log_error("error getting cimri service merchant " + str(id))
            self.logger.warn("did not get merchant from cimri-service")
            return None

        return merchant

    def _get_merchant_items(self, merchant):
        """
        Fetch all merchant items of a merchant asynchronously.

        The blocking work is done by L{_get_merchant_items_async}, run through deferToThread.

        @type merchant: L{cimri.api.cimriservice.data.merchant.MerchantInfo}
        @param merchant: merchant whose items are requested

        @rtype: L{twisted.internet.defer.Deferred}
        @return: Deferred to which the merchant items will be delivered
        """
        return deferToThread(self._get_merchant_items_async, merchant)

    def _get_merchant_items_async(self, merchant):
        """
        Fetch all merchant items of a merchant.

        The cached items are used when L{_get_cached_merchant_items} returns any; otherwise the
        merchant XML is loaded, its items are extracted and handed to L{_cache_merchant_items}.
        Every returned item gets its ``merchant`` attribute set to a dict holding the merchant id.

        @type merchant: L{cimri.api.cimriservice.data.merchant.MerchantInfo}
        @param merchant: merchant whose items are requested

        @rtype: list (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})
        @return: the requested merchant items, or None when loading the merchant XML failed
        """
        self.logger.info("getting merchant items for ... " + str(merchant.merchantId) + ":" + merchant.merchantName)

        # check cache
        items = self._get_cached_merchant_items(merchant.merchantId)
        if items is None:
            # load items for merchant
            xml = MerchantXML(merchant)
            res = xml.load()
            if res is False:
                return None

            # get items
            items = xml.getitems()

            # cache
            self._cache_merchant_items(items, merchant.merchantId)

        # add merchant id
        for item in items:
            item.merchant = {"merchantId": merchant.merchantId}

        return items

    def _check_item(self, item):
        """
        Check asynchronously whether the URL of a merchant item is reachable.

        The check itself is C{self.ping} called with the item URL, run through deferToThread.

        @type item: L{cimri.api.cimriservice.data.merchantitem.MerchantItem}
        @param item: merchant item to be checked

        @rtype: L{twisted.internet.defer.Deferred}
        @return: Deferred to which the result of the operation will be delivered
        """
        return deferToThread(self.ping, item.merchantItemUrl)

    def _cache_merchant_items(self, items, merchantid):
        # NOTE(review): the text block below is this method's docstring; no triple-quote
        # delimiters surround it in this view. In English it reads:
        #   Stores the given merchant items in the cache.
        #   items (list of MerchantItem): merchant items to be stored
        #   merchantid (str): ID of the merchant the items belong to
		Belirlenen merchant itemlari cache'e kayit eder

                @type items: list (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})
                @param items: kayit edilmesi istenen merchant itemlar

                @type merchantid: str
                @param merchantid: itemlarin ait oldugu merchantin IDsi

        # check if we should cache
        # NOTE(review): this guard has no body in this view.
        if "cache.write" not in self.task.meta:

            # get section to cache to
        section = self.task.meta["cache.write"]

        # write to cache: key is "crawler.productxml.items.<merchantid>", value is the JSON dump of
        # every item's to_dict()
        # NOTE(review): the call these two arguments belong to is not visible in this view.
            "crawler.productxml.items." + str(merchantid), json.dumps([item.to_dict() for item in items])

    def _get_cached_merchant_items(self, merchantid):
        """
        Read the merchant items stored in the cache for a merchant.

        @type merchantid: str
        @param merchantid: ID of the merchant the items belong to

        @rtype: list (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})
        @return: list of the items found in the cache. None when the cache is not to be used,
                 holds no entry for the merchant, or the cached content cannot be parsed
        """
        # check if we should use the cache
        if "cache.read" not in self.task.meta:
            return None

        # get section to use
        section = self.task.meta["cache.read"]

        # read cache
        content = Cache(section).get("crawler.productxml.items." + str(merchantid))
        if content is None:
            return None

        # parse; unusable cache content is treated like a cache miss on purpose
        try:
            return MerchantItem.list_from_json(content, [])
        except Exception:
            return None
Example #5
class CimriServiceUpdater(Updater):
	"""
	CimriServiceUpdater takes the results produced by the other modules of the system and updates
	the corresponding items on CimriService.

	Operations and options supported by the module:

	"update" operation: updates or adds the given items in Cimri Service

		task.data	: for every item to be processed, "data" must be the MerchantItem holding that
				  item's information (L{cimri.api.cimriservice.merchantitem.MerchantItem})

				  in addition, the following must be supplied for every item:

				  "meta.action"		: must contain "update" or "insert"; selects the
							  operation to be performed on the item in the backend.

		task.meta	: -

		task.result	: for every item that was processed, "data" is the updated MerchantItem for
				  that item (L{cimri.api.cimriservice.merchantitem.MerchantItem})

				  in addition, the following are provided for every item:

				  "meta.result"		: either "success" or "fail"

				  "meta.error"		: information about the error, if one occurred
							  during the operation
	"""


	def __init__(self):
		"""
		Register the operations supported by this module.
		"""
		# NOTE(review): neither a parent initializer call nor the assignment of the update chunk
		# size (read elsewhere as self._update_size) is visible in this method - confirm.

		# operation name -> handler method
		handlers={}
		handlers["update"]=self._task_update
		self.ops=handlers

	def _task_update(self):
		# NOTE(review): the next line is this method's docstring text ("runs the 'update'
		# operation"); no triple-quote delimiters surround it in this view.
		"update" islemini calistirir

		self.logger.info("api call...")

		#updater stats: counters per action, all starting at zero
		self.task.stats["data"]["actions"]={"total":0, "insert": 0, "update":0, "clean":0}		

		#number of workers: task.meta["workers"] when present, otherwise 1
		workers=int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

		#oncomplete handler
		# NOTE(review): the handler has no statements in this view.
		def oncomplete():
			#mark as completed

		#worker that receives a list of items
		# NOTE(review): `chunks` is never assigned, the `except` lines have no visible `try:` and
		# no handler body, so lines appear to be missing from this view.
		def updateitems(items,work):
			#clean task: items whose "meta.action" is "clean", sent in slices of self._update_size
			data=[item["data"] for item in items if item["meta.action"]=="clean"]
			for index in range(chunks):
					res=yield self._clean(data[index*self._update_size:(index+1)*self._update_size])
					#check result
	                        except Exception as e:

				#update progress		
			#update/insert tasks: items whose "meta.action" is "insert" or "update", sent in
			#slices of self._update_size
			data=[item["data"] for item in items if item["meta.action"] in ["insert","update"]]
			for index in range(chunks):
					res=yield self._update(data[index*self._update_size:(index+1)*self._update_size])
					#check result
	                        except Exception as e:

				#update progress		
			#next item

		#partition data by merchants
		# NOTE(review): `mid` and `merchants` are never assigned and the `if` has no body in this view.
		for item in self.task.data:
			if mid not in merchants:

			#record stats

		#figure out progress: number of "clean" items per merchant
		for id in merchants:			
			cleanops=len([item for item in merchants[id] if item["meta.action"]=="clean"])

		#initialize progress tracker
		#distribute task
		# NOTE(review): `d` is never created in this view.
		d.complete()	#complete when all data is processed

	def _update(self,items):
		# NOTE(review): the text block below is this method's docstring; no triple-quote
		# delimiters surround it in this view. In English it reads:
		#   Updates the given number of merchant items in cimri service.
		#   items (list of MerchantItem): items to be updated
		#   returns (twisted Deferred): Deferred object that will receive the update results
		Belirlenen sayida merchant item'i cimri service'de gunceller

                @type  items: list (L{cimri.api.cimriservice.merchantitem.MerchantItem})
                @param items: guncellenmesi istenilen itemlar

                @rtype: L{twisted.internet.defer.Deferred}
                @return: update sonuclarini kabul edicek bir Deferred objecti

		#function run through deferToThread at the end of this method
		# NOTE(review): `res` is never assigned in this view; the statement producing it is missing.
		def update_items(items):
			if res is False:
				self._log_error("cimriservice update failed")
			return res

		#add timestamp and operator id
		ts=time.strftime("%Y-%m-%d %H:%M:%S",datetime.datetime.now().timetuple())
		# NOTE(review): this loop has no body in this view, so `ts` is not used by any visible line.
		for item in items:

		#convert unicode values to string such that utf8 character sequences are interpreted as utf8  
		for item in items:
			item.merchantItemTitle = convert_unicode_to_utf8str(item.merchantItemTitle, item.encoding)
			item.brand = convert_unicode_to_utf8str(item.brand, item.encoding)
			item.modelNameView = convert_unicode_to_utf8str(item.modelNameView, item.encoding)

#			f=open("out.txt","a")
#			f.write(item.merchantItemTitle)
#			f.write("\n")
#			f.close()

		return deferToThread(update_items,items)

	def _clean(self,items):
		# NOTE(review): the text block below is this method's docstring; no triple-quote
		# delimiters surround it in this view. In English it reads:
		#   Clears the item ids of the given number of merchant items in cimri service.
		#   items (list of MerchantItem): items to be updated
		#   returns (twisted Deferred): Deferred object that will receive the update results
		Belirlenen sayida merchant item'in itemid'lerini cimri service'de temizler

                @type  items: list (L{cimri.api.cimriservice.merchantitem.MerchantItem})
                @param items: guncellenmesi istenilen itemlar

                @rtype: L{twisted.internet.defer.Deferred}
                @return: update sonuclarini kabul edicek bir Deferred objecti

		#function run through deferToThread below
		# NOTE(review): `res` is never assigned in this view; the statement producing it is missing.
		def clean_items(items):
			if res is False:
				self._log_error("cimriservice clean failed")
			return res

		return deferToThread(clean_items,items)