コード例 #1
0
ファイル: data.py プロジェクト: matcher/glacier
	def from_json(cls, packet):
		"""
		Create a new instance of this class from a JSON string.

		@type packet:   str
		@param packet:  a JSON string containing the class fields

		@rtype: object
		@return: an instance of this class if the JSON parses cleanly,
			 None otherwise (including when packet is None)
		"""

		# nothing to parse
		if packet is None:
			return None

		# parse and instantiate the object
		try:
			# parse
			data = json.loads(packet)

			# instantiate the class with the parsed fields
			return cls(data)

		except Exception:
			# cls is already a class object, so cls.__name__ is the right
			# name here (cls.__class__.__name__ would name the metaclass)
			Logger.getlogger(cls.__name__).error("exception occurred while parsing json for objects (" + cls.__name__ + ")")

		return None
コード例 #2
0
ファイル: sandbox.py プロジェクト: matcher/glacier
class SandboxUpdater(Updater):
        """
	SandboxUpdater is for the development process and for tests. It does not
	update any data on the Cimri Service.

	Supported tasks and their options:

	"update" task: simulates the given update directives.

		task.data	: for every item to be processed, "data" must be a
				  MerchantItem carrying that item's data
				  (L{cimri.api.cimriservice.merchantitem.MerchantItem}).

				  Each item must also provide:

				  "meta.action"	: must contain "update" or "insert";
						  selects the backend operation to be
						  performed for the item.

		task.meta	: -

		task.result	: for every item processed, "data" is the updated
				  MerchantItem (L{cimri.api.cimriservice.merchantitem.MerchantItem}).

				  Each item also carries:

				  "meta.result"	: "success" or "fail"

				  "meta.error"	: details about the error if the
						  operation failed
	"""

	def __init__(self):
		#initialize parents
		Updater.__init__(self)
		
		self.logger=Logger(self.__class__.__name__)

		#supported operations ("update" is the only one)
		self.ops={"update"	:self._task_update}



	def _task_update(self):
		"""
		Runs the "update" task (simulation only — nothing is written to the backend).
		"""

		self.logger.info("api call...")

		#set result
		# NOTE(review): this writes self.result, while sibling modules
		# (e.g. XMLScrapper._task_scrap) write self.task.result and then
		# call self._complete(); the class docstring also promises a
		# "meta.result" key but the code emits "result" — confirm which
		# contract is intended before relying on this output.
		self.result=[{"data":item["data"], "result":"success"} for item in self.task.data]		
コード例 #3
0
ファイル: dictionary.py プロジェクト: pombredanne/glacier
class DictionaryChunker:
    """
    Exact-dictionary chunker built on the Aho-Corasick matching algorithm.

    Java references:

    http://alias-i.com/lingpipe/docs/api/com/aliasi/dict/ExactDictionaryChunker.html

    http://alias-i.com/lingpipe/demos/tutorial/ne/read-me.html


    Python references:

    http://pypi.python.org/pypi/ahocorasick/0.9

    http://nltk.github.com/api/nltk.chunk.html#id1
    """

    def __init__(self, dictionary):
        """
        @type  dictionary: list
        @param dictionary: the dictionary of terms the chunker will match against
        """

        self.logger = Logger(self.__class__.__name__)

        # build the keyword tree from the dictionary, skipping blank terms
        self.logger.info("generating dictionary...")
        self.tree = ahocorasick.KeywordTree()
        for term in dictionary:
            if term.strip():
                self.tree.add(term)
        self.tree.make()
        self.logger.info("finished generating dictionary...")

    def find_all(self, text):
        """
        Search a text for every dictionary term it contains.

        @type  text: string
        @param text: the text to scan for dictionary terms

        @rtype: list
        @return: the dictionary terms found in the text
        """

        # each match is a (start, end) pair; overlapping matches are kept
        matches = self.tree.findall(text, allow_overlaps=1)
        return [text[m[0] : m[1]] for m in matches]
コード例 #4
0
ファイル: sandbox.py プロジェクト: matcher/glacier
	def __init__(self):
		"""Initialize the updater and register the supported operations."""
		#initialize parents
		Updater.__init__(self)
		
		self.logger=Logger(self.__class__.__name__)

		#supported operations ("update" is the only one)
		self.ops={"update"	:self._task_update}
コード例 #5
0
ファイル: xmlscraper.py プロジェクト: matcher/glacier
class XMLScrapper(Scrapper,Web):
        """
	XMLScrapper formats merchant items taken from a merchant XML as if they
	had been produced by a scraper module, inside the system's normal work
	flow. It is not a real scraper.

	The module supports the following tasks:

	"scrap" task:

		task.data	: for every item in the task data, "data" must be the
				  item's URL. Each item must also carry:

				  "meta.merchantid"	: the item's merchant ID

				  "meta.xmlitem"	: MerchantItem object with the
							  item's data
							  L{cimri.api.cimriservice.data.merchantitem.MerchantItem}

		task.result	: for every item, "data" will be a MerchantItem with
				  that item's data.
	"""


	def __init__(self):
		#initialize parents
		Web.__init__(self)
		Scrapper.__init__(self)
		
		self.logger=Logger(self.__class__.__name__)

		#supported operations ("scrap" is the only one)
		self.ops={"scrap"		:self._task_scrap}


	def _task_scrap(self):
		"""
		Runs the "scrap" task.
		"""

		self.logger.info("api call...")

		#translate data: the result "data" is the MerchantItem already carried in "meta.xmlitem"
		self.task.result=[ {"data":item["meta.xmlitem"]} for item in self.task.data ]

		#mark as completed
		self._complete()
コード例 #6
0
ファイル: web.py プロジェクト: matcher/glacier
	def __init__(self):
		"""Initialize the web helper: logger, WEB config section, URL-fault flag."""
		#get logger
		self.logger=Logger(self.__class__.__name__)
		
		#get configuration
		self.config=Config.getconfig("WEB")

		# NOTE(review): original comment said "log url errors by default",
		# but the flag is initialized to False (i.e. NOT logged by default)
		self._log_url_faults=False
コード例 #7
0
ファイル: xmlscraper.py プロジェクト: matcher/glacier
	def __init__(self):
		"""Initialize parent classes and register the supported "scrap" operation."""
		#initialize parents
		Web.__init__(self)
		Scrapper.__init__(self)
		
		self.logger=Logger(self.__class__.__name__)

		#supported operations ("scrap" is the only one)
		self.ops={"scrap"		:self._task_scrap}
コード例 #8
0
ファイル: data.py プロジェクト: matcher/glacier
        def list_from_json(cls, packet, path):
                """
                Create multiple instances of this class from a JSON string.

                @type packet:   str
                @param packet:  a JSON string containing a list of objects of this class

                @type path:     list
                @param path:    field names locating the object list inside the JSON
                                document. e.g. if this parameter is ["items"], the
                                objects are found as:

                                data=json.loads(packet)
                                items=data["items"]

                @rtype: list
                @return: a list of instances of this class if the JSON parses
                         cleanly, None otherwise (including when packet is None)
                """

                # nothing to parse
                if packet is None:
                        return None

                # parse and instantiate the objects
                try:
                        # parse
                        data = json.loads(packet)

                        # walk down to the field containing the objects to instantiate
                        for key in path:
                                data = data[key]

                        # a single object is treated as a one-element list
                        if not isinstance(data, list):
                                data = [data]

                        # instantiate one object per list entry
                        return [cls(info) for info in data]

                except Exception:
                        # cls is already a class object, so cls.__name__ is the
                        # right name (cls.__class__.__name__ names the metaclass)
                        Logger.getlogger(cls.__name__).error("exception occurred while parsing json for objects (" + cls.__name__ + ")")

                return None
コード例 #9
0
ファイル: httpapi.py プロジェクト: matcher/glacier
	def __init__(self,url):
		"""
		Initialize the HTTP API client.

		@type	url: string
		@param 	url: the URL to be used for API calls
		"""

		#get logger
		self.logger=Logger.getlogger(self.__class__.__name__)

		#initialize
		super(HttpAPI,self).__init__()
		self.url=url
コード例 #10
0
ファイル: solrapi.py プロジェクト: matcher/glacier
	def __init__(self,url):
		"""
		Initialize the asynchronous Solr API client.

		@type	url: string
		@param 	url: the URL to be used for Solr calls
		"""

		#get logger
		self.logger=Logger.getlogger(self.__class__.__name__)

		#initialize
		super(AsyncSolrAPI,self).__init__()
		self.url=url
コード例 #11
0
ファイル: config.py プロジェクト: pombredanne/glacier
    def _initialize(cls):
        """
        Load the system configuration from the configured file.

        Populates Config.config as {section: {option: value}} from
        cimri/config/config.ini. On a read/parse error the error is logged
        and whatever sections were read before the failure are kept.
        """

        Logger.getlogger("Config").info("initializing configuration manager...")

        # initialize config data
        Config.config = {}

        # read configuration
        parser = SafeConfigParser()
        try:
            parser.read("cimri/config/config.ini")
            for section in parser.sections():
                Config.config[section] = {}
                for option in parser.options(section):
                    Config.config[section][option] = parser.get(section, option)

        except Exception:
            # fixed typo in the logged message: "systen" -> "system"
            Logger.getlogger("Config").error("there was an error reading system configuration file")
コード例 #12
0
ファイル: productxml.py プロジェクト: pombredanne/glacier
    def __init__(self):
        """Initialize parent classes and register the supported operations."""
        Web.__init__(self)
        ProductCrawler.__init__(self)

        self.logger = Logger(self.__class__.__name__)

        # supported operations, dispatched by task type
        self.ops = {
            "discover": self._task_discover,
            "crawl": self._task_crawl,
            "sample": self._task_sample,
            "get": self._task_get,
        }
コード例 #13
0
ファイル: dictionary.py プロジェクト: pombredanne/glacier
    def __init__(self, dictionary):
        """
        @type  dictionary: list
        @param dictionary: the dictionary of terms the chunker will match against
        """

        self.logger = Logger(self.__class__.__name__)

        # build the Aho-Corasick keyword tree, skipping blank terms
        self.logger.info("generating dictionary...")
        self.tree = ahocorasick.KeywordTree()
        for item in dictionary:
            if item.strip() != "":
                self.tree.add(item)
        self.tree.make()
        self.logger.info("finished generating dictionary...")
コード例 #14
0
ファイル: merchant.py プロジェクト: matcher/glacier
	def __init__(self,merchant):
		"""
		@type merchant:	L{cimri.api.cimriservice.data.merchant.MerchantInfo}
		@param merchant: the merchant whose XML feed this object will load
		"""

		#init parent
		super(MerchantXML,self).__init__()

		# bug fix: the original read `self.id=id`, which stored the
		# *builtin* function id() (there is no `id` parameter). The
		# merchant's own id is what was intended here.
		self.id=merchant.merchantId
		self.logger=Logger.getlogger(self.__class__.__name__)
		self.url=merchant.xmlUrl1
		self.xml=None
		self.parser=None
		self.items=[]
		self.encoding=""
コード例 #15
0
ファイル: productxml.py プロジェクト: pombredanne/glacier
class ProductXMLCrawler(ProductCrawler, Web):
    """
    ProductXMLCrawler is not a real crawler in the sense of discovering
    product data by walking web sites. It takes product data from merchant
    XML feeds, but runs at the same level and inside the same work flow as a
    normal web crawler, so the rest of the system sees it as a real crawler.

    Supported task types and their options:

    "crawl" task: scans one or more merchant XMLs and finds the merchant
    items contained in them.

        task.data   : -

        task.meta   :
            "merchants.id"          : cimri-service merchant ID searched for
                                      among the active merchants. If given,
                                      merchants.index and merchants.range
                                      are ignored.

            "merchants.id.alt"      : merchant ID searched for among ALL
                                      merchants. If given, merchants.id,
                                      merchants.index and merchants.range
                                      are ignored.

            "merchants.index"       : index of the first merchant (among the
                                      active ones) to crawl. Defaults to 0.

            "merchants.range"       : number of merchants to crawl. If
                                      absent, all merchants starting at
                                      merchants.index are used.

            "merchants.items.index" : index of the first item to process for
                                      each merchant. Defaults to 0.

            "merchants.items.range" : number of items to process for each
                                      merchant. If absent, all items
                                      starting at merchants.items.index are
                                      processed.

            "merchants.all"         : if present, all merchants — not just
                                      the active ones — are considered.

            "cache.read"            : if present, task data is read from the
                                      cache. The option's value selects the
                                      cache section; if it has no value the
                                      general cache is used.

            "cache.write"           : if present, task results are written
                                      to the cache. The option's value
                                      selects the cache section; if it has
                                      no value the general cache is used.

        task.result : for every item found, "data" is the item's URL. The
                      following meta values are also included per item:

            "meta.merchantid"       : the found item's merchant ID

            "meta.xmlitem"          : MerchantItem with the found item's data
                                      L{cimri.api.cimriservice.data.merchantitem.MerchantItem}

    "sample" task: scans one merchant XML and finds a random set of merchant
    items. Used to create sample data for various tests.

        task.data   : -

        task.meta   :
            "merchants.id"          : cimri-service merchant ID searched for
                                      among the active merchants.

            "sample.size"           : number of items wanted.

            "cache.read" / "cache.write" : as for the "crawl" task.

        task.result : as for the "crawl" task.

    "get" task: looks up MerchantItem data in the merchant XMLs for the
    requested merchant items.

        task.data   : for every requested item, "data" must be a
                      MerchantItem carrying that item's merchantId and
                      merchantItemId
                      (L{cimri.api.cimriservice.data.merchantitem.MerchantItem}).

        task.meta   : "cache.read" / "cache.write" as for the "crawl" task.

        task.result : as for the "crawl" task.

    "discover" task: not used.
    """

    def __init__(self):
        """Initialize parent classes and register the supported operations."""
        Web.__init__(self)
        ProductCrawler.__init__(self)

        self.logger = Logger(self.__class__.__name__)

        # supported operations, dispatched by task type
        self.ops = {
            "discover": self._task_discover,
            "crawl": self._task_crawl,
            "sample": self._task_sample,
            "get": self._task_get,
        }

    def _task_discover(self):
        """
        Not used.
        """

        pass

    def _task_crawl(self):
        """
        Runs the "crawl" task.
        """

        self.logger.info("starting crawler...")

        # get number of workers
        workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

        # get list of merchants according to task
        # parameters: range, status,...
        allmerchants = "merchants.all" in self.task.meta and self.task.meta["merchants.all"] is True
        merchants = self._get_merchants(allmerchants)

        # get range to operate on
        if "merchants.id.alt" in self.task.meta:
            merchants = [
                merchant
                for merchant in merchants
                if str(merchant.merchantId) == str(self.task.meta["merchants.id.alt"])
            ]
        elif "merchants.id" in self.task.meta:
            merchants = [
                merchant for merchant in merchants if str(merchant.merchantId) == str(self.task.meta["merchants.id"])
            ]
        else:
            if "merchants.index" in self.task.meta:
                merchants = merchants[int(self.task.meta["merchants.index"]) :]
            if "merchants.range" in self.task.meta:
                merchants = merchants[: int(self.task.meta["merchants.range"])]

        # progress steps allotted to each merchant
        steps = 10000

        # worker: check all items of one merchant
        @defer.inlineCallbacks
        def checkitems(merchant, work):
            try:
                # reset progress counter
                counter = steps

                # get items for merchant
                try:
                    items = yield self._get_merchant_items(merchant)
                except:
                    items = None
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                # progress counter size per item
                stepby = steps if len(items) == 0 else float(steps) / len(items)

                # get range to operate on
                if "merchants.items.index" in self.task.meta:
                    items = items[int(self.task.meta["merchants.items.index"]) :]
                if "merchants.items.range" in self.task.meta:
                    items = items[: int(self.task.meta["merchants.items.range"])]

                # check items
                for item in items:
                    # TEMP - do not check
                    # check page
                    # res=yield self._check_item(item)
                    res = True

                    # set task result
                    if res:
                        self.task.result.append(
                            {"data": item.merchantItemUrl, "meta.merchantid": merchant.merchantId, "meta.xmlitem": item}
                        )
                    else:
                        msg = Template("item(${id}) url could not verified: $url").substitute(
                            id=str(item.merchantItemId), url=item.merchantItemUrl
                        )
                        self._log_error(msg)

                    self._progress(stepby=stepby)
                    counter = counter - stepby

            except Exception as e:
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                )
                self._log_exception(e, data=data)
                self._fail()

            # flush whatever progress is left for this merchant
            self._progress(stepby=counter)

            # next item
            work.next()

        # initialize progress tracker
        self._progress(len(merchants) * steps)

        # start task distributor
        d = Distributor(checkitems, oncomplete=self._complete, workers=workers)
        d.run()

        # process each merchant
        d.adddata(merchants)

        # when done processing all data
        d.complete()

    @defer.inlineCallbacks
    def _task_sample(self):
        """
        Runs the "sample" task.
        """

        self.logger.info("starting crawler 'sample' task...")

        # get number of workers
        workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

        # get list of merchants
        merchants = self._get_merchants()

        # apply filters
        if "merchants.id" in self.task.meta:
            merchants = [
                merchant
                for merchant in merchants
                if str(merchant.merchantId).strip() == str(self.task.meta["merchants.id"]).strip()
            ]
        self.count = int(self.task.meta["sample.size"]) if "sample.size" in self.task.meta else 100

        # process each merchant
        for merchant in merchants:
            try:
                # get items for merchant
                items = yield self._get_merchant_items(merchant)
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                # shuffle the items so the sample is random
                shuffle(items)

                # worker: check a single item
                @defer.inlineCallbacks
                def checkitem(item, work):
                    try:
                        # check page
                        res = yield self._check_item(item)

                        # check if task was completed while we were waiting to get the item
                        if work.isactive():
                            if res:
                                # set task result
                                self.task.result.append(
                                    {
                                        "data": item.merchantItemUrl,
                                        "meta.merchantid": merchant.merchantId,
                                        "meta.xmlitem": item,
                                    }
                                )

                                # update progress
                                self._progress()

                                # done?
                                self.count = self.count - 1
                                if self.count < 1:
                                    work.complete()

                            else:
                                msg = Template("item(${id}) url could not verified: $url").substitute(
                                    id=str(item.merchantItemId), url=item.merchantItemUrl
                                )
                                self._log_error(msg)

                    except Exception as e:
                        data = Template("exception while processing item(${id}): $url").substitute(
                            id=str(merchant.merchantId), url=item.merchantItemUrl
                        )
                        self._log_exception(e, data=data)

                    # next item
                    work.next()

                # initialize progress tracker
                self._progress(self.count)

                # distribute task
                d = Distributor(checkitem, oncomplete=self._complete, workers=workers)
                d.run()
                d.adddata(items)
                d.complete()  # when done processing all data

            except Exception as e:
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                )
                self._log_exception(e, data=data)
                self._fail()

    def _task_get(self):
        """
        Runs the "get" task.
        """

        self.logger.info("starting crawler...")

        # get number of workers
        workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

        # get merchant ids we're looking for
        merchantids = list(set([it.merchant["merchantId"] for it in self.task.data]))

        # get list of merchants according to task
        merchants = []
        for id in merchantids:
            merchant = self._get_merchant(id)
            if merchant is not None:
                merchants.append(merchant)

        # worker to check items of one merchant
        @defer.inlineCallbacks
        def checkitems(merchant, work):
            try:
                # get item ids we're looking for
                ids = [it.merchantItemId for it in self.task.data if it.merchant["merchantId"] == merchant.merchantId]

                # get items for merchant
                items = yield self._get_merchant_items(merchant)
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                # keep only the merchant items that were requested
                items = [item for item in items if item.merchantItemId in ids]

                # process items
                for item in items:
                    # TEMP - do not check
                    # check page
                    # res=yield self._check_item(item)
                    res = True

                    # set task result
                    if res:
                        self.task.result.append(
                            {"data": item.merchantItemUrl, "meta.merchantid": merchant.merchantId, "meta.xmlitem": item}
                        )
                    else:
                        msg = Template("item(${id}) url could not verified: $url").substitute(
                            id=str(item.merchantItemId), url=item.merchantItemUrl
                        )
                        self._log_error(msg)

                    # update progress
                    self._progress()

            except Exception as e:
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                )
                self._log_exception(e, data)

            # next item
            work.next()

        # initialize progress tracker
        self._progress(len(self.task.data))

        # start task distributor
        d = Distributor(checkitems, oncomplete=self._complete, workers=workers)
        d.run()

        # process each merchant
        d.adddata([merchant for merchant in merchants if merchant.merchantId in merchantids])

        # when done processing all data
        d.complete()

    def _get_merchants(self, all=False):
        """
        Get the list of merchants from the Cimri Service.

        @type all:	bool
        @param all:	if True all merchants are fetched, otherwise only the active ones

        @rtype:	list (L{cimri.api.cimriservice.data.merchant.MerchantInfo})
        @return: the merchant list (empty list on error)
        """

        self.logger.info("getting merchants...")

        # get active merchants
        api = MerchantsAPI()
        if all is True:
            merchants = api.get_merchants()
        else:
            merchants = api.get_merchants(status=MerchantInfo.STATUS_ACTIVE)
        if merchants is None:
            self._log_error("error getting cimri service merchant list")
            self.logger.warn("did not get any merchants from cimri-service")
            return []

        self.logger.info("number of merchants retrieved: " + str(len(merchants)))

        # get only the active ones
        # merchants=[merchant for merchant in merchants if merchant.status==MerchantInfo.STATUS_ACTIVE]

        self.logger.info("number of active merchants found: " + str(len(merchants)))

        return merchants

    def _get_merchant(self, id):
        """
        Get a specific merchant from the Cimri Service.

        @type id: str
        @param id: merchant ID

        @rtype: L{cimri.api.cimriservice.data.merchant.MerchantInfo}
        @return: the MerchantInfo object, or None on error
        """

        self.logger.info("getting merchant..." + str(id))

        # get active merchants
        api = MerchantsAPI()
        merchant = api.get_merchant(id)
        if merchant is None:
            self._log_error("error getting cimri service merchant " + str(id))
            self.logger.warn("did not get merchant from cimri-service")
            return None

        return merchant

    def _get_merchant_items(self, merchant):
        """
        Asynchronously get all merchant items for a merchant.

        @type merchant:	L{cimri.api.cimriservice.data.merchant.MerchantInfo}
        @param merchant: the merchant whose items are wanted

        @rtype: L{twisted.internet.defer.Deferred}
        @return: a Deferred that will fire with the merchant items
        """

        return deferToThread(self._get_merchant_items_async, merchant)

    def _get_merchant_items_async(self, merchant):
        """
        Get all merchant items for a merchant (blocking; runs in a thread).

        @type merchant:	L{cimri.api.cimriservice.data.merchant.MerchantInfo}
        @param merchant: the merchant whose items are wanted

        @rtype: list (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})
        @return: the requested merchant items, or None if the XML failed to load
        """

        self.logger.info("getting merchant items for ... " + str(merchant.merchantId) + ":" + merchant.merchantName)

        # check cache first
        items = self._get_cached_merchant_items(merchant.merchantId)
        if items is None:
            # load items for merchant
            xml = MerchantXML(merchant)
            res = xml.load()
            if res is False:
                return None

            # get items
            items = xml.getitems()

            # cache
            self._cache_merchant_items(items, merchant.merchantId)

        # add merchant id
        for item in items:
            item.merchant = {"merchantId": merchant.merchantId}

        return items

    def _check_item(self, item):
        """
        Asynchronously check whether a merchant item's URL is reachable.

        @type item: L{cimri.api.cimriservice.data.merchantitem.MerchantItem}
        @param item: the merchant item to check

        @rtype: L{twisted.internet.defer.Deferred}
        @return: a Deferred that will fire with the result of the check
        """

        return deferToThread(self.ping, item.merchantItemUrl)

    def _cache_merchant_items(self, items, merchantid):
        """
        Store the given merchant items in the cache.

        @type items: list (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})
        @param items: the merchant items to store

        @type merchantid: str
        @param merchantid: ID of the merchant the items belong to
        """

        # check if we should cache
        if "cache.write" not in self.task.meta:
            return

        # get section to cache to
        section = self.task.meta["cache.write"]

        # write to cache
        Cache(section).set(
            "crawler.productxml.items." + str(merchantid), json.dumps([item.to_dict() for item in items])
        )

    def _get_cached_merchant_items(self, merchantid):
        """
        Get the cached merchant items for a specific merchant.

        @type merchantid: str
        @param merchantid: ID of the merchant the items belong to

        @rtype: list (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})
        @return: the cached item list; None if the cache section does not
                 exist or an error occurs
        """

        # check if we should use the cache
        if "cache.read" not in self.task.meta:
            return None

        # get section to use
        section = self.task.meta["cache.read"]

        # read cache
        content = Cache(section).get("crawler.productxml.items." + str(merchantid))
        if content is None:
            return None

        # parse
        try:
            return MerchantItem.list_from_json(content, [])

        except Exception as e:
            return None
コード例 #16
0
ファイル: module.py プロジェクト: matcher/glacier
""" 
コード例 #17
0
ファイル: web.py プロジェクト: matcher/glacier
class Web(object):
	"""
	Used for HTTP operations: pinging a URL, loading its content as a string,
	or downloading it to a file, with optional content caching.
	"""

	def __init__(self):
		# logger named after the concrete class
		self.logger = Logger(self.__class__.__name__)

		# configuration section for web operations (provides "url_open_timeout")
		self.config = Config.getconfig("WEB")

		# whether URL faults are written to the log (disabled by default)
		self._log_url_faults = False

	def ping(self, url):
		"""
		Checks whether a URL is reachable.

		@type url:	str
		@param url:	the URL to test

		@rtype:	L{cimri.system.web.WebReport}
		@return: WebReport object containing the ping results
		"""
		return self.get(url, ping=True)

	def get(self, url, unicode=True, download=False, file=None, ping=False, cache=None, timeout=None):
		"""
		Loads the content at a URL, or downloads it as a file.

		NOTE(review): the "unicode" and "file" parameters shadow builtins; they
		are kept as-is because callers may pass them by keyword.

		@type url:	str
		@param url:	the URL to open

		@type unicode:	bool
		@param unicode: whether the content at the URL should be treated as unicode

		@type download:	bool
		@param download: if True the content is downloaded as a file, otherwise
				it is returned as a string on the report

		@type file:	str
		@param file:	path and name of the target file when downloading

		@type ping:	bool
		@param ping:	only check that the URL is accessible; no content is
				loaded or downloaded

		@type cache:	dict
		@param cache:	controls caching of URL content. if None, no cache
				operations are performed. if the dictionary has a "read" key,
				the content is looked up in that cache section before hitting
				the URL; if it has a "write" key, fetched content is written
				to that cache section.

		@type timeout:	int
		@param timeout: URL open/download timeout in seconds; defaults to the
				"url_open_timeout" configuration value

		@rtype:	L{cimri.system.web.WebReport}
		@return: WebReport object containing the results; report.error is set
				on failure instead of raising
		"""

		# initialize report
		report = WebReport(url)

		# file handle used only when downloading to disk
		f = None

		# initialize up front: if URL() or the config lookup raises, the
		# bookkeeping after the try/except must not hit an UnboundLocalError
		# (which would mask the real error recorded on the report)
		res = None

		try:
			# resolve timeout, falling back to configuration
			timeout = int(self.config.get("url_open_timeout")) if timeout is None else timeout

			# create url resource
			res = URL(url)

			if ping:
				# open only, to check whether the url is accessible
				res.open(timeout=timeout)

			elif download:
				# record target file name
				report.file = file

				# open the target file and write the downloaded content
				f = open(file, 'w')
				f.write(res.download(timeout=timeout, cached=False))

			elif unicode:
				# use cached version?
				if cache is not None and "read" in cache:
					report.content = Cache(cache["read"]).get("web.url." + hash_url(url))

				# download url (if not looking for a cached version or if the cached version not found)
				if report.content is None:
					report.content = res.download(timeout=timeout, cached=False)

					# write to cache?
					if cache is not None and "write" in cache:
						Cache(cache["write"]).set("web.url." + hash_url(url), report.content)

			else:
				# use cached version?
				if cache is not None and "read" in cache:
					report.content = Cache(cache["read"]).get("web.url." + hash_url(url))

				# download url (if not looking for a cached version or if the cached version not found)
				if report.content is None:
					res.open(timeout=timeout)
					report.content = res.download(cached=False, timeout=timeout)

					# write to cache?
					if cache is not None and "write" in cache:
						Cache(cache["write"]).set("web.url." + hash_url(url), report.content)

		except HTTP400BadRequest as e:
			report.error = WebError("exception ocurred opening url. bad request", 400, url)
			self.log(str(report.error))
			self.log(str(e))

		except HTTP401Authentication as e:
			report.error = WebError("exception ocurred opening url. url requires authentication", 401, url)
			self.log(str(report.error))
			self.log(str(e))

		except HTTP403Forbidden as e:
			report.error = WebError("exception ocurred opening url. url not accessible", 403, url)
			self.log(str(report.error))
			self.log(str(e))

		except HTTP404NotFound as e:
			report.error = WebError("exception ocurred opening url. not found", 404, url)
			self.log(str(report.error))
			self.log(str(e))

		except HTTPError as e:
			report.error = WebError("exception ocurred opening url", None, url)
			self.log(str(report.error))
			self.log(str(e))

		except URLError as e:
			report.error = WebError("exception ocurred opening url. url contains errors", None, url)
			self.log(str(report.error))
			self.log(str(e))

		except URLTimeout as e:
			report.error = WebError("exception ocurred opening url. url load timed out", None, url)
			self.log(str(report.error))
			self.log(str(e))

		except IOError as e:
			report.error = FileError("exception ocurred writing to file", file)
			self.log(str(report.error))
			self.log(str(e))

		except Exception as e:
			report.error = WebError("exception ocurred", None, url)
			self.log(str(report.error))
			self.log(str(e))

		finally:
			if f is not None:
				f.close()

		# add response information; guarded because res stays None when the
		# URL resource could not even be created
		if res is not None:
			report.headers = res.headers
		report.query = res.query if (res is not None and res.query is not None) else None
		report.content_url = res.redirect if (res is not None and res.redirect is not None) else report.content_url

		return report

	def log(self, msg):
		# URL faults are only logged when explicitly enabled
		if self._log_url_faults is True:
			self.logger.error(msg)
コード例 #18
0
ファイル: cimriservice.py プロジェクト: matcher/glacier
class CimriServiceUpdater(Updater):
        """
	CimriServiceUpdater sistem icindeki diger modullerin belirlenen sonuclarini alir ve CimriService uzerinde 
	belirlenen itemlar icin guncellemeler yapar.

	Modul tarafindan desteklenen islem turleri ve opsiyonlari su sekildedir:

        "update" islemi: belirlenen itemlari Cimri Service'de update eder ya da ekler

                task.data	: islemden gecirilmesi istenilen her item icin "data" o item'in bilgilerini iceren
				  MerchantItem olmalidir (L{cimri.api.cimriservice.merchantitem.MerchantItem})

				  bunun yaninda, her item icin asagidaki veriler de saglanmalidir:

                                  "meta.action"                 : "update" ya da "insert" degerini icermelidir.
								  item uzerinde backendde yapilmasi istenilen
								  operasyonu belirler.

                task.meta       : -

                task.result     : islemden gecirilmesi istenilen her item icin "data" o item icin guncellenmis
				  MerchantItem'dir (L{cimri.api.cimriservice.merchantitem.MerchantItem})

				  bunun yaninda, her item icin asagidaki veriler de saglanir:

                                  "meta.result"	                : "success" ya da "fail" degerlerinden birini icerir

				  "meta.error"			: eger operasyonda bir hata olusursa, hata hakkinda
								  bilgiler icerir

        """

	def __init__(self):
		"""Set up the updater: parent state, logger, operations, chunk size."""
		Updater.__init__(self)

		self.logger = Logger(self.__class__.__name__)

		# dispatch table of supported operations
		self.ops = {"update": self._task_update}

		# number of items sent to the service per call
		self._update_size = 1000


	def _task_update(self):
		"""
		Runs the "update" operation: partitions the task items by merchant and
		distributes chunked clean/update service calls across the workers.
		"""

		self.logger.info("api call...")

		# updater stats
		self.task.stats["data"]["actions"] = {"total": 0, "insert": 0, "update": 0, "clean": 0}

		# get # of workers (default: 1)
		workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

		# oncomplete handler
		def oncomplete():
			# mark as completed
			self._complete()

		# worker body: items is one merchant's item list, work is the distributor handle
		@defer.inlineCallbacks
		def updateitems(items, work):
			# "clean" actions, issued in chunks of self._update_size
			data = [item["data"] for item in items if item["meta.action"] == "clean"]
			chunks = int(math.ceil(float(len(data)) / self._update_size))
			for index in range(chunks):
				try:
					yield self._clean(data[index * self._update_size:(index + 1) * self._update_size])
				except Exception as e:
					self._log_exception(e, data=None)

				# update progress
				self._progress()

			# "insert"/"update" actions, issued in chunks of self._update_size
			data = [item["data"] for item in items if item["meta.action"] in ["insert", "update"]]
			chunks = int(math.ceil(float(len(data)) / self._update_size))
			for index in range(chunks):
				try:
					yield self._update(data[index * self._update_size:(index + 1) * self._update_size])
				except Exception as e:
					self._log_exception(e, data=None)

				# update progress
				self._progress()

			# request the next batch of work
			work.next()

		# partition data by merchants
		merchants = {}
		for item in self.task.data:
			mid = item["data"].merchant["merchantId"]
			if mid not in merchants:
				merchants[mid] = []
			merchants[mid].append(item)

			# record stats; unrecognized actions only count towards "total"
			# instead of raising a KeyError on the per-action counters
			self.task.stats["data"]["actions"]["total"] = self.task.stats["data"]["actions"]["total"] + 1
			action = item["meta.action"]
			if action in self.task.stats["data"]["actions"]:
				self.task.stats["data"]["actions"][action] += 1

		# figure out progress: total number of chunked service calls
		size = 0
		for id in merchants:
			cleanops = len([item for item in merchants[id] if item["meta.action"] == "clean"])
			size = size + int(math.ceil(float(cleanops) / self._update_size))
			size = size + int(math.ceil(float(len(merchants[id]) - cleanops) / self._update_size))

		# initialize progress tracker
		self._progress(size)

		# distribute task; complete when all data is processed
		d = Distributor(updateitems, oncomplete=oncomplete, workers=workers)
		d.run()
		d.adddata(merchants.values())
		d.complete()


	def _update(self, items):
		"""
		Updates a batch of merchant items on the cimri service.

		@type  items: list (L{cimri.api.cimriservice.merchantitem.MerchantItem})
		@param items: items to be updated

		@rtype: L{twisted.internet.defer.Deferred}
		@return: a Deferred that will receive the update results
		"""

		def push(batch):
			api = MerchantsAPI()
			res = api.update_items(batch)
			if res is False:
				self._log_error("cimriservice update failed")
			return res

		# stamp every item with the update time and the system operator, and
		# convert unicode values to strings so utf8 sequences are interpreted
		# as utf8, all in a single pass
		ts = time.strftime("%Y-%m-%d %H:%M:%S", datetime.datetime.now().timetuple())
		for item in items:
			item.lastUpdateDate = ts
			item.operator = {"operatorId": 0}
			item.merchantItemTitle = convert_unicode_to_utf8str(item.merchantItemTitle, item.encoding)
			item.brand = convert_unicode_to_utf8str(item.brand, item.encoding)
			item.modelNameView = convert_unicode_to_utf8str(item.modelNameView, item.encoding)

		# run the service call off the reactor thread
		return deferToThread(push, items)


	def _clean(self, items):
		"""
		Clears the item ids of a batch of merchant items on the cimri service.

		@type  items: list (L{cimri.api.cimriservice.merchantitem.MerchantItem})
		@param items: items to be updated

		@rtype: L{twisted.internet.defer.Deferred}
		@return: a Deferred that will receive the update results
		"""

		def push(batch):
			api = MerchantsAPI()
			result = api.clean_paused_items(batch)
			if result is False:
				self._log_error("cimriservice clean failed")
			return result

		# run the service call off the reactor thread
		return deferToThread(push, items)