def from_json(cls, packet):
    """
    Create a new instance of this class from a JSON string.

    @type  packet: str
    @param packet: JSON string containing the fields of the class

    @rtype: object
    @return: an instance of this class if the JSON could be parsed and the
             constructor accepted it, otherwise None
    """
    # nothing to parse
    if packet is None:
        return None

    # parse and instantiate
    try:
        return cls(json.loads(packet))
    except Exception:
        # cls is already the class object here; cls.__class__ would be its
        # metaclass, which made every log entry read "(type)"
        Logger.getlogger(cls.__name__).error("exception ocurred while parsing json for objects ("+cls.__name__+")")
        return None
class SandboxUpdater(Updater):
    """
    SandboxUpdater is meant for development and tests. It does not update any
    data on Cimri Service.

    Operation types and options supported by the module:

    "update" operation: simulates the given update directives

            task.data   : for every item to be processed, "data" must be a MerchantItem holding that
                          item's information (L{cimri.api.cimriservice.merchantitem.MerchantItem})

                          in addition, the following must be provided for every item:

                          "meta.action"   : must contain "update" or "insert". determines the
                                            operation requested on the backend for the item.

            task.meta   : -

            task.result : for every processed item, "data" is the updated MerchantItem for that item
                          (L{cimri.api.cimriservice.merchantitem.MerchantItem})

                          in addition, the following is provided for every item:

                          "meta.result"   : one of "success" or "fail"
                          "meta.error"    : information about the error, if the operation failed
    """

    def __init__(self):
        # initialize parents
        Updater.__init__(self)

        # logger named after the concrete class
        class_name = self.__class__.__name__
        self.logger = Logger(class_name)

        # supported operations
        handlers = {}
        handlers["update"] = self._task_update
        self.ops = handlers

    def _task_update(self):
        """
        Runs the "update" operation: every input item is reported as a success.
        """
        self.logger.info("api call...")

        # NOTE(review): the class docstring describes task.result / "meta.result",
        # but the outcome is stored on self.result under the key "result" - confirm
        # which one the callers expect.
        simulated = []
        for entry in self.task.data:
            simulated.append({"data": entry["data"], "result": "success"})
        self.result = simulated
class DictionaryChunker:
    """
    Aho-Corasick chunker algorithm

    java:
    http://alias-i.com/lingpipe/docs/api/com/aliasi/dict/ExactDictionaryChunker.html
    http://alias-i.com/lingpipe/demos/tutorial/ne/read-me.html

    python:
    http://pypi.python.org/pypi/ahocorasick/0.9
    http://nltk.github.com/api/nltk.chunk.html#id1
    """

    def __init__(self, dictionary):
        """
        @type  dictionary: list
        @param dictionary: dictionary of terms to be used by the chunker
        """
        self.logger = Logger(self.__class__.__name__)

        # build the keyword tree, skipping terms that are blank after stripping
        self.logger.info("generating dictionary...")
        tree = ahocorasick.KeywordTree()
        self.tree = tree
        for term in dictionary:
            if term.strip() == "":
                continue
            tree.add(term)
        tree.make()
        self.logger.info("finished generating dictionary...")

    def find_all(self, text):
        """
        Looks up every dictionary term that occurs in a text.

        @type  text: string
        @param text: text in which the terms are searched

        @rtype: list
        @return: dictionary terms that were found
        """
        found = []
        for match in self.tree.findall(text, allow_overlaps=1):
            # the first two elements of a match are used as slice bounds into text
            begin = match[0]
            end = match[1]
            found.append(text[begin:end])
        return found
def __init__(self):
    """
    Initializes the parent updater, attaches a logger named after the
    concrete class and registers the supported operations.
    """
    # initialize parents
    Updater.__init__(self)

    # logger named after the concrete class
    class_name = self.__class__.__name__
    self.logger = Logger(class_name)

    # supported operations
    handlers = {}
    handlers["update"] = self._task_update
    self.ops = handlers
class XMLScrapper(Scrapper, Web):
    """
    XMLScrapper formats merchant items taken from a merchant XML so that, within the
    system's processing flow, they look as if a scraper module had created them.
    It is not a real scraper.

    The module supports the following operations:

    "scrap" operation :

            task.data   : for every item in the task data, "data" must be that item's URL.

                          the following must also be present for every item:

                          "meta.merchantid" : merchant ID of the item
                          "meta.xmlitem"    : MerchantItem object holding the item information
                                              L{cimri.api.cimriservice.data.merchantitem.MerchantItem}

            task.result : for every item, "data" will be a MerchantItem holding that item's information.
    """

    def __init__(self):
        # initialize parents
        Web.__init__(self)
        Scrapper.__init__(self)

        # logger named after the concrete class
        class_name = self.__class__.__name__
        self.logger = Logger(class_name)

        # supported operations
        handlers = {}
        handlers["scrap"] = self._task_scrap
        self.ops = handlers

    def _task_scrap(self):
        """
        Runs the "scrap" job.
        """
        self.logger.info("api call...")

        # translate data: the XML item already carried in the meta becomes the result
        translated = []
        for entry in self.task.data:
            translated.append({"data": entry["meta.xmlitem"]})
        self.task.result = translated

        # mark as completed
        self._complete()
def __init__(self):
    """
    Sets up the logger, the "WEB" configuration section and the URL fault
    logging flag.
    """
    # logger named after the concrete class
    class_name = self.__class__.__name__
    self.logger = Logger(class_name)

    # configuration section for web operations
    web_config = Config.getconfig("WEB")
    self.config = web_config

    # URL fault logging flag starts out disabled
    self._log_url_faults = False
def __init__(self):
    """
    Initializes both parents (Web first, then Scrapper), attaches a logger
    named after the concrete class and registers the supported operations.
    """
    # initialize parents
    Web.__init__(self)
    Scrapper.__init__(self)

    # logger named after the concrete class
    class_name = self.__class__.__name__
    self.logger = Logger(class_name)

    # supported operations
    handlers = {}
    handlers["scrap"] = self._task_scrap
    self.ops = handlers
def list_from_json(cls, packet, path):
    """
    Create several instances of this class from a JSON string.

    @type  packet: str
    @param packet: JSON string containing a list of objects of this class

    @type  path: list
    @param path: list of field names giving the path, inside the JSON object, where the
                 objects are located. for example, if this parameter is ["items"], the
                 objects are found as follows:

                 data=json.loads(packet)
                 items=data["items"]

                 an empty path means the top-level JSON value itself is used.

    @rtype: list
    @return: list of instances of this class if parsing and instantiation succeeded,
             otherwise None
    """
    # nothing to parse
    if packet is None:
        return None

    # parse and instantiate objects
    try:
        data = json.loads(packet)

        # walk down to the field containing the objects to instantiate
        for key in path:
            data = data[key]

        # a single object is treated as a one-element list
        if not isinstance(data, list):
            data = [data]

        return [cls(info) for info in data]

    except Exception:
        # cls is already the class object here; cls.__class__ would be its
        # metaclass, which made every log entry read "(type)"
        Logger.getlogger(cls.__name__).error("exception ocurred while parsing json for objects ("+cls.__name__+")")
        return None
def __init__(self, url):
    """
    @type  url: string
    @param url: URL to be used for the API calls
    """
    # logger named after the concrete class
    class_name = self.__class__.__name__
    self.logger = Logger.getlogger(class_name)

    # run the parent initializer, then remember the endpoint
    super(HttpAPI, self).__init__()
    self.url = url
def __init__(self, url):
    """
    @type  url: string
    @param url: URL to be used for the Solr calls
    """
    # logger named after the concrete class
    class_name = self.__class__.__name__
    self.logger = Logger.getlogger(class_name)

    # run the parent initializer, then remember the endpoint
    super(AsyncSolrAPI, self).__init__()
    self.url = url
def _initialize(cls):
    """
    Loads the system configuration from the predefined file into Config.config
    as a dictionary of sections, each mapping option names to their values.
    """
    Logger.getlogger("Config").info("initializing configuration manager...")

    # start from an empty configuration
    Config.config = {}

    # read configuration
    parser = SafeConfigParser()
    try:
        parser.read("cimri/config/config.ini")
        for section in parser.sections():
            # register the section first so it exists even if an option fails to load
            values = {}
            Config.config[section] = values
            for option in parser.options(section):
                values[option] = parser.get(section, option)

    except Exception:
        # any failure is only logged; whatever was loaded so far is kept
        Logger.getlogger("Config").error("there was an error reading systen configuration file")
def __init__(self):
    """
    Initializes both parents (Web first, then ProductCrawler), attaches a
    logger named after the concrete class and registers the supported operations.
    """
    Web.__init__(self)
    ProductCrawler.__init__(self)

    # logger named after the concrete class
    class_name = self.__class__.__name__
    self.logger = Logger(class_name)

    # supported operations
    handlers = {}
    handlers["discover"] = self._task_discover
    handlers["crawl"] = self._task_crawl
    handlers["sample"] = self._task_sample
    handlers["get"] = self._task_get
    self.ops = handlers
def __init__(self, dictionary):
    """
    @type  dictionary: list
    @param dictionary: dictionary of terms to be used by the chunker
    """
    self.logger = Logger(self.__class__.__name__)

    # build the keyword tree, skipping terms that are blank after stripping
    self.logger.info("generating dictionary...")
    tree = ahocorasick.KeywordTree()
    self.tree = tree
    for term in dictionary:
        if term.strip() == "":
            continue
        tree.add(term)
    tree.make()
    self.logger.info("finished generating dictionary...")
def __init__(self, merchant):
    """
    @type  merchant: L{cimri.api.cimriservice.data.merchant.MerchantInfo}
    @param merchant: merchant whose XML feed this object represents
    """
    # init parent
    super(MerchantXML, self).__init__()

    # There is no `id` parameter, so the original `self.id=id` stored the
    # builtin id() function. The merchant's own ID is used instead.
    # NOTE(review): assumes merchantId is the intended identifier - confirm.
    self.id = getattr(merchant, "merchantId", None)

    self.logger = Logger.getlogger(self.__class__.__name__)

    # self.url=Config.getconfig("API").get("cimri_merchant_xml_url")+str(merchant.merchantId)
    self.url = merchant.xmlUrl1

    # nothing is loaded or parsed yet
    self.xml = None
    self.parser = None
    self.items = []
    self.encoding = ""
class ProductXMLCrawler(ProductCrawler, Web):
    """
    ProductXMLCrawler is not a real crawler in the sense of finding product information by
    walking web sites. It takes product information from merchant XMLs and runs at the same
    level of the processing flow at which a regular web crawler would run. To the rest of the
    system it looks like a real crawler.

    Operation types and options supported by the module:

    "crawl" operation: scans one or more merchant XMLs and finds the merchant items in them

            task.data   : -

            task.meta   : "merchants.id"          : cimri-service merchant ID looked up among the active
                                                    merchants. if a value is given for this option, the
                                                    merchants.index and merchant.range options are ignored

                          "merchants.id.alt"      : cimri-service merchant ID looked up among all merchants.
                                                    if a value is given for this option, the merchants.id,
                                                    merchants.index and merchants.range options are ignored

                          "merchants.index"       : index of the first merchant, among the active merchants,
                                                    to be used for the crawl. 0 is used if no value is given

                          "merchants.range"       : number of merchants, among the active merchants, to be
                                                    used for the crawl. if no value is given, all merchants
                                                    starting from merchants.index are used.

                          "merchants.items.index" : index of the first item to be processed among the items
                                                    of a merchant. 0 is used if no value is given.

                          "merchants.items.range" : number of items to be processed for a merchant. if no
                                                    value is given, all items starting from
                                                    merchants.items.index are processed.

                          "merchants.all"         : when this option is present with the value True, all
                                                    merchants rather than only the active ones are considered.

                          "cache.read"            : if this option is present, the data is taken from the
                                                    cache. if the option has a value that cache section is
                                                    used, otherwise the general cache is used.

                          "cache.write"           : if this option is present, the results are written to the
                                                    cache. if the option has a value that cache section is
                                                    used, otherwise the general cache is used.

            task.result : for every item found, "data" will be the URL of that item.
                          the following meta data is also included for every item:

                          "meta.merchantid"       : merchant ID of the item found
                          "meta.xmlitem"          : MerchantItem holding the information of the item found
                                                    L{cimri.api.cimriservice.data.merchantitem.MerchantItem}

    "sample" operation: scans a merchant XML and finds a random number of merchant items.
                        used to create sample data for various tests.

            task.data   : -

            task.meta   : "merchants.id"          : cimri-service merchant ID looked up among the active
                                                    merchants. if a value is given for this option, the
                                                    merchants.index and merchant.range options are ignored

                          "sample.size"           : number of items requested (100 if not given)

                          "cache.read"            : if this option is present, the data is taken from the
                                                    cache. if the option has a value that cache section is
                                                    used, otherwise the general cache is used.

                          "cache.write"           : if this option is present, the results are written to the
                                                    cache. if the option has a value that cache section is
                                                    used, otherwise the general cache is used.

            task.result : for every item found, "data" will be the URL of that item.
                          the following meta data is also included for every item:

                          "meta.merchantid"       : merchant ID of the item found
                          "meta.xmlitem"          : MerchantItem holding the information of the item found
                                                    L{cimri.api.cimriservice.data.merchantitem.MerchantItem}

    "get" operation: finds the MerchantItem information of the given merchant items in the merchant XMLs

            task.data   : for every item to be found, "data" must be a MerchantItem object holding the
                          merchantId and merchantItemId of that item
                          (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})

            task.meta   : "cache.read"            : if this option is present, the data is taken from the
                                                    cache. if the option has a value that cache section is
                                                    used, otherwise the general cache is used.

                          "cache.write"           : if this option is present, the results are written to the
                                                    cache. if the option has a value that cache section is
                                                    used, otherwise the general cache is used.

            task.result : for every item found, "data" will be the URL of that item.
                          the following meta data is also included for every item:

                          "meta.merchantid"       : merchant ID of the item found
                          "meta.xmlitem"          : MerchantItem holding the information of the item found
                                                    L{cimri.api.cimriservice.data.merchantitem.MerchantItem}

    "discover" operation: not used.
    """

    def __init__(self):
        # initialize parents
        Web.__init__(self)
        ProductCrawler.__init__(self)

        # logger named after the concrete class
        self.logger = Logger(self.__class__.__name__)

        # supported operations
        self.ops = {
            "discover": self._task_discover,
            "crawl": self._task_crawl,
            "sample": self._task_sample,
            "get": self._task_get,
        }

    def _task_discover(self):
        """
        Not used.
        """
        pass

    def _task_crawl(self):
        """
        Runs the "crawl" job.
        """
        self.logger.info("starting crawler...")

        # get number of workers
        workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

        # get list of merchants according to task
        # parameters: range, status,...
        # (only the exact value True for "merchants.all" selects all merchants)
        allmerchants = "merchants.all" in self.task.meta and self.task.meta["merchants.all"] is True
        merchants = self._get_merchants(allmerchants)

        # get range to operate on; an explicit merchant ID takes precedence over index/range
        if "merchants.id.alt" in self.task.meta:
            merchants = [
                merchant for merchant in merchants if str(merchant.merchantId) == str(self.task.meta["merchants.id.alt"])
            ]
        elif "merchants.id" in self.task.meta:
            merchants = [
                merchant for merchant in merchants if str(merchant.merchantId) == str(self.task.meta["merchants.id"])
            ]
        else:
            if "merchants.index" in self.task.meta:
                merchants = merchants[int(self.task.meta["merchants.index"]) :]
            if "merchants.range" in self.task.meta:
                merchants = merchants[: int(self.task.meta["merchants.range"])]

        # progress steps for each merchant
        steps = 10000

        # check items (worker handed to the Distributor below)
        @defer.inlineCallbacks
        def checkitems(merchant, work):
            try:
                # reset progress counter: the part of this merchant's budget not yet reported
                counter = steps

                # get items for merchant
                # NOTE(review): the bare except treats any failure while loading as
                # "no items" - confirm that swallowing everything is intended
                try:
                    items = yield self._get_merchant_items(merchant)
                except:
                    items = None
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                # progress counter size (computed before the index/range slicing below)
                stepby = steps if len(items) == 0 else float(steps) / len(items)

                # get range to operate on
                if "merchants.items.index" in self.task.meta:
                    items = items[int(self.task.meta["merchants.items.index"]) :]
                if "merchants.items.range" in self.task.meta:
                    items = items[: int(self.task.meta["merchants.items.range"])]

                # check items
                for item in items:
                    # TEMP - do not check
                    # check page
                    # res=yield self._check_item(item)
                    res = True

                    # set task result
                    if res:
                        self.task.result.append(
                            {"data": item.merchantItemUrl, "meta.merchantid": merchant.merchantId, "meta.xmlitem": item}
                        )
                    else:
                        msg = Template("item(${id}) url could not verified: $url").substitute(
                            id=str(item.merchantItemId), url=item.merchantItemUrl
                        )
                        self._log_error(msg)

                    self._progress(stepby=stepby)
                    counter = counter - stepby

            except Exception as e:
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                )
                self._log_exception(e, data=data)
                self._fail()

            # update progress with whatever is left of this merchant's budget
            self._progress(stepby=counter)

            # next item
            work.next()

        # initialize progress tracker
        self._progress(len(merchants) * steps)

        # start task distributor
        d = Distributor(checkitems, oncomplete=self._complete, workers=workers)
        d.run()

        # process each merchant
        d.adddata(merchants)

        # when done processing all data
        d.complete()

    @defer.inlineCallbacks
    def _task_sample(self):
        """
        Runs the "sample" job.
        """
        self.logger.info("starting crawler 'sample' task...")

        # get number of workers
        workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

        # get list of merchants (active merchants only)
        merchants = self._get_merchants()

        # apply filters
        if "merchants.id" in self.task.meta:
            merchants = [
                merchant
                for merchant in merchants
                if str(merchant.merchantId).strip() == str(self.task.meta["merchants.id"]).strip()
            ]
        # number of items still to be collected; decremented by the workers below
        self.count = int(self.task.meta["sample.size"]) if "sample.size" in self.task.meta else 100

        # process each merchant
        # NOTE(review): a Distributor with oncomplete=self._complete is created per merchant,
        # so _complete looks reachable more than once when several merchants match - confirm
        for merchant in merchants:
            try:
                # get items for merchant
                items = yield self._get_merchant_items(merchant)
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                # shuffle the items
                shuffle(items)

                # check items (worker handed to the Distributor below)
                @defer.inlineCallbacks
                def checkitem(item, work):
                    try:
                        # check page
                        res = yield self._check_item(item)

                        # check if task was completed while we were waiting to get the item
                        if work.isactive():
                            if res:
                                # set task result
                                self.task.result.append(
                                    {
                                        "data": item.merchantItemUrl,
                                        "meta.merchantid": merchant.merchantId,
                                        "meta.xmlitem": item,
                                    }
                                )

                                # update progress
                                self._progress()

                                # done?
                                self.count = self.count - 1
                                if self.count < 1:
                                    work.complete()
                            else:
                                msg = Template("item(${id}) url could not verified: $url").substitute(
                                    id=str(item.merchantItemId), url=item.merchantItemUrl
                                )
                                self._log_error(msg)

                    except Exception as e:
                        # NOTE(review): the message says item(...) but the merchant ID is substituted
                        data = Template("exception while processing item(${id}): $url").substitute(
                            id=str(merchant.merchantId), url=item.merchantItemUrl
                        )
                        self._log_exception(e, data=data)

                    # next item
                    work.next()

                # initialize progress tracker
                self._progress(self.count)

                # distribute task
                d = Distributor(checkitem, oncomplete=self._complete, workers=workers)
                d.run()
                d.adddata(items)
                d.complete()  # when done processing all data

            except Exception as e:
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                )
                self._log_exception(e, data=data)
                self._fail()

    def _task_get(self):
        """
        Runs the "get" job.
        """
        self.logger.info("starting crawler...")

        # get number of workers
        workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

        # get merchant ids we're looking for (duplicates removed)
        merchantids = list(set([it.merchant["merchantId"] for it in self.task.data]))

        # get list of merchants according to task; merchants that cannot be fetched are skipped
        merchants = []
        for id in merchantids:
            merchant = self._get_merchant(id)
            if merchant is not None:
                merchants.append(merchant)

        # worker to check items
        @defer.inlineCallbacks
        def checkitems(merchant, work):
            try:
                # get item ids we're looking for
                ids = [it.merchantItemId for it in self.task.data if it.merchant["merchantId"] == merchant.merchantId]

                # get items for merchant
                items = yield self._get_merchant_items(merchant)
                if items is None:
                    self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                    items = []  # TEMP workaround to make sure task completes

                # keep only the requested merchant items
                items = [item for item in items if item.merchantItemId in ids]

                # process items
                for item in items:
                    # TEMP - do not check
                    # check page
                    # res=yield self._check_item(item)
                    res = True

                    # set task result
                    if res:
                        self.task.result.append(
                            {"data": item.merchantItemUrl, "meta.merchantid": merchant.merchantId, "meta.xmlitem": item}
                        )
                    else:
                        msg = Template("item(${id}) url could not verified: $url").substitute(
                            id=str(item.merchantItemId), url=item.merchantItemUrl
                        )
                        self._log_error(msg)

                    # update progress
                    self._progress()

            except Exception as e:
                data = Template("exception while processing merchant(${id}): $url").substitute(
                    id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
                )
                self._log_exception(e, data)

            # next item
            work.next()

        # initialize progress tracker
        self._progress(len(self.task.data))

        # start task distributor
        d = Distributor(checkitems, oncomplete=self._complete, workers=workers)
        d.run()

        # process each merchant
        d.adddata([merchant for merchant in merchants if merchant.merchantId in merchantids])

        # when done processing all data
        d.complete()

    def _get_merchants(self, all=False):
        """
        Gets the list of merchants from Cimri Service.

        @type  all: bool
        @param all: if True all merchants are fetched, otherwise only the active merchants

        @rtype: list (L{cimri.api.cimriservice.data.merchant.MerchantInfo})
        @return: list of merchants; an empty list if the API returned None
        """
        self.logger.info("getting merchants...")

        # get all merchants, or only the active ones
        api = MerchantsAPI()
        if all is True:
            merchants = api.get_merchants()
        else:
            merchants = api.get_merchants(status=MerchantInfo.STATUS_ACTIVE)
        if merchants is None:
            self._log_error("error getting cimri service merchant list")
            self.logger.warn("did not get any merchants from cimri-service")
            return []
        self.logger.info("number of merchants retrieved: " + str(len(merchants)))

        # get only the active ones
        # merchants=[merchant for merchant in merchants if merchant.status==MerchantInfo.STATUS_ACTIVE]
        # (the filter above is disabled, so this logs the same count as the line before)
        self.logger.info("number of active merchants found: " + str(len(merchants)))

        return merchants

    def _get_merchant(self, id):
        """
        Gets a specific merchant from Cimri Service.

        @type  id: str
        @param id: merchant ID

        @rtype: L{cimri.api.cimriservice.data.merchant.MerchantInfo}
        @return: MerchantInfo object, or None if the API returned None
        """
        self.logger.info("getting merchant..." + str(id))

        # fetch the merchant
        api = MerchantsAPI()
        merchant = api.get_merchant(id)
        if merchant is None:
            self._log_error("error getting cimri service merchant " + str(id))
            self.logger.warn("did not get merchant from cimri-service")
            return None

        return merchant

    def _get_merchant_items(self, merchant):
        """
        Gets all merchant items of a merchant asynchronously, by running
        _get_merchant_items_async through deferToThread.

        @type  merchant: L{cimri.api.cimriservice.data.merchant.MerchantInfo}
        @param merchant: merchant whose items are requested

        @rtype: L{twisted.internet.defer.Deferred}
        @return: Deferred to which the merchant items will be delivered
        """
        return deferToThread(self._get_merchant_items_async, merchant)

    def _get_merchant_items_async(self, merchant):
        """
        Gets all merchant items of a merchant, from the cache when available and
        otherwise from the merchant XML.

        @type  merchant: L{cimri.api.cimriservice.data.merchant.MerchantInfo}
        @param merchant: merchant whose items are requested

        @rtype: list (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})
        @return: the requested merchant items, or None if the XML could not be loaded
        """
        self.logger.info("getting merchant items for ... " + str(merchant.merchantId) + ":" + merchant.merchantName)

        # check cache
        items = self._get_cached_merchant_items(merchant.merchantId)
        if items is None:
            # load items for merchant
            xml = MerchantXML(merchant)
            res = xml.load()
            if res is False:
                return None

            # get items
            items = xml.getitems()

            # cache
            self._cache_merchant_items(items, merchant.merchantId)

        # add merchant id
        for item in items:
            item.merchant = {"merchantId": merchant.merchantId}

        return items

    def _check_item(self, item):
        """
        Checks asynchronously whether the URL of a merchant item is reachable,
        by running self.ping through deferToThread.

        @type  item: L{cimri.api.cimriservice.data.merchantitem.MerchantItem}
        @param item: merchant item to be checked

        @rtype: L{twisted.internet.defer.Deferred}
        @return: Deferred to which the result of the operation will be delivered
        """
        return deferToThread(self.ping, item.merchantItemUrl)

    def _cache_merchant_items(self, items, merchantid):
        """
        Stores the given merchant items in the cache. Does nothing unless the
        task meta contains "cache.write".

        @type  items: list (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})
        @param items: merchant items to be stored

        @type  merchantid: str
        @param merchantid: ID of the merchant the items belong to
        """
        # check if we should cache
        if "cache.write" not in self.task.meta:
            return

        # get section to cache to
        section = self.task.meta["cache.write"]

        # write to cache as a JSON list of the items' dictionaries
        Cache(section).set(
            "crawler.productxml.items." + str(merchantid), json.dumps([item.to_dict() for item in items])
        )

    def _get_cached_merchant_items(self, merchantid):
        """
        Gets the merchant items stored in the cache for a given merchant.

        @type  merchantid: str
        @param merchantid: ID of the merchant the items belong to

        @rtype: list (L{cimri.api.cimriservice.data.merchantitem.MerchantItem})
        @return: list of the items found in the cache. None if cache reading is not
                 requested, nothing is cached, or an error occurs
        """
        # check if we should use the cache
        if "cache.read" not in self.task.meta:
            return None

        # get section to use
        section = self.task.meta["cache.read"]

        # read cache
        content = Cache(section).get("crawler.productxml.items." + str(merchantid))
        if content is None:
            return None

        # parse (empty path: the cached JSON value itself is the list of items)
        try:
            return MerchantItem.list_from_json(content, [])
        except Exception as e:
            return None
"""
class Web(object):
    """
    Used for HTTP operations.
    """

    def __init__(self):
        # get logger
        self.logger = Logger(self.__class__.__name__)

        # get configuration
        self.config = Config.getconfig("WEB")

        # URL faults are not logged unless this flag is set to True (see log())
        self._log_url_faults = False

    def ping(self, url):
        """
        Checks whether a URL is reachable.

        @type  url: str
        @param url: URL to be tested

        @rtype: L{cimri.system.web.WebReport}
        @return: WebReport object holding the ping results
        """
        return self.get(url, ping=True)

    def get(self, url, unicode=True, download=False, file=None, ping=False, cache=None, timeout=None):
        """
        Loads the content found at a URL or downloads it as a file. Failures are not
        raised; they are recorded in the "error" field of the returned report.

        @type  url: str
        @param url: URL to be opened

        @type  unicode: bool
        @param unicode: controls whether the content at the URL should be assumed to be unicode

        @type  download: bool
        @param download: if True the content at the URL is downloaded as a file, otherwise
                         the content is returned as a string

        @type  file: str
        @param file: path and name of the file, if the content is downloaded as a file

        @type  ping: bool
        @param ping: only checks whether the URL is reachable. no content is loaded or downloaded.

        @type  cache: dict
        @param cache: controls the cache operations related to the URL. if None, no cache
                      operation is performed. if the dictionary has a "read" key, the content
                      is read from the given cache section instead of the URL when it is found
                      there. if it has a "write" key, the content is written to the given
                      cache section.

        @type  timeout: int
        @param timeout: timeout passed to the URL operations. if None, the "url_open_timeout"
                        configuration value is used.

        @rtype: L{cimri.system.web.WebReport}
        @return: WebReport object holding the results
        """
        # initialize report
        report = WebReport(url)

        # bound up front so the post-processing below never sees an unbound name
        # when something fails before the URL resource is created
        res = None
        try:
            # get timeout
            timeout = int(self.config.get("url_open_timeout")) if timeout is None else timeout

            # create url resource
            res = URL(url)

            # ping only?
            if ping:
                # open and check if url is accessible
                res.open(timeout=timeout)

            # download?
            elif download:
                # record file name
                report.file = file

                # download and write; the file is closed even if the download fails
                with open(file, 'w') as f:
                    f.write(res.download(timeout=timeout, cached=False))

            elif unicode:
                # use cached version?
                if cache is not None and "read" in cache:
                    report.content = Cache(cache["read"]).get("web.url." + hash_url(url))

                # download url (if not looking for a cached version or if the cached version not found)
                if report.content is None:
                    report.content = res.download(timeout=timeout, cached=False)

                # write to cache?
                if cache is not None and "write" in cache:
                    Cache(cache["write"]).set("web.url." + hash_url(url), report.content)

            else:
                # use cached version?
                if cache is not None and "read" in cache:
                    report.content = Cache(cache["read"]).get("web.url." + hash_url(url))

                # download url (if not looking for a cached version or if the cached version not found)
                if report.content is None:
                    res.open(timeout=timeout)
                    report.content = res.download(cached=False, timeout=timeout)
                    # report.content=res.read()

                # write to cache?
                if cache is not None and "write" in cache:
                    Cache(cache["write"]).set("web.url." + hash_url(url), report.content)

        except HTTP400BadRequest as e:
            self._record_url_fault(report, WebError("exception ocurred opening url. bad request", 400, url), e)

        except HTTP401Authentication as e:
            self._record_url_fault(report, WebError("exception ocurred opening url. url requires authentication", 401, url), e)

        except HTTP403Forbidden as e:
            self._record_url_fault(report, WebError("exception ocurred opening url. url not accessible", 403, url), e)

        except HTTP404NotFound as e:
            self._record_url_fault(report, WebError("exception ocurred opening url. not found", 404, url), e)

        except HTTPError as e:
            self._record_url_fault(report, WebError("exception ocurred opening url", None, url), e)

        except URLTimeout as e:
            # must come before URLError: assuming the pattern.web hierarchy, where
            # URLTimeout derives from URLError, this handler was unreachable after it
            self._record_url_fault(report, WebError("exception ocurred opening url. url load timed out", None, url), e)

        except URLError as e:
            self._record_url_fault(report, WebError("exception ocurred opening url. url contains errors", None, url), e)

        except IOError as e:
            self._record_url_fault(report, FileError("exception ocurred writing to file", file), e)

        except Exception as e:
            self._record_url_fault(report, WebError("exception ocurred", None, url), e)

        # add information (only available once the URL resource exists)
        if res is not None:
            report.headers = res.headers
            report.query = res.query if res.query is not None else None
            report.content_url = res.redirect if res.redirect is not None else report.content_url
        else:
            report.query = None

        return report

    def _record_url_fault(self, report, error, exc):
        """
        Stores an error on the report and logs both the error and the exception
        that caused it.
        """
        report.error = error
        self.log(str(report.error))
        self.log(str(exc))

    def log(self, msg):
        """
        Logs a message as an error, but only while URL fault logging is enabled.
        """
        if self._log_url_faults is True:
            self.logger.error(msg)
class CimriServiceUpdater(Updater):
    """
    CimriServiceUpdater takes the results determined by the other modules in the system and
    applies updates on CimriService for the given items.

    Operation types and options supported by the module:

    "update" operation: updates or adds the given items on Cimri Service

            task.data   : for every item to be processed, "data" must be a MerchantItem holding that
                          item's information (L{cimri.api.cimriservice.merchantitem.MerchantItem})

                          in addition, the following must be provided for every item:

                          "meta.action"   : must contain "update" or "insert". determines the
                                            operation requested on the backend for the item.
                                            (the code below also handles the value "clean")

            task.meta   : -

            task.result : for every processed item, "data" is the updated MerchantItem for that item
                          (L{cimri.api.cimriservice.merchantitem.MerchantItem})

                          in addition, the following is provided for every item:

                          "meta.result"   : one of "success" or "fail"
                          "meta.error"    : information about the error, if the operation failed
    """

    def __init__(self):
        #initialize parents
        Updater.__init__(self)

        self.logger=Logger(self.__class__.__name__)

        #supported operations
        self.ops={"update"      :self._task_update}

        #update chunk size (maximum number of items sent in one API call)
        self._update_size=1000

    def _task_update(self):
        """
        Runs the "update" operation: groups the task items by merchant and, per merchant,
        sends the "clean" items and then the "insert"/"update" items in chunks.
        """
        self.logger.info("api call...")

        #updater stats
        self.task.stats["data"]["actions"]={"total":0, "insert": 0, "update":0, "clean":0}

        #get # of workers
        workers=int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

        #oncomplete handler
        def oncomplete():
            #mark as completed
            self._complete()

        #update (worker handed to the Distributor below; receives the items of one merchant)
        @defer.inlineCallbacks
        def updateitems(items,work):
            #clean task
            data=[item["data"] for item in items if item["meta.action"]=="clean"]
            chunks=int(math.ceil(float(len(data))/self._update_size))
            for index in range(chunks):
                try:
                    #update
                    res=yield self._clean(data[index*self._update_size:(index+1)*self._update_size])

                    #check result (the result is currently not inspected)

                except Exception as e:
                    self._log_exception(e,data=None)

                #update progress (one step per chunk, whether or not it failed)
                self._progress()

            #update/insert tasks
            data=[item["data"] for item in items if item["meta.action"] in ["insert","update"]]
            chunks=int(math.ceil(float(len(data))/self._update_size))
            for index in range(chunks):
                try:
                    #update
                    res=yield self._update(data[index*self._update_size:(index+1)*self._update_size])

                    #check result (the result is currently not inspected)

                except Exception as e:
                    self._log_exception(e,data=None)

                #update progress (one step per chunk, whether or not it failed)
                self._progress()

            #next item
            work.next()

        #partition data by merchants
        merchants={}
        for item in self.task.data:
            mid=item["data"].merchant["merchantId"]
            if mid not in merchants:
                merchants[mid]=[]
            merchants[mid].append(item)

            #record stats
            #NOTE(review): an action other than insert/update/clean raises KeyError here - confirm inputs
            self.task.stats["data"]["actions"]["total"]=self.task.stats["data"]["actions"]["total"]+1
            self.task.stats["data"]["actions"][item["meta.action"]]+=1

        #figure out progress: total number of chunks over all merchants
        size=0
        for id in merchants:
            cleanops=len([item for item in merchants[id] if item["meta.action"]=="clean"])
            size=size+int(math.ceil(float(cleanops)/self._update_size))
            size=size+int(math.ceil(float(len(merchants[id])-cleanops)/self._update_size))

        #initialize progress tracker
        self._progress(size)

        #distribute task
        d=Distributor(updateitems,oncomplete=oncomplete,workers=workers)
        d.run()
        d.adddata(merchants.values())
        d.complete()    #complete when all data is processed

    def _update(self,items):
        """
        Updates the given merchant items on Cimri Service. The items are modified in
        place (timestamp, operator and text fields) before the API call is made in a thread.

        @type  items: list (L{cimri.api.cimriservice.merchantitem.MerchantItem})
        @param items: items to be updated

        @rtype: L{twisted.internet.defer.Deferred}
        @return: a Deferred object that will receive the update result
        """
        #runs in a worker thread via deferToThread below
        def update_items(items):
            api=MerchantsAPI()
            res=api.update_items(items)
            if res is False:
                self._log_error("cimriservice update failed")
            return res

        #add timestamp and operator id
        ts=time.strftime("%Y-%m-%d %H:%M:%S",datetime.datetime.now().timetuple())
        for item in items:
            item.lastUpdateDate=ts
            item.operator={"operatorId":0}

        #convert unicode values to string such that utf8 character sequences are interpreted as utf8
        for item in items:
            item.merchantItemTitle = convert_unicode_to_utf8str(item.merchantItemTitle, item.encoding)
            item.brand = convert_unicode_to_utf8str(item.brand, item.encoding)
            item.modelNameView = convert_unicode_to_utf8str(item.modelNameView, item.encoding)
#           f=open("out.txt","a")
#           f.write(item.merchantItemTitle)
#           f.write("\n")
#           f.close()

        #update
        return deferToThread(update_items,items)

    def _clean(self,items):
        """
        Clears the item ids of the given merchant items on Cimri Service, through the
        API's clean_paused_items call made in a thread.

        @type  items: list (L{cimri.api.cimriservice.merchantitem.MerchantItem})
        @param items: items to be cleaned

        @rtype: L{twisted.internet.defer.Deferred}
        @return: a Deferred object that will receive the result
        """
        #runs in a worker thread via deferToThread below
        def clean_items(items):
            api=MerchantsAPI()
            res=api.clean_paused_items(items)
            if res is False:
                self._log_error("cimriservice clean failed")
            return res

        #update
        return deferToThread(clean_items,items)