def _task_match(self):
    """Runs the "match" operation."""
    self.logger.info("starting matcher...")
    # get number of workers
    workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1
    # run in test mode?
    self.test = "test" in self.task.meta and self.task.meta["test"] is True
    # additional stats
    self.task.stats["test"]["guessed"] = 0

    # oncomplete handler
    def oncomplete():
        # finalize tests
        if self.test:
            self._test_finalize()
        # mark as completed
        self._complete()

    # match items
    self.task.result = []

    @defer.inlineCallbacks
    def matchitem(item, work):
        try:
            # match
            result = yield self._match_item(item["data"])
            # comparison test
            if self.test:
                # compare to reference match and update test stats
                self._test_update(result, item["meta.refitem"])
                # include in results
                result["meta.refitem"] = item["meta.refitem"]
            # add to results
            if result["meta.action"] is not None:
                self.task.result.append(result)
        except Exception as e:
            data = Template("item(${id}): $url").substitute(
                id=replace_turkish_chars(item["data"].merchantItemId), url=item["data"].merchantItemUrl
            )
            self._log_exception(e, data=data)
        # update progress
        self._progress()
        # next item
        work.next()

    # initialize progress tracker
    self._progress(len(self.task.data))
    # distribute task
    d = Distributor(matchitem, oncomplete=oncomplete, workers=workers)
    d.run()
    d.adddata(self.task.data)
    d.complete()  # complete when all data is processed
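# NOTE (editorial, not part of the original code): Distributor is defined outside
# this section. The sketch below only illustrates the contract the tasks here
# appear to rely on -- a worker callback invoked as worker(data, work), a work
# handle exposing next()/complete()/isactive(), and run()/adddata()/complete()
# on the distributor, with oncomplete fired once the queue is drained. Names and
# behaviour are assumptions; this single-worker sketch ignores the workers argument.
class _DistributorSketch(object):
    """Minimal synchronous stand-in for the assumed Distributor interface."""

    class _Work(object):
        def __init__(self, owner):
            self._owner = owner
            self._active = True

        def isactive(self):
            # workers check this before touching shared results
            return self._active

        def complete(self):
            # stop early, e.g. once a sample quota is reached
            self._active = False

        def next(self):
            # current item is done; pull the next one (or finish)
            self._owner._dispatch(self)

    def __init__(self, worker, oncomplete=None, workers=1):
        self._worker = worker
        self._oncomplete = oncomplete
        self._queue = []
        self._closed = False

    def run(self):
        # the real implementation presumably starts its worker slots here
        pass

    def adddata(self, data):
        self._queue.extend(data)

    def complete(self):
        # no more data will be added; drain the queue, then fire oncomplete
        self._closed = True
        self._dispatch(self._Work(self))

    def _dispatch(self, work):
        if self._queue and work.isactive():
            self._worker(self._queue.pop(0), work)
        elif self._closed and self._oncomplete is not None:
            self._oncomplete()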
def _task_get(self):
    """Runs the "get" task."""
    self.logger.info("starting crawler...")
    # get number of workers
    workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1
    # get merchant ids we're looking for
    merchantids = list(set([it.merchant["merchantId"] for it in self.task.data]))
    # get list of merchants according to task
    merchants = []
    for id in merchantids:
        merchant = self._get_merchant(id)
        if merchant is not None:
            merchants.append(merchant)

    # worker to check items
    @defer.inlineCallbacks
    def checkitems(merchant, work):
        try:
            # get item ids we're looking for
            ids = [it.merchantItemId for it in self.task.data if it.merchant["merchantId"] == merchant.merchantId]
            # get items for merchant
            items = yield self._get_merchant_items(merchant)
            if items is None:
                self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                items = []  # TEMP workaround to make sure task completes
            # get merchant items to sample for merchant
            items = [item for item in items if item.merchantItemId in ids]
            # process items
            for item in items:
                # TEMP - do not check page
                # res = yield self._check_item(item)
                res = True
                # set task result
                if res:
                    self.task.result.append(
                        {"data": item.merchantItemUrl, "meta.merchantid": merchant.merchantId, "meta.xmlitem": item}
                    )
                else:
                    msg = Template("item(${id}) url could not be verified: $url").substitute(
                        id=str(item.merchantItemId), url=item.merchantItemUrl
                    )
                    self._log_error(msg)
                # update progress
                self._progress()
        except Exception as e:
            data = Template("exception while processing merchant(${id}): $url").substitute(
                id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
            )
            self._log_exception(e, data)
        # next item
        work.next()

    # initialize progress tracker
    self._progress(len(self.task.data))
    # start task distributor
    d = Distributor(checkitems, oncomplete=self._complete, workers=workers)
    d.run()
    # process each merchant
    d.adddata([merchant for merchant in merchants if merchant.merchantId in merchantids])
    # when done processing all data
    d.complete()
def _task_crawl(self):
    """Runs the "crawl" task."""
    self.logger.info("starting crawler...")
    # get number of workers
    workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1
    # get list of merchants according to task
    # parameters: range, status, ...
    allmerchants = "merchants.all" in self.task.meta and self.task.meta["merchants.all"] is True
    merchants = self._get_merchants(allmerchants)
    # get range to operate on
    if "merchants.id.alt" in self.task.meta:
        merchants = [
            merchant for merchant in merchants if str(merchant.merchantId) == str(self.task.meta["merchants.id.alt"])
        ]
    elif "merchants.id" in self.task.meta:
        merchants = [
            merchant for merchant in merchants if str(merchant.merchantId) == str(self.task.meta["merchants.id"])
        ]
    else:
        if "merchants.index" in self.task.meta:
            merchants = merchants[int(self.task.meta["merchants.index"]):]
        if "merchants.range" in self.task.meta:
            merchants = merchants[:int(self.task.meta["merchants.range"])]
    # progress steps for each merchant
    steps = 10000

    # check items
    @defer.inlineCallbacks
    def checkitems(merchant, work):
        try:
            # reset progress counter
            counter = steps
            # get items for merchant
            try:
                items = yield self._get_merchant_items(merchant)
            except Exception:
                items = None
            if items is None:
                self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                items = []  # TEMP workaround to make sure task completes
            # progress counter size
            stepby = steps if len(items) == 0 else float(steps) / len(items)
            # get range to operate on
            if "merchants.items.index" in self.task.meta:
                items = items[int(self.task.meta["merchants.items.index"]):]
            if "merchants.items.range" in self.task.meta:
                items = items[:int(self.task.meta["merchants.items.range"])]
            # check items
            for item in items:
                # TEMP - do not check page
                # res = yield self._check_item(item)
                res = True
                # set task result
                if res:
                    self.task.result.append(
                        {"data": item.merchantItemUrl, "meta.merchantid": merchant.merchantId, "meta.xmlitem": item}
                    )
                else:
                    msg = Template("item(${id}) url could not be verified: $url").substitute(
                        id=str(item.merchantItemId), url=item.merchantItemUrl
                    )
                    self._log_error(msg)
                self._progress(stepby=stepby)
                counter = counter - stepby
        except Exception as e:
            data = Template("exception while processing merchant(${id}): $url").substitute(
                id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
            )
            self._log_exception(e, data=data)
            self._fail()
        # update progress
        self._progress(stepby=counter)
        # next item
        work.next()

    # initialize progress tracker
    self._progress(len(merchants) * steps)
    # start task distributor
    d = Distributor(checkitems, oncomplete=self._complete, workers=workers)
    d.run()
    # process each merchant
    d.adddata(merchants)
    # when done processing all data
    d.complete()
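# Illustrative note (not part of the original code): _task_crawl gives every
# merchant a fixed budget of `steps` progress units. Each item advances progress
# by steps/len(items), `counter` tracks the remainder, and the final
# self._progress(stepby=counter) call after the item loop (or after a failure)
# tops the merchant up to exactly `steps`. A minimal sketch of that bookkeeping,
# with a hypothetical helper name:
def _sketch_merchant_progress(item_count, steps=10000):
    """Return the per-item increments plus the final top-up for one merchant."""
    stepby = steps if item_count == 0 else float(steps) / item_count
    counter = steps
    increments = []
    for _ in range(item_count):
        increments.append(stepby)
        counter = counter - stepby
    increments.append(counter)  # final top-up; total sums to `steps` (up to float rounding)
    return increments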
@defer.inlineCallbacks  # needed because this method yields Deferreds directly
def _task_sample(self):
    """Runs the "sample" task."""
    self.logger.info("starting crawler 'sample' task...")
    # get number of workers
    workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1
    # get list of merchants
    merchants = self._get_merchants()
    # apply filters
    if "merchants.id" in self.task.meta:
        merchants = [
            merchant
            for merchant in merchants
            if str(merchant.merchantId).strip() == str(self.task.meta["merchants.id"]).strip()
        ]
    self.count = int(self.task.meta["sample.size"]) if "sample.size" in self.task.meta else 100
    # process each merchant
    for merchant in merchants:
        try:
            # get items for merchant
            items = yield self._get_merchant_items(merchant)
            if items is None:
                self._log_error("failed to load merchant xml " + MerchantXML(merchant).geturl())
                items = []  # TEMP workaround to make sure task completes
            # shuffle the items
            shuffle(items)

            # check items
            @defer.inlineCallbacks
            def checkitem(item, work):
                try:
                    # check page
                    res = yield self._check_item(item)
                    # check if task was completed while we were waiting to get the item
                    if work.isactive():
                        if res:
                            # set task result
                            self.task.result.append(
                                {
                                    "data": item.merchantItemUrl,
                                    "meta.merchantid": merchant.merchantId,
                                    "meta.xmlitem": item,
                                }
                            )
                            # update progress
                            self._progress()
                            # done?
                            self.count = self.count - 1
                            if self.count < 1:
                                work.complete()
                        else:
                            msg = Template("item(${id}) url could not be verified: $url").substitute(
                                id=str(item.merchantItemId), url=item.merchantItemUrl
                            )
                            self._log_error(msg)
                except Exception as e:
                    data = Template("exception while processing item(${id}): $url").substitute(
                        id=str(item.merchantItemId), url=item.merchantItemUrl
                    )
                    self._log_exception(e, data=data)
                # next item
                work.next()

            # initialize progress tracker
            self._progress(self.count)
            # distribute task
            d = Distributor(checkitem, oncomplete=self._complete, workers=workers)
            d.run()
            d.adddata(items)
            d.complete()  # when done processing all data
        except Exception as e:
            data = Template("exception while processing merchant(${id}): $url").substitute(
                id=str(merchant.merchantId), url=MerchantXML(merchant).geturl()
            )
            self._log_exception(e, data=data)
            self._fail()
def _task_update(self):
    """Runs the "update" operation."""
    self.logger.info("api call...")
    # updater stats
    self.task.stats["data"]["actions"] = {"total": 0, "insert": 0, "update": 0, "clean": 0}
    # get number of workers
    workers = int(self.task.meta["workers"]) if "workers" in self.task.meta else 1

    # oncomplete handler
    def oncomplete():
        # mark as completed
        self._complete()

    # update
    @defer.inlineCallbacks
    def updateitems(items, work):
        # clean task
        data = [item["data"] for item in items if item["meta.action"] == "clean"]
        chunks = int(math.ceil(float(len(data)) / self._update_size))
        for index in range(chunks):
            try:
                # update
                res = yield self._clean(data[index * self._update_size:(index + 1) * self._update_size])
                # check result
            except Exception as e:
                self._log_exception(e, data=None)
            # update progress
            self._progress()
        # update/insert tasks
        data = [item["data"] for item in items if item["meta.action"] in ["insert", "update"]]
        chunks = int(math.ceil(float(len(data)) / self._update_size))
        for index in range(chunks):
            try:
                # update
                res = yield self._update(data[index * self._update_size:(index + 1) * self._update_size])
                # check result
            except Exception as e:
                self._log_exception(e, data=None)
            # update progress
            self._progress()
        # next item
        work.next()

    # partition data by merchants
    merchants = {}
    for item in self.task.data:
        mid = item["data"].merchant["merchantId"]
        if mid not in merchants:
            merchants[mid] = []
        merchants[mid].append(item)
        # record stats
        self.task.stats["data"]["actions"]["total"] = self.task.stats["data"]["actions"]["total"] + 1
        self.task.stats["data"]["actions"][item["meta.action"]] += 1
    # figure out progress
    size = 0
    for id in merchants:
        cleanops = len([item for item in merchants[id] if item["meta.action"] == "clean"])
        size = size + int(math.ceil(float(cleanops) / self._update_size))
        size = size + int(math.ceil(float(len(merchants[id]) - cleanops) / self._update_size))
    # initialize progress tracker
    self._progress(size)
    # distribute task
    d = Distributor(updateitems, oncomplete=oncomplete, workers=workers)
    d.run()
    d.adddata(merchants.values())
    d.complete()  # complete when all data is processed
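# A minimal sketch (not part of the original code) of the batch/progress math
# _task_update relies on: one merchant's items are split into ceil(n / update_size)
# API calls per action group, and each call advances the progress tracker by one
# step. `update_size` stands in for self._update_size, whose value is not shown
# in this section, and the helper name is hypothetical.
def _sketch_update_progress_steps(items, update_size):
    """Return how many progress steps one merchant's items contribute."""
    import math
    cleanops = len([item for item in items if item["meta.action"] == "clean"])
    upserts = len(items) - cleanops
    return int(math.ceil(float(cleanops) / update_size)) + int(math.ceil(float(upserts) / update_size))
# e.g. 30 "clean" + 230 "insert"/"update" items with update_size=100 -> 1 + 3 = 4 steps.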