def filterAndPackageDgrate(self):
    """Drop entries of ``self.item`` whose unique codes the Redis
    deduplication service reports as already seen.

    Returns ``self.item`` unchanged when Redis-based dedup is disabled;
    otherwise mutates ``self.item`` in place and returns it.
    """
    if not OPEN_REDIS_DISTINCT:
        return self.item
    # Materialize the key snapshot: in Python 2 .keys() was already a
    # list; under Python 3 a live view would shrink as entries are
    # deleted below, so list() keeps the logged "before" state honest.
    uniqueCodeList = list(self.item.keys())
    # requstDistinct presumably returns the subset of codes already
    # recorded (duplicates) -- TODO confirm against its definition.
    repeatUniqueCode = requstDistinct(uniqueCodeList)
    logging.info('------------distinct before : %s ' % uniqueCodeList)
    for unique in repeatUniqueCode:
        del self.item[unique]
    logging.info('------------distinct after : %s ' % self.item.keys())
    return self.item
def filterAndPackageDgrate(self):
    """Prune ``self.item`` of entries already known to the Redis
    dedup service; acts as a pass-through when the feature is off.

    Returns the (possibly pruned) ``self.item`` dict.
    """
    if not OPEN_REDIS_DISTINCT:
        return self.item
    codes = self.item.keys()
    duplicates = requstDistinct(codes)
    logging.info('------------distinct before : %s ' % codes)
    for dup in duplicates:
        del self.item[dup]
    logging.info('------------distinct after : %s ' % self.item.keys())
    return self.item
def distinctRequestUrls(self, urls):
    """Filter *urls* down to the ones the Redis dedup service has not
    seen yet, keyed by each URL's MD5.

    Returns [] for empty input; returns a copy of all urls when dedup
    is disabled or this item is already flagged duplicate; otherwise
    returns the surviving URLs as ``dict.values()``.
    """
    if len(urls) < 1:
        return []
    if (not OPEN_REDIS_DISTINCT) or self.is_duplicate:
        return list(urls)
    # Later URLs win on an MD5 collision, same as the original loop.
    uniqueCodeDict = {toMd5(url): url for url in urls}
    # Codes already recorded by the service are duplicates -- drop them.
    repeatUniqueCode = requstDistinct(uniqueCodeDict.keys())
    for unique in repeatUniqueCode:
        del uniqueCodeDict[unique]
    return uniqueCodeDict.values()
def distinctRequestUrls(self, urls):
    """Return the subset of *urls* whose MD5 codes are unseen by the
    dedup service; short-circuits when dedup does not apply."""
    if len(urls) < 1:
        return []
    if (not OPEN_REDIS_DISTINCT) or self.is_duplicate:
        return list(urls)
    code_to_url = {}
    for u in urls:
        code_to_url[toMd5(u)] = u
    already_seen = requstDistinct(code_to_url.keys())
    for code in already_seen:
        del code_to_url[code]
    return code_to_url.values()
def distinctRequestUrls(self, urls):
    """Deduplicate *urls* against the Redis service by MD5 code.

    NOTE(review): unlike the sibling implementations, this variant
    always consults the service -- there is no OPEN_REDIS_DISTINCT
    guard. Confirm that is intentional.

    Returns [] for empty input, otherwise the unseen URLs as
    ``dict.values()``.
    """
    if len(urls) < 1:
        return []
    # Later URLs win on an MD5 collision, same as the original loop.
    uniqueCodeDict = {toMd5(url): url for url in urls}
    repeatUniqueCode = requstDistinct(uniqueCodeDict.keys())
    for unique in repeatUniqueCode:
        del uniqueCodeDict[unique]
    return uniqueCodeDict.values()
def filterAndPackageDgrate(self):
    """Package ``self.item`` into row dicts keyed by unique code and
    drop rows the Redis dedup service has already seen.

    Rows without a description are skipped entirely; a missing image
    URL becomes ''. Returns {unique_code: row_dict}.
    """
    uniqueCodeList = []
    insertData = {}
    item = self.item
    rule_id = item['rule_id']
    # One timestamp for both fields so public_time == create_time even
    # when packaging straddles a second boundary (the original made two
    # separate time.time() calls that could disagree).
    now = int(time.time())
    for index, title in enumerate(item['title']):
        # assumes source_url is at least as long as title -- TODO confirm
        uniqueCode = toMd5(item['source_url'][index])
        if index < len(item['img_url']) and item['img_url'][index]:
            img_url = json.dumps(item['img_url'][index])
        else:
            img_url = ''
        if index < len(item['description']) and item['description'][index]:
            description = item['description'][index]
        else:
            # No description: drop this row.
            continue
        # Truncate to 255 characters (Python-2 bytes/unicode round trip).
        title = title.decode('utf8')[0:255].encode('utf8')
        uniqueCodeList.append(uniqueCode)
        insertData[uniqueCode] = {
            'source_url': item['source_url'][index],
            'unique_code': uniqueCode,
            'rule_id': rule_id,
            'title': title,
            'description': description,
            'img_url': img_url,
            'public_time': now,
            'create_time': now,
        }
    if uniqueCodeList and OPEN_REDIS_DISTINCT:
        # Remove rows whose codes the service reports as already seen.
        for unique in requstDistinct(uniqueCodeList):
            del insertData[unique]
    return insertData
def filterAndPackageDgrate(self):
    """Build the {unique_code: row} mapping from ``self.item`` and
    filter out rows the Redis distinct service already knows about."""
    data = self.item
    rows = {}
    codes = []
    rid = data['rule_id']
    published = int(time.time())
    created = int(time.time())
    for idx, raw_title in enumerate(data['title']):
        code = toMd5(data['source_url'][idx])
        has_img = idx < len(data['img_url']) and data['img_url'][idx]
        img = json.dumps(data['img_url'][idx]) if has_img else ''
        if not (idx < len(data['description']) and data['description'][idx]):
            continue  # rows without a description are dropped
        desc = data['description'][idx]
        trimmed = raw_title.decode('utf8')[0:255].encode('utf8')
        codes.append(code)
        rows[code] = {
            'source_url': data['source_url'][idx],
            'unique_code': code,
            'rule_id': rid,
            'title': trimmed,
            'description': desc,
            'img_url': img,
            'public_time': published,
            'create_time': created,
        }
    if codes and OPEN_REDIS_DISTINCT:
        for dup in requstDistinct(codes):
            del rows[dup]
    return rows