Ejemplo n.º 1
0
    def filterAndPackageDgrate(self):
        """Remove from ``self.item`` every key reported by ``requstDistinct``.

        When ``OPEN_REDIS_DISTINCT`` is falsy the item is returned untouched.
        Otherwise the item's keys are handed to ``requstDistinct`` and each
        key it yields is deleted from ``self.item`` in place. The key set is
        logged before and after the deletion.

        Returns ``self.item`` (the same object, possibly with fewer entries).
        Raises ``KeyError`` if ``requstDistinct`` yields a key that is not
        (or is no longer) present in the item.
        """
        if OPEN_REDIS_DISTINCT:
            codes = self.item.keys()
            # NOTE(review): judging by the original variable name, the helper
            # presumably returns the codes that were already seen - confirm.
            repeated = requstDistinct(codes)
            logging.info('------------distinct before : %s ' % codes)
            for code in repeated:
                del self.item[code]
            logging.info('------------distinct after : %s ' % self.item.keys())
        return self.item
Ejemplo n.º 2
0
    def filterAndPackageDgrate(self):
        """Strip the keys flagged by ``requstDistinct`` out of ``self.item``.

        With ``OPEN_REDIS_DISTINCT`` disabled nothing is filtered. With it
        enabled, the current keys of ``self.item`` are passed to
        ``requstDistinct`` and every key that call yields is deleted from the
        item in place, with the key set logged before and after.

        Returns ``self.item`` itself. A yielded key missing from the item
        raises ``KeyError``.
        """
        if not OPEN_REDIS_DISTINCT:
            return self.item

        item_keys = self.item.keys()
        # NOTE(review): presumably the subset of keys already recorded as
        # seen (the original local was named repeatUniqueCode) - confirm.
        flagged_keys = requstDistinct(item_keys)

        before_message = '------------distinct before : %s ' % item_keys
        logging.info(before_message)

        for flagged in flagged_keys:
            del self.item[flagged]

        after_message = '------------distinct after : %s ' % self.item.keys()
        logging.info(after_message)
        return self.item
Ejemplo n.º 3
0
    def distinctRequestUrls(self, urls):
        """Return the URLs in ``urls`` that ``requstDistinct`` does not flag.

        Args:
            urls: a sized collection of URLs.

        Returns:
            Always a ``list``:
            * ``[]`` when ``urls`` is empty;
            * ``list(urls)`` unchanged when ``OPEN_REDIS_DISTINCT`` is falsy
              or ``self.is_duplicate`` is truthy;
            * otherwise the URLs whose ``toMd5`` code was not yielded by
              ``requstDistinct``. URLs sharing a code collapse to the last
              one seen.

        Raises:
            KeyError: if ``requstDistinct`` yields a code that was not passed
                to it (or yields the same code twice).
        """
        if len(urls) == 0:
            return []

        if (not OPEN_REDIS_DISTINCT) or self.is_duplicate:
            return list(urls)

        # Key every URL by its code so flagged codes can be removed directly.
        urls_by_code = {}
        for url in urls:
            urls_by_code[toMd5(url)] = url

        # NOTE(review): presumably the codes already recorded as seen (the
        # original local was named repeatUniqueCode) - confirm with the helper.
        for code in requstDistinct(urls_by_code.keys()):
            del urls_by_code[code]

        # Materialise a real list so every branch returns the same type; on
        # Python 3 ``dict.values()`` is a live view, not a list.
        return list(urls_by_code.values())
Ejemplo n.º 4
0
    def distinctRequestUrls(self, urls):
        """Filter ``urls`` down to those ``requstDistinct`` does not report.

        Args:
            urls: a sized collection of URLs.

        Returns:
            A ``list`` in every case: empty when ``urls`` is empty; a plain
            copy of ``urls`` when ``OPEN_REDIS_DISTINCT`` is falsy or
            ``self.is_duplicate`` is truthy; otherwise the URLs whose
            ``toMd5`` code was not yielded by ``requstDistinct`` (URLs with
            an identical code keep only the last occurrence).

        Raises:
            KeyError: if ``requstDistinct`` yields a code that is not among
                the ones it was given, or repeats one.
        """
        if len(urls) == 0:
            return []

        filtering_disabled = (not OPEN_REDIS_DISTINCT) or self.is_duplicate
        if filtering_disabled:
            return list(urls)

        # code -> url; later URLs with the same code overwrite earlier ones.
        code_to_url = {}
        for url in urls:
            code_to_url[toMd5(url)] = url

        # NOTE(review): the helper presumably returns the already-seen codes
        # (original local name: repeatUniqueCode) - confirm against its source.
        repeated_codes = requstDistinct(code_to_url.keys())
        for code in repeated_codes:
            del code_to_url[code]

        # ``dict.values()`` is a view on Python 3; return a list so the result
        # type matches the two early-return branches.
        return list(code_to_url.values())
Ejemplo n.º 5
0
    def distinctRequestUrls(self, urls):
        """Return the URLs in ``urls`` whose code ``requstDistinct`` does not yield.

        Args:
            urls: a sized collection of URLs.

        Returns:
            A ``list``: empty when ``urls`` is empty, otherwise the URLs
            whose ``toMd5`` code was not yielded by ``requstDistinct``. URLs
            that map to the same code collapse to the last one seen.

        Raises:
            KeyError: if ``requstDistinct`` yields a code that was not passed
                to it (or yields the same code twice).
        """
        if len(urls) == 0:
            return []

        # Index the URLs by code so each flagged code can be dropped directly.
        urls_by_code = {}
        for url in urls:
            urls_by_code[toMd5(url)] = url

        # NOTE(review): presumably the codes that were already seen (original
        # local name: repeatUniqueCode) - confirm against the helper.
        for code in requstDistinct(urls_by_code.keys()):
            del urls_by_code[code]

        # Return a real list (not a Python 3 dict view) so the result type is
        # the same as the empty-input branch.
        return list(urls_by_code.values())
Ejemplo n.º 6
0
    def filterAndPackageDgrate(self):
        """Package the parallel lists in ``self.item`` into per-entry records.

        ``self.item`` is read for ``rule_id`` plus the index-aligned
        sequences ``title``, ``source_url``, ``img_url`` and ``description``.
        For each title:

        * the record key / ``unique_code`` is ``toMd5`` of the source URL at
          the same index;
        * ``img_url`` is the JSON-encoded image entry at that index, or ``''``
          when the entry is missing or falsy;
        * entries without a truthy description at that index are skipped;
        * the title is decoded as UTF-8, cut to 255 characters and re-encoded
          (so titles are expected to be UTF-8 byte strings).

        ``public_time`` and ``create_time`` are both the current Unix time in
        whole seconds. When at least one record was built and
        ``OPEN_REDIS_DISTINCT`` is truthy, every code yielded by
        ``requstDistinct`` is deleted from the result.

        Returns a dict mapping unique code -> record dict.
        """
        item = self.item
        rule_id = item['rule_id']
        public_time = int(time.time())
        create_time = int(time.time())

        codes = []
        records = {}

        for position, raw_title in enumerate(item['title']):
            source_url = item['source_url'][position]
            code = toMd5(source_url)

            images = item['img_url']
            image = images[position] if position < len(images) else None
            img_url = json.dumps(image) if image else ''

            descriptions = item['description']
            if position >= len(descriptions) or not descriptions[position]:
                # No usable description: this entry is not packaged at all.
                continue
            description = descriptions[position]

            # Truncate on characters, not bytes, then go back to UTF-8 bytes.
            decoded_title = raw_title.decode('utf8')
            title = decoded_title[:255].encode('utf8')

            codes.append(code)
            records[code] = dict(
                source_url=source_url,
                unique_code=code,
                rule_id=rule_id,
                title=title,
                description=description,
                img_url=img_url,
                public_time=public_time,
                create_time=create_time,
            )

        if codes and OPEN_REDIS_DISTINCT:
            # NOTE(review): presumably the codes already seen (original local
            # name: repeatUniqueCode) - confirm against the helper.
            for repeated in requstDistinct(codes):
                del records[repeated]

        return records
Ejemplo n.º 7
0
    def filterAndPackageDgrate(self):
        """Turn the index-aligned lists of ``self.item`` into a dict of records.

        Reads ``rule_id``, ``title``, ``source_url``, ``img_url`` and
        ``description`` from ``self.item``. One record is produced per title
        that has a truthy description at the same index; it is keyed by
        ``toMd5`` of the matching source URL. The image entry is stored
        JSON-encoded (``''`` when missing or falsy) and the title is limited
        to 255 characters (decoded from and re-encoded to UTF-8, so titles
        are expected to be UTF-8 byte strings). ``public_time`` and
        ``create_time`` hold the current Unix time in whole seconds.

        If any record was built and ``OPEN_REDIS_DISTINCT`` is truthy, the
        codes yielded by ``requstDistinct`` are removed from the result.

        Returns a dict mapping unique code -> record dict.
        """
        item = self.item
        rule_id = item['rule_id']
        public_time = int(time.time())
        create_time = int(time.time())

        packaged = {}
        seen_codes = []

        for idx, headline in enumerate(item['title']):
            link = item['source_url'][idx]
            digest = toMd5(link)

            has_image = idx < len(item['img_url']) and item['img_url'][idx]
            if has_image:
                img_url = json.dumps(item['img_url'][idx])
            else:
                img_url = ''

            has_description = (
                idx < len(item['description']) and item['description'][idx]
            )
            if not has_description:
                # Entries without a description are dropped entirely.
                continue
            description = item['description'][idx]

            # Slice the decoded text so the limit counts characters, not bytes.
            headline = headline.decode('utf8')[0:255].encode('utf8')

            seen_codes.append(digest)
            record = {
                'source_url': link,
                'unique_code': digest,
                'rule_id': rule_id,
                'title': headline,
                'description': description,
                'img_url': img_url,
                'public_time': public_time,
                'create_time': create_time,
            }
            packaged[digest] = record

        if seen_codes and OPEN_REDIS_DISTINCT:
            # NOTE(review): the helper presumably returns codes that were
            # already recorded (original local: repeatUniqueCode) - confirm.
            for stale in requstDistinct(seen_codes):
                del packaged[stale]

        return packaged