Esempio n. 1
0
    def process(self, data):
        """
        Filter out listings that duplicate one that was already seen.

        Rule (translated from the original note, agreed with Zhou Ye and the
        tech team): listings whose title, model, mileage, region, contact,
        price and car age are all identical count as duplicates.  They are
        hidden on the front end but not deleted from the database, so they
        stay available for later analysis.

        An MD5 signature is built from the fields in ``keys``; ``self.rd7``
        keeps, per signature, the set of contact values already seen.

        :param data: mapping holding the ``keys`` fields below plus
            ``contact_mobile``, ``contact_phone``, ``domain`` and ``url``.
        :return: ``data`` when the listing is new, ``None`` when it was
            dropped as a duplicate.
        :raises KeyError: if ``data`` lacks one of the fields that is read.
        """
        keys = ("car_title", "car_type", "car_mileage", "car_price",
                "car_birth", "source_province", "source_zone")
        # NOTE(review): the field values are hashed back to back with no
        # delimiter, so different field splits of the same text collide.
        # Left as is because changing it would change every signature.
        m = hashlib.md5()
        for key in keys:
            value = data[key]
            # isinstance() rather than an exact type() comparison: a unicode
            # subclass must be UTF-8 encoded as well, otherwise str() would
            # try an implicit ASCII encode and fail on non-ASCII text.
            if isinstance(value, unicode):
                mk = value.encode('utf-8')
            else:
                mk = str(value)
            m.update(mk)
        signature = m.hexdigest()

        # exists() lets a never-seen signature skip the per-contact lookups.
        if self.rd7.exists(signature):
            for contact in (data["contact_mobile"], data["contact_phone"]):
                if self.rd7.sismember(signature, contact):
                    # Hand the dropped record to dr.insert_data
                    # (presumably the store for filtered items -- confirm).
                    dr.insert_data(data)
                    self.logger.debug("(%s) has been droped by process77 %s." %
                                      (data['domain'], data['url']))
                    return None

        # NOTE(review): empty contact values are stored like real ones, so a
        # later listing with the same signature and the same empty field will
        # match here -- confirm this is intended.
        self.rd7.sadd(signature, data["contact_mobile"], data["contact_phone"])
        return data
Esempio n. 2
0
    def process(self, data):
        """
        Drop a listing when an identical one has already been recorded.

        Rule (translated from the original note, agreed with Zhou Ye and the
        tech team): listings with the same title, model, mileage, region,
        contact, price and car age are duplicates.  Duplicates are hidden on
        the front end but kept in the database for later analysis.

        The listing is fingerprinted with MD5 over the fields listed in
        ``fingerprint_fields``; ``self.rd7`` holds one set of contact values
        per fingerprint.

        :param data: mapping with the fingerprint fields plus
            ``contact_mobile``, ``contact_phone``, ``domain`` and ``url``.
        :return: ``data`` if the listing is kept, ``None`` if it is dropped.
        :raises KeyError: if ``data`` lacks one of the fields that is read.
        """
        fingerprint_fields = ("car_title", "car_type", "car_mileage",
                              "car_price", "car_birth",
                              "source_province", "source_zone")
        # NOTE(review): values are concatenated without a separator before
        # hashing; kept unchanged so existing signatures remain valid.
        digest = hashlib.md5()
        for field in fingerprint_fields:
            raw = data[field]
            # isinstance() instead of type() == unicode, so unicode
            # subclasses are UTF-8 encoded too rather than going through
            # str(), which fails on non-ASCII text.
            digest.update(raw.encode('utf-8') if isinstance(raw, unicode)
                          else str(raw))
        signature = digest.hexdigest()

        contacts = (data["contact_mobile"], data["contact_phone"])

        # Only look up individual contacts for a signature that is already
        # known; any() stops at the first contact that was seen before.
        if self.rd7.exists(signature) and any(
                self.rd7.sismember(signature, contact) for contact in contacts):
            # Hand the dropped record to dr.insert_data
            # (presumably the store for filtered items -- confirm).
            dr.insert_data(data)
            self.logger.debug("(%s) has been droped by process77 %s." % (data['domain'], data['url']))
            return None

        # NOTE(review): empty contact values are added as set members too,
        # so they can match later listings -- confirm this is intended.
        self.rd7.sadd(signature, *contacts)
        return data
Esempio n. 3
0
    def process(self, data):
        """
        Reject items for which a required group of fields is reported lost.

        The checks run in a fixed order (title, brand/series, price, contact,
        province/zone), each delegated to a ``Process66Main._is_*_lose``
        helper.  On the first check that reports a loss the item is passed to
        ``dr.insert_data``, a debug line is logged and ``None`` is returned.

        :param data: mapping with the fields named in ``checks`` plus
            ``domain``.
        :return: ``data`` if every check passes, otherwise ``None``.
        """
        # (predicate, fields passed to it in order, debug message template)
        checks = (
            (Process66Main._is_car_title_lose,
             ('car_title',),
             "(%s) Item ignore, lose car_title."),
            (Process66Main._is_car_brand_car_series_lose,
             ('car_brand', 'car_series'),
             "(%s) Item ignore, lose car_series."),
            (Process66Main._is_car_price_lose,
             ('car_price',),
             "(%s) Item ignore, lose car_price."),
            (Process66Main._is_contact_phone_contact_mobile_contact_mail_contact_qq_lose,
             ('contact_phone', 'contact_mobile', 'contact_mail', 'contact_qq'),
             "(%s) Item ignore, lose contact."),
            (Process66Main._is_source_province_source_zone_lose,
             ('source_province', 'source_zone'),
             "(%s) Item ignore, lose source."),
        )

        for is_lost, fields, message in checks:
            # Fields are read only when their check is reached, so a missing
            # key surfaces at the same point as in a hand-written chain.
            if is_lost(*[data[name] for name in fields]):
                dr.insert_data(data)
                self.logger.debug(message % data['domain'])
                return None

        # Fixed 80 ms pause for every accepted item.
        # NOTE(review): presumably throttling -- confirm it is still needed.
        time.sleep(0.08)
        return data
Esempio n. 4
0
    def process(self, data):
        """
        Pass an item through only if none of its required fields is lost.

        Title, brand/series, price, contact details and province/zone are
        checked in that order through the ``Process66Main._is_*_lose``
        helpers.  The first failing check sends the item to
        ``dr.insert_data``, logs a debug message and ends processing.

        :param data: mapping with the checked fields plus ``domain``.
        :return: ``data`` when all checks pass, ``None`` otherwise.
        """
        def reject(message):
            # Shared rejection path: hand the item over, then log why.
            dr.insert_data(data)
            self.logger.debug(message % data['domain'])

        if Process66Main._is_car_title_lose(data['car_title']):
            reject("(%s) Item ignore, lose car_title.")
            return None

        if Process66Main._is_car_brand_car_series_lose(
                data['car_brand'], data['car_series']):
            reject("(%s) Item ignore, lose car_series.")
            return None

        if Process66Main._is_car_price_lose(data['car_price']):
            reject("(%s) Item ignore, lose car_price.")
            return None

        if Process66Main._is_contact_phone_contact_mobile_contact_mail_contact_qq_lose(
                data['contact_phone'], data['contact_mobile'],
                data['contact_mail'], data['contact_qq']):
            reject("(%s) Item ignore, lose contact.")
            return None

        if Process66Main._is_source_province_source_zone_lose(
                data['source_province'], data['source_zone']):
            reject("(%s) Item ignore, lose source.")
            return None

        # Fixed 80 ms pause for every accepted item.
        # NOTE(review): presumably throttling -- confirm it is still needed.
        time.sleep(0.08)
        return data