Example #1
0
    def process(self, item):
        """Fill in the year label, trim label and standard title of ``item``.

        The car title is split with ``segment`` using the regex rules below.
        Any field the title does not yield falls back to the ``car_datas``
        row whose ``vehicle_code`` matches the item.  When no such row
        exists, only the values extracted from the title are used.

        Args:
            item: mapping providing ``vehicle_code``, ``car_brand``,
                ``car_series`` and ``car_title``.  It is updated in place
                with ``car_publish_logo``, ``car_publish_version`` and
                ``standard_title``.

        Returns:
            The same ``item`` object.
        """
        # Each rule is (field name, regex, integer flag).
        # NOTE(review): the flag is presumably a capture-group index used by
        # ``segment`` -- confirm against its definition.
        seg_rule = [("producted_year", u"(\d{2,4}).+[年款]?", 1),
                    ("logo_zh", u"^.+[版型级]", 0),
                    ("transmission_zh", u"(手动)|(自动)|(手波)|(手自一体)|(无极变速)|CVT|([AM]T)|(A[^T]+T)", 0),
                    ("engine", u"(\d\.\d?)(?![\d|万])(L|l|T|t|升|CVT|TSI|TFSI)?", 1),
                    ("imports_zh", u"(进口)|(国产)|([\u2E80-\u9FFF]+)国", 0)]
        vid = item['vehicle_code']
        # The item's own brand and series are used as two extra patterns.
        seg_rule.extend([('car_brand', item['car_brand'], 0),
                         ('car_series', item['car_series'], 0)])
        sseg = segment(seg_rule, item['car_title'])

        cursor = conn.cursor()
        try:
            cursor.execute("select producted_year, logo_zh, engine, transmission_zh from car_datas where vehicle_code = ?", (vid, ))
            row = cursor.fetchone()
            columns = [col[0] for col in cursor.description]
        finally:
            # Release the cursor even if the query fails.
            cursor.close()

        if row is None:
            # fetchone() returns None when no row matches; use empty
            # fallbacks so the title-derived values can still be applied.
            rb = dict.fromkeys(('producted_year', 'logo_zh',
                                'engine', 'transmission_zh'))
        else:
            rb = dict(zip(columns, row))

        def normalise_year(match):
            """Return the matched number as a 4-digit year, or '' if rejected."""
            year = int(match.group())
            if year < 1000:
                # Short years: below 80 means 20xx, anything else 19xx.
                year += 2000 if year < 80 else 1900
            # Numbers outside the accepted range are dropped from the label.
            return str(year) if 1985 < year < 2014 else ''

        # Model year
        car_publish_logo = sseg['producted_year']['content']
        if not car_publish_logo:
            car_publish_logo = rb['producted_year']
        if car_publish_logo:
            car_publish_logo = re.sub(r'\d+', normalise_year, car_publish_logo)

        # Trim / version
        car_publish_version = sseg['logo_zh']['content']
        if not car_publish_version:
            car_publish_version = rb['logo_zh'] or ''

        # Engine displacement
        car_emission = sseg['engine']['content']
        if not car_emission:
            car_emission = rb['engine']

        # Short model description: the non-empty parts joined by spaces.
        keywords = [item['car_brand'],
                    item['car_series'],
                    car_publish_logo,
                    car_publish_version,
                    rb['transmission_zh'],
                    car_emission]
        standard_title = ' '.join([k for k in keywords if k])

        item['car_publish_logo'] = car_publish_logo
        item['car_publish_version'] = car_publish_version
        item['standard_title'] = standard_title

        return item
Example #2
0
    def process(self, item):
        """Derive the year label, trim label and standard title for ``item``.

        ``segment`` is run over the car title with the regex rules below.
        Fields missing from the title are taken from the ``car_datas`` row
        matching the item's ``vehicle_code``.  If there is no such row, the
        method proceeds with the title-derived values alone.

        Args:
            item: mapping providing ``vehicle_code``, ``car_brand``,
                ``car_series`` and ``car_title``.  It is updated in place
                with ``car_publish_logo``, ``car_publish_version`` and
                ``standard_title``.

        Returns:
            The same ``item`` object.
        """
        # Each rule is (field name, regex, integer flag).
        # NOTE(review): the flag is presumably a capture-group index used by
        # ``segment`` -- confirm against its definition.
        seg_rule = [
            ("producted_year", u"(\d{2,4}).+[年款]?", 1),
            ("logo_zh", u"^.+[版型级]", 0),
            ("transmission_zh", u"(手动)|(自动)|(手波)|(手自一体)|(无极变速)|CVT|([AM]T)|(A[^T]+T)", 0),
            ("engine", u"(\d\.\d?)(?![\d|万])(L|l|T|t|升|CVT|TSI|TFSI)?", 1),
            ("imports_zh", u"(进口)|(国产)|([\u2E80-\u9FFF]+)国", 0),
        ]
        vid = item["vehicle_code"]
        # The item's own brand and series are used as two extra patterns.
        seg_rule.extend([("car_brand", item["car_brand"], 0), ("car_series", item["car_series"], 0)])
        sseg = segment(seg_rule, item["car_title"])

        cursor = conn.cursor()
        try:
            cursor.execute(
                "select producted_year, logo_zh, engine, transmission_zh from car_datas where vehicle_code = ?", (vid,)
            )
            record = cursor.fetchone()
            column_names = [col[0] for col in cursor.description]
        finally:
            # Release the cursor even if the query fails.
            cursor.close()

        if record is None:
            # fetchone() returns None when no row matches; use empty
            # fallbacks so the title-derived values can still be applied.
            rb = dict.fromkeys(("producted_year", "logo_zh", "engine", "transmission_zh"))
        else:
            rb = dict(zip(column_names, record))

        def to_full_year(match):
            """Return the matched number as a 4-digit year, or "" if rejected."""
            year = int(match.group())
            if year < 1000:
                # Short years: below 80 means 20xx, anything else 19xx.
                year += 2000 if year < 80 else 1900
            # Numbers outside the accepted range are dropped from the label.
            return str(year) if 1985 < year < 2014 else ""

        # Model year
        car_publish_logo = sseg["producted_year"]["content"]
        if not car_publish_logo:
            car_publish_logo = rb["producted_year"]
        if car_publish_logo:
            car_publish_logo = re.sub(r"\d+", to_full_year, car_publish_logo)

        # Trim / version
        car_publish_version = sseg["logo_zh"]["content"]
        if not car_publish_version:
            car_publish_version = rb["logo_zh"] or ""

        # Engine displacement
        car_emission = sseg["engine"]["content"]
        if not car_emission:
            car_emission = rb["engine"]

        # Short model description: the non-empty parts joined by spaces.
        keywords = [
            item["car_brand"],
            item["car_series"],
            car_publish_logo,
            car_publish_version,
            rb["transmission_zh"],
            car_emission,
        ]
        standard_title = " ".join([k for k in keywords if k])

        item["car_publish_logo"] = car_publish_logo
        item["car_publish_version"] = car_publish_version
        item["standard_title"] = standard_title

        return item
Example #3
0
 def simple_segment(self, title):
     """Extract fields from ``title`` by regular-expression matching.

     Runs ``segment`` with ``self.seg_rule`` and returns a dict mapping each
     result key to its ``"content"`` value, omitting entries whose content
     is missing or falsy.
     """
     extracted = {}
     for field, match in segment(self.seg_rule, title).items():
         content = match.get("content")
         if content:
             extracted[field] = content
     return extracted