Beispiel #1
0
def SaveData(table_name="",
             brand="",
             series="",
             conf="",
             status="",
             URL_="",
             index=""):
    """Insert one row per car model found in a JSON configuration payload.

    ``conf`` is decoded with ``json.loads`` and iterated as a mapping of
    ``spaceid`` to an attribute dict. Entries whose model name is ``'-'``
    are skipped, as are entries for which the existence check reports a
    non-zero result.

    Args:
        table_name: Table passed to ``db.insert``.
        brand: Brand value stored with every row.
        series: Series value stored with every row.
        conf: JSON text mapping each spaceid to its attribute dict.
        status: Status value stored with every row.
        URL_: Source URL stored with every row.
        index: Index value stored with every row.

    Raises:
        ValueError: If ``conf`` is not valid JSON.
        KeyError: If an entry lacks one of the attribute keys read below.
    """
    # The leading group is greedy, so the LAST "20xx" in the name is captured.
    year_pattern = re.compile(r'(.*)(20\d\d)(.*)')

    for spaceid, v in json.loads(conf).items():
        models = v[u"车型名称"]
        if models == '-':
            continue

        year_match = year_pattern.search(models)
        year = int(year_match.group(2)) if year_match else 0

        # The guide price is stored exactly as delivered (no numeric parsing).
        guide_price = v[u"厂商指导价(元)"]
        emission_standard = v[u"环保标准"]
        structure = v[u"车身结构"]
        level = v[u"级别"]
        manufacturer = v[u"厂商"]
        # 'utf-8' is already the default input encoding on Python 2 and the
        # keyword no longer exists on Python 3, so it is simply omitted.
        json_text = json.dumps(v, ensure_ascii=False)

        db = ConnectDB()
        try:
            # NOTE(review): the existence check always queries "spider_json"
            # while the insert targets `table_name` -- confirm this is intended.
            n = db.select(table_name="spider_json",
                          field="spaceid",
                          value=spaceid)
            if n != 0:
                logger.info("spaceid: %s exists " % spaceid)
                continue
            db.insert(table_name=table_name,
                      spaceid=spaceid,
                      brand=brand,
                      series=series,
                      models=models,
                      guide_price=guide_price,
                      level=level,
                      emission_standard=emission_standard,
                      structure=structure,
                      status=status,
                      manufacturer=manufacturer,
                      year=year,
                      index=index,
                      json_text=json_text,
                      URL_=URL_)
        finally:
            # Runs on the `continue` path and on errors too, so the
            # connection is always released.
            db.dbclose()
Beispiel #2
0
def SaveData(table_name="",brand="",series="",conf="",status="",URL_="",index=""):
    """Insert one row per car model found in a JSON configuration payload.

    ``conf`` is decoded with ``json.loads`` and iterated as a mapping of
    ``spaceid`` to an attribute dict. Entries whose model name is ``'-'``
    are skipped, as are entries for which the existence check reports a
    non-zero result.

    Args:
        table_name: Table passed to ``db.insert``.
        brand: Brand value stored with every row.
        series: Series value stored with every row.
        conf: JSON text mapping each spaceid to its attribute dict.
        status: Status value stored with every row.
        URL_: Source URL stored with every row.
        index: Index value stored with every row.

    Raises:
        ValueError: If ``conf`` is not valid JSON.
        KeyError: If an entry lacks one of the attribute keys read below.
    """
    # The leading group is greedy, so the LAST "20xx" in the name is captured.
    year_pattern = re.compile(r'(.*)(20\d\d)(.*)')

    for spaceid, v in json.loads(conf).items():
        models = v[u"车型名称"]
        if models == '-':
            continue

        year_match = year_pattern.search(models)
        year = int(year_match.group(2)) if year_match else 0

        # The guide price is stored exactly as delivered (no numeric parsing).
        guide_price = v[u"厂商指导价(元)"]
        emission_standard = v[u"环保标准"]
        structure = v[u"车身结构"]
        level = v[u"级别"]
        manufacturer = v[u"厂商"]
        # 'utf-8' is already the default input encoding on Python 2 and the
        # keyword no longer exists on Python 3, so it is simply omitted.
        json_text = json.dumps(v, ensure_ascii=False)

        db = ConnectDB()
        try:
            # NOTE(review): the existence check always queries "spider_json"
            # while the insert targets `table_name` -- confirm this is intended.
            n = db.select(table_name="spider_json",
                          field="spaceid",
                          value=spaceid)
            if n != 0:
                logger.info("spaceid: %s exists " % spaceid)
                continue
            db.insert(table_name=table_name,
                      spaceid=spaceid,
                      brand=brand,
                      series=series,
                      models=models,
                      guide_price=guide_price,
                      level=level,
                      emission_standard=emission_standard,
                      structure=structure,
                      status=status,
                      manufacturer=manufacturer,
                      year=year,
                      index=index,
                      json_text=json_text,
                      URL_=URL_)
        finally:
            # Runs on the `continue` path and on errors too, so the
            # connection is always released.
            db.dbclose()
Beispiel #3
0
def threadSpider(brand,url2):
    """Crawl the tyres of one brand and store the unseen ones in ``tyreinfo``.

    The brand page at ``url2`` is fetched, every tread-pattern link in its
    second "list clearfix" block is followed, and each product on those
    pages is scraped unless its href is already recorded in
    ``../file/<brand>.txt`` or its id already exists in the database.

    Args:
        brand: Brand name; also names the progress file ``../file/<brand>.txt``.
        url2: URL of the brand's listing page.

    NOTE(review): relative hrefs are joined onto a name ``url`` that is not
    defined in this function -- presumably a module-level base URL; confirm.
    """
    path = "../file/" + brand + ".txt"
    # 'a+' creates the file on the first run. Where the read position starts
    # in append mode differs between platforms, so rewind before reading.
    with open(path, 'a+') as fobj:
        fobj.seek(0)
        fileList = fobj.read().splitlines()
    print(fileList)
    # Set for O(1) membership tests; also updated as new hrefs are written so
    # an href repeated within one run is not appended to the file twice.
    seen = set(fileList)

    obj = GetObj(url2)
    html = obj.gethtml()
    coding = obj.getcodeing(html)
    soup = BeautifulSoup(html, "html5lib", from_encoding=coding)
    listings = soup.find_all("div", attrs={"class": re.compile(r'list clearfix')})

    for item in listings[1].find_all("a"):
        # Tread pattern name with any parenthesised suffix removed.
        streak = re.sub(r'\([^\)]*\)', "", item.text.strip())
        logger.info("streak:" + streak)

        newUrl = url + item.get("href")
        obj = GetObj(newUrl)
        html = obj.gethtml()
        coding = obj.getcodeing(html)
        soup = BeautifulSoup(html, "html5lib", from_encoding=coding)
        products = soup.find("div", attrs={"class": re.compile(r'products clearfix')})

        for i in products.find_all("div", attrs={"class": re.compile(r'product clearfix')}):
            name = i.a.get("title")  # tyre name
            print(name)
            logger.info("name:" + name)
            href = i.a.get("href")
            print(href)
            # The tyre id is the file-name stem of the third path segment.
            tyreid = href.split("/")[2].split(".")[0]

            if href in seen:
                logger.info(u"跳过"+href)
                print(u"跳过"+href)
                continue

            # Record the href before scraping, as the original did.
            with open(path, 'a+') as fobj:
                print(u'写入'+href)
                fobj.write(href+'\n')
            seen.add(href)

            db = ConnectDB()
            try:
                n = db.select(table_name="tyreinfo", field="tyreID", value=tyreid)
                if n != 0:
                    logger.info("tyreID: %s exists " % tyreid)
                    print(tyreid + u"存在")
                    continue

                tyreUrl = url + href
                tyreObj = GetObj(tyreUrl)
                tyreHtml = tyreObj.gethtml()
                # Reuses the encoding detected for the pattern page.
                tyreSoup = BeautifulSoup(tyreHtml, "html5lib", from_encoding=coding)
                basic = tyreSoup.find("div", attrs={"class": re.compile(r'basic free')})

                standard = basic.find("span", attrs={"class": re.compile(r'fl')}).text.strip()
                logger.info("standard:" + standard)

                dl = basic.find_all("dl")
                loaded = dl[4].dd.text.strip()
                logger.info("load:" + loaded)

                speed = dl[5].dd.text.strip()
                logger.info("speed:"+speed)

                # Logged only; `place` is not part of the inserted row.
                place = dl[6].dd.text.strip()
                logger.info("place:"+place)

                grades = basic.find("div", attrs={"class": re.compile(r'clearfix pi3c')}).find_all("em")

                wearproof = grades[0].text.strip()
                logger.info("wearproof:"+wearproof)

                traction = grades[1].text.strip()
                logger.info("traction:"+traction)

                highTemperature = grades[2].text.strip()
                logger.info("highTemperature:"+highTemperature)

                db.insert("tyreinfo",tyreid,brand,streak,name,standard,loaded,speed,wearproof,traction,highTemperature)
            finally:
                # Also reached via the `continue` above and on scrape errors,
                # so the connection is always released.
                db.dbclose()
    logger.info("finish:" + brand)
Beispiel #4
0
def _extract_model_name(name):
    """Return the model name from ``name`` with ``<span>`` markup stripped.

    Three patterns are tried in order: text between two span tags, text
    after a closing span, text before an opening span. If none matches,
    ``name`` is returned unchanged.
    """
    for pattern in (r'span>(.*?)<span', r'</span>(.*?)$', r'^(.*?)<span'):
        found = re.search(pattern, name)
        if found is not None:
            return found.group(1)
    return name


def SaveData(table_name="",
             brand="",
             series="",
             conf="",
             status="",
             URL_="",
             index="",
             level="",
             manufacturer=""):
    """Insert one row per car model found in a JSON configuration payload.

    ``conf`` is decoded with ``json.loads`` and iterated as a mapping of
    ``spaceid`` to a dict keyed by field id. The fields read are "567"
    (model name), "0" (year), "219" (guide price), "281" (body structure)
    and "1072" (emission standard); markup around the values is stripped
    with regular expressions. Entries whose model name is ``'-'`` are
    skipped, as are entries for which the existence check reports a
    non-zero result.

    Args:
        table_name: Table passed to ``db.insert``.
        brand: Brand value stored with every row.
        series: Series value stored with every row.
        conf: JSON text mapping each spaceid to its field dict.
        status: Status value stored with every row.
        URL_: Source URL stored with every row.
        index: Index value stored with every row.
        level: Level value stored with every row.
        manufacturer: Manufacturer value stored with every row.

    Raises:
        ValueError: If ``conf`` is not valid JSON.
        KeyError: If an entry lacks one of the field ids read below.
        AttributeError: If the year, structure or emission-standard text
            does not match the extraction pattern.
    """
    text_pattern = re.compile(r'(</span>|>|^)(.*?)($|<|<span)')
    # NOTE(review): the dots in the price pattern are unescaped, so they match
    # any character. Left as-is because integer prices currently rely on it.
    price_pattern = re.compile(
        r'(</span>|>|^)([1-9]\d*.\d*|0.\d*[1-9]\d*)($|<|</span>)')

    for spaceid, v in json.loads(conf).items():
        name = v["567"]
        if name == '-':
            continue
        name1 = _extract_model_name(name)
        print(name1)

        year = text_pattern.search(v["0"]).group(2)

        price_match = price_pattern.search(v["219"])
        # Falls back to the integer 0 when no price can be extracted.
        guide_price = price_match.group(2) if price_match is not None else 0
        print(guide_price)

        structure = text_pattern.search(v["281"]).group(2)
        print(structure)

        emission_standard = text_pattern.search(v["1072"]).group(2)
        print(emission_standard)

        db = ConnectDB()
        try:
            # NOTE(review): the existence check always queries "spider_json"
            # while the insert targets `table_name` -- confirm this is intended.
            n = db.select(table_name="spider_json",
                          field="spaceid",
                          value=spaceid)
            if n != 0:
                logger.info("spaceid: %s exists " % spaceid)
                continue
            # json_text is deliberately stored empty, as in the original.
            db.insert(table_name=table_name,
                      spaceid=spaceid,
                      brand=brand,
                      series=series,
                      models=name1,
                      guide_price=guide_price,
                      level=level,
                      emission_standard=emission_standard,
                      structure=structure,
                      status=status,
                      manufacturer=manufacturer,
                      year=year,
                      index=index,
                      json_text="",
                      URL_=URL_)
        finally:
            # Runs on the `continue` path and on errors too, so the
            # connection is always released.
            db.dbclose()