Example 1
def GetFirstType(url):
    obj = GetObj(url)
    html = obj.gethtml()
    coding = obj.getcodeing(html)
    soup = BeautifulSoup(html, "html5lib", from_encoding=coding)
    m = re.compile(r"navcar")
    content = soup.find_all("li", attrs={"class": m})
    url1 = {}
    for item in content:
        name = item.a.text
        if name == u"电动车":
            continue
        href = item.a.get("href")
        url1[name] = href
    return url1
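
A detail worth noting in this example: BeautifulSoup accepts a compiled regex as an attribute filter, so attrs={"class": m} matches any <li> whose class attribute contains "navcar". A minimal self-contained sketch of that behavior (the HTML snippet is made up for illustration):

# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup

html = u'<ul><li class="navcar-item"><a href="//example.com/suv">SUV</a></li></ul>'
soup = BeautifulSoup(html, "html5lib")
for li in soup.find_all("li", attrs={"class": re.compile(r"navcar")}):
    print(li.a.get("href"))  # //example.com/suv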
Example 2
def GetFirstType(url):
    obj = GetObj(url)  # get a crawler object
    html = obj.gethtml()  # fetch the requested page content
    coding = obj.getcodeing(html)  # detect the encoding
    soup = BeautifulSoup(
        html, "html5lib",
        from_encoding=coding)  # parse the html with the html5lib parser, decoding with the detected encoding
    m = re.compile(r"navcar")  # the re module provides regex support; match class attributes containing "navcar"
    content = soup.find_all("li", attrs={"class": m})  # search the document tree for <li> tags whose class matches "navcar"
    url1 = {}
    for item in content:  # loop over the matched tags
        name = item.a.text  # text of the child <a> node
        if name == u"电动车":  # skip the electric-vehicle ("电动车") category; the u prefix marks a unicode literal
            continue
        href = item.a.get("href")  # get the link
        href = "http:" + href  # hrefs are scheme-relative, so prepend the protocol
        url1[name] = href  # store it in the dict
    return url1
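
This variant prepends "http:" by hand because the site emits scheme-relative hrefs (//host/path). urljoin against a base URL performs the same normalization; a short sketch:

try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3

base = "http://newcar.xcar.com.cn/"
print(urljoin(base, "//newcar.xcar.com.cn/suv/"))  # http://newcar.xcar.com.cn/suv/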
Example 3
def firstGetHtml():
    obj = GetObj(url)
    html = obj.gethtml()
    coding = obj.getcodeing(html)
    soup = BeautifulSoup(html, "html5lib", from_encoding=coding)
    pattern = re.compile(r'i4')
    container = soup.find("div", attrs={"id": pattern})  # renamed from `list` to avoid shadowing the builtin
    li = container.find_all("li")
    for item in li:
        if item.a.get("href").strip() != '//':  # skip empty urls
            href = url + item.a.get("href")
            brand = item.a.text.strip()
            print brand + u" start crawling"
            logger.info(brand + u" start crawling")
            t = threading.Thread(target=threadSpider, args=(brand, href))
            t.start()
            while True:
                # threading.enumerate() returns the threads that are alive (started but not
                # yet finished); this busy-wait caps the workers at THARED_NUMBER (6)
                if len(threading.enumerate()) < THARED_NUMBER + 1:
                    break
    return
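
The while True poll over threading.enumerate() spins the CPU while it waits for a free slot. A BoundedSemaphore expresses the same "at most THARED_NUMBER crawler threads" rule without polling; a minimal sketch, assuming a worker with the shape of threadSpider(brand, href):

import threading

THARED_NUMBER = 6  # same limit the examples use
slots = threading.BoundedSemaphore(THARED_NUMBER)

def run_limited(target, args):
    slots.acquire()  # blocks while THARED_NUMBER workers are already running
    def wrapper():
        try:
            target(*args)
        finally:
            slots.release()  # free the slot even if the worker raises
    t = threading.Thread(target=wrapper)
    t.start()
    return t

Each run_limited(threadSpider, (brand, href)) call then blocks until a slot frees up, which is the behavior the polling loop approximates.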
Example 4
def GetFirstTypeAika(url):
    obj = GetObj(url)  # get a crawler object
    html = obj.gethtml()  # fetch the requested page content
    coding = obj.getcodeing(html)  # detect the encoding
    soup = BeautifulSoup(
        html, "html5lib",
        from_encoding=coding)  # parse the html with the html5lib parser, decoding with the detected encoding
    container = re.compile(r"container")
    content = soup.find_all("div", attrs={"class": container})  # search the document tree for <div> tags whose class matches "container"
    for item in content:
        first_latter = item.div.text.strip()
        print first_latter
        logger.info(u"letter " + first_latter + " start!...")
        if first_latter >= 'B':
            t = threading.Thread(target=giveFirstLatter,
                                 args=(first_latter, item))
            t.start()
            while True:
                # threading.enumerate() returns the threads that are alive (started but not
                # yet finished); this busy-wait caps the workers at THARED_NUMBER (6)
                if len(threading.enumerate()) < THARED_NUMBER + 1:
                    break
    return
Example 5
def giveFirstLatter(first_latter, item):
    fname = first_latter + ".txt"
    path = 'D:/pyLearning/spider-master/spider-master/spider/tmp/' + fname  # record file of processed entries for this letter; adjust the path as needed
    fobj = open(path, 'r+')
    fileList = fobj.read().splitlines()
    fobj.close()
    column_tit = re.compile(r'column_tit')
    column_tit = item.find_all("div", attrs={"class": column_tit})
    width = re.compile(r'848px')
    width = item.find_all("td", attrs={"width": width})
    for (brand, width) in zip(column_tit, width):
        print "--" + brand.a.span.text.strip()
        brands = brand.a.span.text.strip()
        logger.info(u"--品牌" + brand.a.span.text + " start!...")
        item_list = re.compile(r'item_list')
        item_list = width.find_all("div", attrs={"class": item_list})
        for i in item_list:
            print "---" + i.a.get("href")
            text = i.a.get("href").strip()
            if text not in fileList:  # if the href is not yet recorded, crawl it and write to the database
                logger.info(u"---model " + i.a.get("href") + " start!...")
                href = url2 + i.a.get("href") + "config.htm"
                logger.info(u"---link address " + href)
                obj = GetObj(href)
                html = obj.gethtml()
                while True:
                    if html is not None:
                        coding = obj.getcodeing(html)  # detect the encoding
                        soup = BeautifulSoup(html,
                                             'html5lib',
                                             from_encoding=coding)
                        base_title = re.compile(r'base_title')
                        base_title = soup.find_all("tr",
                                                   attrs={"id": base_title})
                        soup2 = base_title[0]  # the DOM row with id base_title
                        col = re.compile(r'col')
                        col = soup2.find_all("td", attrs={"scope": col})  # find the <td> cells
                        for i in col:
                            model = i.a.text.strip()  # extract the model
                            logger.info("model " + model)
                            modid = i.get("id")
                            mod = re.compile(r'(mod_)(.*)')
                            carid = re.search(mod, modid)
                            if hasattr(carid, 'group'):
                                carid = carid.group(2)
                                string = "bname_" + carid
                                db = ConnectDB()
                                n = db.select(table_name="carInfo1",
                                              field="vechiclesID",
                                              value=carid)
                                if n != 0:
                                    logger.info("vechiclesID: %s exists " %
                                                carid)
                                    continue
                                series = re.compile(string)
                                series = soup.find("td", attrs={"id": series})
                                if series is not None:  # extract the series
                                    series = series.a.text.strip()
                                    logger.info("series " + string)
                                else:
                                    logger.error(string + " not found!!!!")
                                    series = "-"

                                string = "type_name_" + carid  #获取到carType
                                carType = re.compile(string)
                                carType = soup.find("td",
                                                    attrs={"id": carType})
                                if not carType is None:
                                    carType = carType.a.text.strip()
                                    logger.info("carType " + carType)
                                else:
                                    logger.error(string + "not found!!!!")
                                    series = "-"

                                string = "m_newseat_" + carid  #获取到peopleNum
                                peopleNum = re.compile(string)
                                peopleNum = soup.find("td",
                                                      attrs={"id": peopleNum})
                                if not peopleNum is None:
                                    peopleNum = peopleNum.text.strip()
                                    logger.info("peopleNum " + peopleNum)
                                else:
                                    logger.error(string + "not found!!!!")
                                    peopleNum = "-"

                                string = "syear_" + carid  #获取到marketTime
                                marketTime = re.compile(string)
                                marketTime = soup.find(
                                    "td", attrs={"id": marketTime})
                                if not marketTime is None:
                                    marketTime = marketTime.text.strip()
                                    logger.info("marketTime " + marketTime)
                                else:
                                    logger.error(string + "not found!!!!")
                                    marketTime = "-"

                                string = "m_disl_working_mpower_" + carid  #获取到engine
                                engine = re.compile(string)
                                engine = soup.find("td", attrs={"id": engine})
                                if not engine is None:
                                    engine = engine.text.strip()
                                    logger.info("engine " + engine)
                                else:
                                    logger.error(string + "not found!!!")
                                    engine = "-"

                                string = "m_mdisl_" + carid
                                displacement = re.compile(string)
                                displacement = soup.find(
                                    "td", attrs={"id": displacement})
                                if not displacement is None:
                                    displacement = displacement.text.strip()
                                    logger.info("displacement " + displacement)
                                else:
                                    logger.error(string + "not found!!!")
                                    displacement = "-"

                                db.insertTyre2("carInfo1", carid, brands,
                                               series, carType, peopleNum,
                                               marketTime, engine,
                                               displacement, first_latter,
                                               model)
                                db.dbclose()
                            else:
                                logger.error(modid + u" could not extract a car id here!")
                                break
                        break
                    else:
                        time.sleep(360)
                        html = obj.gethtml()
                fobj = open(path, 'a+')
                print u'writing ' + first_latter + ' ' + text
                fobj.write(text + '\n')
                fobj.flush()
                fobj.close()
            else:
                logger.info(u"跳过" + i.a.get("href"))
                print u"跳过" + i.a.get("href")
                continue  #否则进行下一条判断
    print first_latter + u"已完成!!!"
    logger.info(first_latter + u"已完成!!!")
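
The carid extraction above guards re.search with hasattr(carid, 'group'); testing the match object against None and using a single capture group is the more common shape. A small sketch of the same extraction (the sample ids are made up):

import re

MOD_ID = re.compile(r'mod_(\w+)')  # the car id follows the mod_ prefix

def extract_carid(modid):
    """Return the id embedded in an element id like 'mod_12345', or None."""
    match = MOD_ID.search(modid)
    return match.group(1) if match else None

print(extract_carid("mod_12345"))  # 12345
print(extract_carid("unrelated"))  # None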
Example 6
def threadSpider(brand, url2):  # receives the brand name and its page url
    fname = brand + ".txt"
    path = "../file/" + fname
    fobj = open(path, 'a+')
    fobj.seek(0)  # 'a+' opens positioned at end of file, so rewind before reading
    fileList = fobj.read().splitlines()
    print fileList
    fobj.close()
    obj = GetObj(url2)
    html = obj.gethtml()
    coding = obj.getcodeing(html)
    soup = BeautifulSoup(html,"html5lib",from_encoding=coding)
    clearfix = re.compile(r'list clearfix')
    clearfix = soup.find_all("div",attrs={"class":clearfix})
    figure = clearfix[1].find_all("a")
    for item in figure:
        flow = item.text.strip()
        streak = re.sub(r'\([^\)]*\)', "", flow)    # strip the parenthesized part to get the tread pattern
        logger.info("streak:" + streak)
        href = item.get("href")
        newUrl = url + href
        obj = GetObj(newUrl)
        html = obj.gethtml()
        coding = obj.getcodeing(html)
        soup = BeautifulSoup(html,"html5lib",from_encoding=coding)
        clearfix = re.compile(r'products clearfix')
        clearfix = soup.find("div",attrs={"class":clearfix})
        clearfix2 = re.compile(r'product clearfix')
        clearfix2 = clearfix.find_all("div",attrs={"class":clearfix2})
        for i in clearfix2:
            name = i.a.get("title")                     #获取到轮胎name
            print name
            logger.info("name:" + name)
            href = i.a.get("href")
            print href
            xx= href.split("/")
            xh = xx[2].split(".")
            tyreid = xh[0]
            if href not in fileList:
                fobj = open(path,'a+')
                print u'writing ' + href
                fobj.write(href+'\n')
                fobj.flush()
                fobj.close()

                db=ConnectDB()
                n = db.select(table_name="tyreinfo",field="tyreID",value=tyreid)
                if n != 0:
                    logger.info("tyreID: %s exists " %  tyreid )
                    print tyreid + u"存在"
                    continue
                tyreUrl = url + href
                tyreObj = GetObj(tyreUrl)
                tyreHtml = tyreObj.gethtml()
                tyreSoup = BeautifulSoup(tyreHtml,"html5lib",from_encoding=coding)
                basic = re.compile(r'basic free')
                basic = tyreSoup.find("div",attrs={"class":basic})
                fl = re.compile(r'fl')
                fl = basic.find("span",attrs={"class":fl})


                standard = fl.text.strip()                  # the standard/spec string
                logger.info("standard:" + standard)

                dl = basic.find_all("dl")
                loaded = dl[4].dd.text.strip()            # the load rating
                #loaded = re.sub(r'\([^\)]*\)',"",loaded)
                logger.info("load:" + loaded)

                speed = dl[5].dd.text.strip()       # the speed rating
                #speed = re.sub(r'\([^\)]*\)',"",speed)
                logger.info("speed:"+speed)

                place = dl[6].dd.text.strip()
                logger.info("place:"+place)



                pi3c = re.compile(r'clearfix pi3c')
                pi3c = basic.find("div",attrs={"class":pi3c})
                pi3c = pi3c.find_all("em")

                wearproof = pi3c[0].text.strip()            # the treadwear rating
                #wearproof = ""
                logger.info("wearproof:"+wearproof)

                traction = pi3c[1].text.strip()             # the traction rating
                logger.info("traction:"+traction)

                highTemperature = pi3c[2].text.strip()      # the temperature rating
                logger.info("highTemperature:"+highTemperature)

                db.insert("tyreinfo",tyreid,brand,streak,name,standard,loaded,speed,wearproof,traction,highTemperature)
                db.dbclose()

            else:
                logger.info(u"跳过"+href)
                print(u"跳过"+href)
                continue
    logger.info("finish:" + brand)
Example 7
def thrad(type_name, url2):
    #logger.info("name:%s url: %s" % (type_name,url2))
    url2 = url2.encode("utf-8")  # encode the url as utf-8
    obj = GetObj(url2)  # get a crawler object
    html = obj.gethtml()  # fetch the page
    coding = obj.getcodeing(html)  # detect the encoding
    soup = BeautifulSoup(html, 'html5lib', from_encoding=coding)

    #print "----------------------------------------------"
    #print type_name
    #print "----------------------------------------------"
    logger.info("start %s...." % type_name)
    content = soup.find("div",
                        attrs={"class": ["tab-content-item",
                                         "current"]})  #find返回的不是列表是文本
    soup = BeautifulSoup(str(content), 'html5lib')  #再返回一个soup对象
    index = soup.find_all('span', attrs={'class': "font-letter"})  #找到字典顺序
    box = soup.find_all(
        'div', attrs={'class': ["uibox-con", "rank-list", "rank-list-pic"]})
    for (index, box) in zip(index, box):
        #for item in box:
        # pair each letter-delimited div with its letter index
        index = index.text.strip()  # strip() removes surrounding whitespace by default
        brand_soup = BeautifulSoup(str(box), 'html5lib')  # a soup object for this box
        brand_html = brand_soup.find_all('dl')
        for brand_item in brand_html:
            # brand name
            brand = brand_item.dt.text.strip()  # the brand
            series_html = brand_item.dd
            series_soup = BeautifulSoup(str(series_html),
                                        'html5lib')  # soup for the sub-list under the <dd> tag
            manufacturer_name = series_soup.find_all('div',
                                                     attrs={"class":
                                                            "h3-tit"})  # manufacturer names
            ul = series_soup.find_all('ul', attrs={"class": "rank-list-ul"})
            for (manufacturer, ul_tag) in zip(manufacturer_name, ul):
                # extract the manufacturer name
                manufacturer = manufacturer.text
                logger.info("start %s...." % manufacturer)
                logger.debug(ul_tag)
                soup = BeautifulSoup(str(ul_tag), 'html5lib')
                w = re.compile(r's\d+')
                litag = soup.find_all('li', id=w)
                for item in litag:
                    # extract the series name
                    series = item.h4.text
                    db = ConnectDB()  # open a database connection
                    n = db.select(table_name="carinfo",
                                  field="series",
                                  value=series)  # query for the series
                    db.dbclose()  # close the connection
                    if n != 0:
                        logger.info("%s %s %s exists " %
                                    (type_name, brand, series))  # a hit means the record already exists
                        continue
                    href = item.h4.a.get("href")  # otherwise take its link
                    price = item.div.text  # record the price
                    url_id = href.split("/")[3]  # record the url_id
                    #print "●●%s %s %s" % (series,price,url_id)
                    # build the config-page URL for cars on sale
                    sale_conf_url = "http://car.autohome.com.cn/config/series/%s.html" % url_id
                    # build the config-page URL for discontinued cars
                    stop_sale_conf_url = "http://www.autohome.com.cn/%s/sale.html" % url_id
                    url_dic = {
                        "sale_conf_url": sale_conf_url,
                        "stop_sale_conf_url": stop_sale_conf_url
                    }
                    #threads=[]
                    for (url_name, sale_url) in url_dic.items():
                        # on sale
                        if url_name == "sale_conf_url":
                            status = u"在售"  # "on sale"
                            #print sale_url
                            #def get_josn():
                            log_mess = "%s:%s %s %s %s %s %s %s" % (
                                status, type_name, index, brand, manufacturer,
                                series, price, url_id)
                            obj = GetObj(sale_url)
                            conf = obj.getconf()
                            if conf:
                                #print conf
                                logger.info(log_mess)
                                """SaveData(table_name="spider_json",    #存储到数据库
                                    brand=brand,
                                    series=series,
                                    conf=conf,
                                    status=status,
                                    index=index,
                                    URL_=sale_conf_url,
                                    level=type_name,
                                    manufacturer = manufacturer)"""
                                SaveDataToCarInfo("carinfo", brand, series,
                                                  type_name, conf, index)
                            else:
                                mess = u"没有找到相关配置"
                                logger.info("%s %s" % (log_mess, mess))
                                #print mess
                        else:

                            # discontinued
                            #def get_stop_conf():
                            status = u"停售"  # "discontinued"
                            obj = GetObj(sale_url)
                            html = obj.gethtml()
                            coding = obj.getcodeing(html)
                            soup = BeautifulSoup(html,
                                                 'html5lib',
                                                 from_encoding=coding)
                            filter_html = soup.find_all(
                                'div', attrs={"class": "models_nav"})
                            log_mess = "%s:%s %s %s %s %s %s %s" % (
                                status, type_name, index, brand, manufacturer,
                                series, price, url_id)
                            if filter_html:
                                for item in filter_html:
                                    href = item.find('a',
                                                     text=u'参数配置').get("href")  # the "参数配置" (spec/config) link
                                    stop_sale_conf_url_1 = url + href
                                    obj = GetObj(stop_sale_conf_url_1)
                                    conf = obj.getconf()
                                    if conf:
                                        #print conf
                                        logger.info("%s %s" % (log_mess, href))
                                        """SaveData(table_name="spider_json",
                                            brand=brand,
                                            series=series,
                                            conf=conf,
                                            status=status,
                                            index=index,
                                            level=type_name,
                                            URL_=stop_sale_conf_url_1)"""
                                        #print u"在售品牌中的停售车辆"
                                        SaveDataToCarInfo(
                                            "carinfo", brand, series,
                                            type_name, conf, index)
                                    else:
                                        mess = u"没有找到相关配置"
                                        logger.info("%s %s %s" %
                                                    (log_mess, mess, href))
                                        #print mess
                            else:
                                mess = u"没有找到相关配置"
                                logger.info("%s %s" % (log_mess, mess))
Example 8
#small cars: ok
#mini cars: ok
#compact cars: ok
#mid-size cars: the seat count
#large cars: ok
#SUV
#MPV
#

reload(sys)
sys.setdefaultencoding('utf8')  # Python 2: make utf-8 the default string encoding
url = "http://newcar.xcar.com.cn/2365/config.htm"
obj = GetObj(url)
html = obj.gethtml()
c = re.compile(r'(var specIDs =)(\[.*\])')
coding = obj.getcodeing(html)  # detect the encoding
soup = BeautifulSoup(html, "html5lib", from_encoding=coding)
print soup
temp = re.search(c, html)
if hasattr(temp, 'group'):
    temp = temp.group(2)
else:
    # fall back to rendering with a real browser when the inline specIDs script is missing
    driver = webdriver.Chrome(
        executable_path=
        r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
    #    driver = webdriver.Firefox()
    # open the page, then re-run the search against the rendered source;
    # otherwise json.loads below would receive None
    driver.get(url)
    temp = re.search(c, driver.page_source).group(2)
    driver.quit()
ss = json.loads(temp)
data = {}
data["spaceid"] = ss[0]
Example 9
def thrad(type_name,url2):
    logger.info("name:%s url: %s" % (type_name,url2))
    url2=url2.encode("utf-8")
    obj = GetObj(url2)
    html=obj.gethtml()
    coding=obj.getcodeing(html)
    soup=BeautifulSoup(html,'html5lib',from_encoding=coding)
    

    #print "----------------------------------------------"
    #print type_name
    #print "----------------------------------------------"
    logger.info("start %s...." % type_name)
    content=soup.find("div",attrs={"class":["tab-content-item","current"]})
    soup=BeautifulSoup(str(content),'html5lib')
    index = soup.find_all('span',attrs={'class':"font-letter"})
    box =  soup.find_all('div',attrs={'class':["uibox-con", "rank-list","rank-list-pic"]})
    for (index,box) in zip(index,box):
    #for item in box:
        # pair each letter-delimited div with its letter index
        index = index.text.strip()
        brand_soup  = BeautifulSoup(str(box),'html5lib')
        brand_html=brand_soup.find_all('dl')
        for brand_item in brand_html:
            # brand name
            brand  = brand_item.dt.text.strip()
            series_html = brand_item.dd
            series_soup=BeautifulSoup(str(series_html),'html5lib')
            manufacturer_name=series_soup.find_all('div',attrs={"class":"h3-tit"})
            ul=series_soup.find_all('ul',attrs={"class":"rank-list-ul"})
            for (manufacturer,ul_tag) in zip(manufacturer_name,ul):
                # extract the manufacturer name
                manufacturer=manufacturer.text
                logger.info("start %s...." % manufacturer )
                logger.debug(ul_tag)
                soup=BeautifulSoup(str(ul_tag),'html5lib')
                w=re.compile(r's\d+')
                litag=soup.find_all('li',id=w)
                for item in litag:
                    # extract the series name
                    series=item.h4.text
                    db=ConnectDB()
                    n=db.select(table_name="spider_json",field="series",value=series)
                    db.dbclose()
                    if n != 0:
                        logger.info("%s %s %s exists " % (type_name,brand, series) )
                        continue
                    href=item.h4.a.get("href")
                    price=item.div.text
                    url_id=href.split("/")[3]
                    #print "●●%s %s %s" % (series,price,url_id)
                    # build the config-page URL for cars on sale
                    sale_conf_url="http://car.autohome.com.cn/config/series/%s.html" % url_id
                    # build the config-page URL for discontinued cars
                    stop_sale_conf_url="http://www.autohome.com.cn/%s/sale.html" % url_id
                    url_dic={"sale_conf_url":sale_conf_url,"stop_sale_conf_url":stop_sale_conf_url}
                    #threads=[]
                    for (url_name,sale_url) in url_dic.items():
                        # on sale
                        if url_name == "sale_conf_url":
                            status=u"在售"  # "on sale"
                            #print sale_url
                            #def get_josn():
                            log_mess="%s:%s %s %s %s %s %s %s" % (status,type_name,index,brand,manufacturer,series,price,url_id)
                            obj=GetObj(sale_url)
                            conf=obj.getconf()
                            if conf:
                                #print conf
                                logger.info(log_mess)
                                SaveData(table_name="spider_json",
                                    brand=brand,
                                    series=series,
                                    conf=conf,
                                    status=status,
                                    index=index,
                                    URL_=sale_conf_url)
                                
                            else:
                                mess= u"没有找到相关配置"
                                logger.info("%s %s" % (log_mess,mess))
                                #print mess
                        else:
                
                            # discontinued
                            #def get_stop_conf():
                            status=u"停售"  # "discontinued"
                            obj=GetObj(sale_url)
                            html=obj.gethtml()
                            coding=obj.getcodeing(html)
                            soup=BeautifulSoup(html,'html5lib',from_encoding=coding)
                            filter_html=soup.find_all('div',attrs={"class":"models_nav"})
                            log_mess="%s:%s %s %s %s %s %s %s" % (status,type_name,index,brand,manufacturer,series,price,url_id)
                            if filter_html:
                                for item in filter_html:
                                    href=item.find('a',text=u'参数配置').get("href")  # the "参数配置" (spec/config) link
                                    stop_sale_conf_url_1=url+href
                                    obj=GetObj(stop_sale_conf_url_1)
                                    conf=obj.getconf()
                                    if conf:
                                        #print conf
                                        logger.info("%s %s" % (log_mess,href))
                                        SaveData(table_name="spider_json",
                                            brand=brand,
                                            series=series,
                                            conf=conf,
                                            status=status,
                                            index=index,
                                            URL_=stop_sale_conf_url_1)
                                        #print u"在售品牌中的停售车辆"
                
                                    else:
                                        mess= u"没有找到相关配置"
                                        logger.info("%s %s %s" % (log_mess,mess,href))
                                        #print mess
                            else:
                                mess= u"没有找到相关配置"
                                logger.info("%s %s" % (log_mess,mess))