Code Example #1
def main5():
    'Fetch the personal-information listing pages'
    f5 = open('personal_info.csv', 'w+', newline='', encoding='gb18030')
    csv_write5 = csv.writer(f5)
    # f6=open('personal_info.txt','w+',encoding='utf-8')
    for i in range(1, 13):
        print(i)
        url = 'http://zupu.syshenshi.com/ZR.aspx?pageindex=' + str(
            i
        ) + '&infoname=&mobile=&infotype=&infotype_name=&industry=&address=&iswaiqian=0&fuqin=&muqin=&peiou=&zinv=&muyuandizhi=&waiqiandizhi=&chushengriqi_begin=&chushengriqi_end=&qushiriqi_begin=&qushiriqi_end=&waiqian_begin=&waiqian_end='
        try:
            bsObj = mytemp.getObj(url)
            trlist = bsObj.find('table',
                                class_='stdtable').find('tbody').find_all('tr')
        except:
            time.sleep(10)
            try:
                bsObj = mytemp.getObj(url)
                trlist = bsObj.find(
                    'table', class_='stdtable').find('tbody').find_all('tr')
            except:
                time.sleep(10)
                bsObj = mytemp.getObj(url)
                trlist = bsObj.find(
                    'table', class_='stdtable').find('tbody').find_all('tr')
        for tr in trlist:
            tdlist = tr.find_all('td')
            row = []
            for td in tdlist:
                if td.find('a') != None:
                    row.append(td.find('a').attrs['href'])
                row.append(td.get_text())
            # f6.write(str(row)+'\n')
            csv_write5.writerow(row)
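
Note: the triple-nested try/except retry above (repeated in Examples #5 and #6) can be collapsed into one helper. A minimal sketch, assuming mytemp.getObj raises on failure and that the table may be absent; fetch_trlist is a hypothetical name, not part of pySpider-2:

import time

def fetch_trlist(url, retries=3, wait=10):
    # retry the fetch a few times, sleeping between attempts
    last_err = None
    for attempt in range(retries):
        try:
            bsObj = mytemp.getObj(url)
            return bsObj.find('table',
                              class_='stdtable').find('tbody').find_all('tr')
        except Exception as e:  # request failed or the table was missing
            last_err = e
            time.sleep(wait)
    raise last_err

Each nested block in main5 would then reduce to trlist = fetch_trlist(url).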
Code Example #2
File: zhongchou.py Project: braimp/pySpider-2
def getLink():
    f1 = open(filename + '.csv', 'w+', newline='', encoding='gb18030')
    csv_write = csv.writer(f1)
    for i in range(1, 7):
        newurl = url + str(i)
        bsObj = mytemp.getObj(newurl).find('div',
                                           class_='sousuoListBox clearfix')
        divlist = bsObj.find_all('div', class_='ssCardItem')
        print(len(divlist))
        # return
        for div in divlist:
            a = div.find('a', class_='siteCardICH3')
            title = a.attrs['title'].replace(',', '_')
            href = a.attrs['href']
            keylist = div.find(
                'div', class_='siteCardFLabelBox siteIlB_box').find_all('a')
            keySum = len(keylist)
            haveGet = div.find('div',
                               class_='ftDiv').find('p',
                                                    class_='ftP').get_text()
            support = div.find('div',
                               class_='scDiv').find('p',
                                                    class_='ftP').get_text()
            jindu = div.find('div',
                             class_='thDiv').find('p',
                                                  class_='ftP').get_text()
            row = [filename, title, href, keySum, haveGet, support, jindu]
            print(row)
            csv_write.writerow(row)
Code Example #3
File: toy_detail.py Project: braimp/pySpider-2
def get_detail(line):
    bs_obj=mytemp.getObj(line[1],True)
    img_link=bs_obj.find('div',class_='tab-pane').find('img').attrs['src']
    weight=''
    try:
        weight_list=bs_obj.find('div',class_='attributes-item mod-info kuajing-attribues').find_all('span')
        for w in weight_list:
            weight_content=w.get_text()
            if weight_content.find('产品净重')!=-1:
                weight=weight_content.replace('产品净重','').replace('\n','')
                break
    except:
        # the weight block is missing on some pages: log the record to the
        # error file and fall through with an empty weight
        csv_write1.writerow(line)
        print(line)

    # locate the specification (规格) cell in the detail table
    guige=''
    td_list=bs_obj.find('div',{'id':'mod-detail-attributes'}).find_all('td')
    for m in range(0,len(td_list)):
        # print(td_list[m].get_text())
        if td_list[m].get_text().find('规格')!=-1:
            guige=td_list[m+1].get_text()
            break
    row=line+[img_link,weight,guige]
    # price and quantity tiers

    price_list=bs_obj.find('div',{'id':'mod-detail-price'})
    price_td_list=price_list.find('tr',class_='price').find_all('td')
    amount_td_list=price_list.find('tr',class_='amount').find_all('td')
    for n in range(1,len(price_td_list)):
        row.append(price_td_list[n].get_text()+' '+amount_td_list[n].get_text())

    print(row)
    csv_write.writerow(row)
Code Example #4
File: dzdpJb.py Project: braimp/pySpider-2
def get_detail():
    
    f2=open('wedding_final.csv','a+',encoding='gb18030',newline='')
    csv_write2=csv.writer(f2)
    f3=open('wedding_error1.csv','w+',encoding='gb18030',newline='')
    csv_write3=csv.writer(f3)
    for line in open('wedding_error.csv','r',encoding='gb18030'):
        
        line=line.split(',')
        if line[0][0]=='"':
            continue  # skip rows whose first field is quoted (it contains commas)
        # print(line)
        # swap the first 11 chars ('https://www') for the mobile host prefix
        url=str('https://m'+line[1][11:]).strip().replace('\n','')
        print(url)
        # time.sleep(3)
        bsObj=mytemp.getObj(url,False,cook)
        # print(bsObj)
        # print(bsObj.find('body',class_='shop-details'))
        # jo=bsObj.find('textarea',{'id':'shop-detail'}).get_text()
        # print(jo)
        # break
        try:
            address=bsObj.find('div',class_='J_address').get_text().strip()
            phone=bsObj.find('div',class_='J_phone').get_text().strip().replace(' ','')+'\t'
            # print('daoda')
        except:
            csv_write3.writerow(line)
            print(line)
            continue
        row=line+[address,phone]
        print(row)
        csv_write2.writerow(row)
Code Example #5
def main3(ty):
    # write out the personal-information detail pages
    f_detail = open('P_detail' + str(ty) + '.txt',
                    'w+',
                    newline='',
                    encoding='utf-8')
    # f3=open('82.txt','w+',enc)
    for line in open('p_name' + str(ty) + '.txt', 'r', encoding='utf-8'):
        line = eval(line)
        # print(line)
        if line[0] == '0':
            continue
        # break
        url = 'http://zupu.syshenshi.com/zuren_detail.aspx?id=' + line[0]
        print(url)
        try:
            bsObj = mytemp.getObj(url)
            trlist = bsObj.find('table',
                                class_='stdtable').find('tbody').find_all('tr')
        except:
            time.sleep(20)
            try:
                bsObj = mytemp.getObj(url)
                trlist = bsObj.find(
                    'table', class_='stdtable').find('tbody').find_all('tr')
            except:
                time.sleep(20)
                bsObj = mytemp.getObj(url)
                trlist = bsObj.find(
                    'table', class_='stdtable').find('tbody').find_all('tr')
        row = line + []
        row1 = []
        for tr in trlist:
            tdlist3 = tr.find_all('td', class_='center')
            for td in tdlist3:
                row.append(td.get_text().replace(" ", ''))
            tdlist = tr.find_all('td', class_='left')
            for td in tdlist:
                row.append(td.get_text().replace(" ", ''))
        if len(row) == 31:
            # rows that come back two fields short get padded at position 26 so the columns line up
            row = row[:26] + ['', ''] + row[26:]
        f_detail.write(str(row) + '\n')
    f_detail.close()
Code Example #6
def main7(id, ty, page):
    line = []
    for line in open('./page' + str(page) + '/p_name' + str(ty) + '.txt',
                     'r',
                     encoding='utf-8'):
        line = eval(line)
        if line[0] == id:
            print('right')
            break
    if line[0] != id:
        # 'raise id' would itself fail: only exception instances can be raised
        raise ValueError('id not found: ' + str(id))
    print(line[0])
    url = 'http://zupu.syshenshi.com/zuren_detail.aspx?id=' + line[0]
    print(url)
    try:
        bsObj = mytemp.getObj(url)
        trlist = bsObj.find('table',
                            class_='stdtable').find('tbody').find_all('tr')
    except:
        time.sleep(20)
        try:
            bsObj = mytemp.getObj(url)
            trlist = bsObj.find('table',
                                class_='stdtable').find('tbody').find_all('tr')
        except:
            time.sleep(20)
            bsObj = mytemp.getObj(url)
            trlist = bsObj.find('table',
                                class_='stdtable').find('tbody').find_all('tr')

    row = line + []
    for tr in trlist:
        tdlist3 = tr.find_all('td', class_='center')
        for td in tdlist3:
            row.append(td.get_text().replace(" ", ''))
        tdlist = tr.find_all('td', class_='left')
        for td in tdlist:
            row.append(td.get_text().replace(" ", ''))
    if len(row) == 31:
        row = row[:26] + ['', ''] + row[26:]
    print(row)
    return row
Code Example #7
def get_detail():
    f2 = open('wh_xyk_final.csv', 'w+', encoding='gb18030', newline='')
    csv_write2 = csv.writer(f2)
    for line in csv.reader(open('wh_xyk_link.csv', 'r', encoding='gb18030')):
        url = 'http://wh.bqqm.com' + line[1]
        bsObj = mytemp.getObj(url)
        phone = bsObj.find('div', class_='telli').find('span',
                                                       class_='p').get_text()
        row = [line[0], url, phone]
        print(row)
        csv_write2.writerow(row)
Code Example #8
File: zhongchou.py Project: braimp/pySpider-2
def getDetail(line):
    url = line[2]
    bsObj = mytemp.getObj(url)
    if bsObj == None:
        print(url)
        f3.write(url + '\n')  # was 'url', which logged the literal text rather than the link
        return
    try:
        target = bsObj.find(
            'div', class_='xqRatioText clearfix').find('b').get_text()
    except:
        print(url)
        f3.write(url + '\n')
        return

    concern = bsObj.find(
        'div', class_='xqDetailLeft siteImgBox').find('a').get_text()
    refresh = bsObj.find('li', {
        'data-scrollto': 'zxjzBox'
    }).find('b').get_text()
    comment = bsObj.find('li', {
        'data-scrollto': 'plOuterBox'
    }).find('b').get_text()
    supportTime = bsObj.find('li', {
        'data-scrollto': 'zczOuterBox'
    }).find('b').get_text()
    try:
        footlist = bsObj.find('div', class_='zcjeOuterBox').find_all(
            'div', class_='zcjeFooter')
    except:
        print(url)
        f3.write(url + '\n')
        return
    reTime = footlist[-1].find_all('b')[-1].get_text()
    if bsObj.find('div', {'id': 'xmxqBox'}).find('img') != None:
        pic = '有'
    else:
        pic = '无'
    if bsObj.find('div', class_='play-box') != None:
        video = '有'
    else:
        video = '无'

    row = line + [
        target, concern, refresh, comment, supportTime, reTime, pic, video
    ]
    print(row)
    csv_write2.writerow(row)
Code Example #9
File: dzdpJb.py Project: braimp/pySpider-2
def get_link():
    f1=open('dzdp_wedding.csv','w+',encoding='gb18030',newline='')
    csv_write1=csv.writer(f1)
    for i in range(1, 51):
        url = urlroot1 + str(i) + urlroot2
        bsObj=mytemp.getObj(url,False,cook)
        lilist=bsObj.find_all('div',class_='txt')
        print(len(lilist))
        for li in lilist:
            a=li.find('div',class_='tit').find('a')
            href=a.attrs['href']
            title=a.find('h4').get_text()
            row=[title,href]
            print(row)
            csv_write1.writerow(row)
Code Example #10
File: bj.py Project: braimp/pySpider-2
def getcity():
    f2 = open('city.csv', 'w+', encoding='gb18030', newline='')
    csv_write2 = csv.writer(f2)
    url = 'http://www.to8to.com/index.html'
    bsObj = mytemp.getObj(url, False, cook)
    divbox = bsObj.find('div', {'id': 'city_box'})
    citybox = divbox.find_all('div', class_='cs_zs')
    for box in citybox:
        print(box)
        citylist = box.find('div', class_='xzcs_dt').find_all('a')
        for city in citylist:
            href = city.attrs['href']
            cityname = city.get_text().strip()
            row = [cityname, href]
            print(row)
            csv_write2.writerow(row)
Code Example #11
File: tyn.py Project: braimp/pySpider-2
def getDetail(url):
    url='https://www.enf.com.cn'+url
    print(url)
    bsObj=mytemp.getObj(url,True)
    div=bsObj.find('div',class_='enf-company-profile-info-main pull-left')
    h1=div.find('h1',class_='blue-title').get_text().replace('\n','').strip()
    try:
        email=div.find('td',itemprop='email').find('a').get_text()
    except:
        email=''
    href=div.find('a',itemprop='url').attrs['title']
    tablelist=div.find('div',class_='enf-company-profile-info-main-spec position-relative').find_all('table')
    address=tablelist[-1].find_all('td')[1].get_text()
    row=[url,h1,email,href,address]
    csv_write.writerow(row)
    print(row)
Code Example #12
File: qingdao.py Project: braimp/pySpider-2
def main1():
    f=open('qd1.csv','w+',newline='',encoding='gb18030')
    csv_write=csv.writer(f)
    # 708 listing pages in total
    for i in range(0,708):
        newUrl=urlRoot+str(i)+'.html'
        print('Crawling page '+str(i)+'...')
        lilist=mytemp.getObj(newUrl,cookie).find('div',class_='post-list').find_all('li',class_='post-item clearfix')
        print(len(lilist))
        for li in lilist:
            a=li.find('h2').find_all('a')[-1]
            a1=li.find('span',class_='author').find_all('a')[-1]
            if 'href' not in a.attrs:
                print('position error')
            else:
                wr=[a['href'],a.get_text(),a1['href'],a1.get_text()]
                # print(wr)
                csv_write.writerow(wr)
Code Example #13
File: fsop.py Project: braimp/pySpider-2
def main1():
    f2 = open('up_info.csv', 'a+', encoding='gb18030', newline='')
    csv_write2 = csv.writer(f2)

    for line in open('fsop_link.txt', 'r', encoding='utf-8'):
        line = eval(line)
        print(line[3])
        url = 'http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?method=loadEnterpriseDetail&enterpriseId=' + line[
            3]
        bsObj = mytemp.getObj(url)
        # print(bsObj)
        # extract the argument of the $.trim(...) call embedded in the page's
        # script; the original pattern left the parentheses unescaped
        t = re.search(r'\$\.trim\((.+?)\)}', str(bsObj)).group(1)
        # print(t)
        # break
        divlist = bsObj.find('div', 'am-u-md-10').find_all(
            'div', class_='am-g am-margin-top')
        address = divlist[2].find(
            'div', class_='am-u-sm-8 am-u-md-8 am-u-end').get_text().replace(
                '\t', '').replace('\n', '').replace('\xa0',
                                                    '').replace('\r',
                                                                '').strip()
        expirated = divlist[3].find(
            'div', class_='am-u-sm-8 am-u-md-8 am-u-end').get_text().replace(
                '\t', '').replace('\n', '').replace('\r',
                                                    '').strip().split(' ')[0]
        scan = divlist[4].find('div', class_='am-u-sm-8 am-u-md-8 am-u-end')
        Scannedname = scan.get_text().replace('\t',
                                              '').replace('\n', '').replace(
                                                  '\r', '').strip()
        try:
            Scannedurl = 'http://fsop.caac.gov.cn' + scan.find(
                'a').attrs['href']
        except:
            Scannedurl = ''
        Limited = divlist[5].find(
            'div', class_='am-u-sm-8 am-u-md-8 am-u-end').get_text().replace(
                '\t', '').replace('\n', '').replace('\r', '').strip()
        row = [url, t] + line[:3] + [
            address, expirated, Scannedname, Scannedurl, Limited
        ]
        # print(row)
        csv_write2.writerow(row)
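
Note: a quick check of the corrected pattern against a made-up string (the real page embeds the value in a JavaScript $.trim(...) call; the sample below is hypothetical):

import re

sample = "var name = $.trim(某航空维修公司)};"
print(re.search(r'\$\.trim\((.+?)\)}', sample).group(1))  # 某航空维修公司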
Code Example #14
File: bj.py Project: braimp/pySpider-2
def getdetail():
    f1 = open('company.csv', 'w+', encoding='gb18030', newline='')
    csv_write1 = csv.writer(f1)
    for line in csv.reader(open('city.csv', 'r', encoding='gb18030')):
        time.sleep(3)
        city_name = line[0]
        print(city_name)
        if city_name == '昆明':
            continue
        url_root1 = line[1] + 'company/list_'
        i = 1
        while True:
            print(i)
            url = url_root1 + str(i) + '.html'
            bsObj = mytemp.getObj(url, False, cook)
            div1 = bsObj.find('div', class_='xgt_meitu_searchNone')
            if div1 != None:
                break  # "no results" marker: past the last page for this city

            div = bsObj.find('div', class_='default__company__list')
            try:
                lilist = div.find_all('li', class_='company-data ')
            except:

                print([city_name, i])
                break
            # print(len(lilist))
            for li in lilist:
                # print(li)
                href = li.find('a').attrs['href']
                phone = ''
                try:
                    phone = li.find(
                        'p', class_='company__phone').get_text().strip()
                except:
                    pass
                name = li.find('p', class_='company__name').find(
                    'span', class_='name').get_text().strip()
                row = [city_name, name, phone, href]
                print(row)
                csv_write1.writerow(row)
            i = i + 1
Code Example #15
def main1():
    # master list of genealogy (族谱) links
    f1 = open('zupu.txt', 'w+', encoding='utf-8')
    for i in range(1, 100):
        url = 'http://zupu.syshenshi.com/Index.aspx?pageindex=' + str(
            i) + '&infoname='
        bsObj = mytemp.getObj(url)
        trlist = bsObj.find('table',
                            class_='stdtable').find('tbody').find_all('tr')
        le = len(trlist)
        print(le)
        if le == 0:
            break
        for tr in trlist:
            td = tr.find('td')
            name = td.get_text()
            href = td.find('a').attrs['href']
            row = [name, href]
            print(row)
            f1.write(str(row) + '\n')
Code Example #16
def get_link():
    f1 = open('company_link.csv', 'w+', encoding='gb18030', newline='')
    csv_write = csv.writer(f1)
    for i in range(1, 51):
        if i == 1:
            url = 'https://www.atobo.com.cn/Companys/s-p26-s579/'
        else:
            url = 'https://www.atobo.com.cn/Companys/s-p26-s579-y' + str(
                i) + '/'

        bsObj = mytemp.getObj(url, False, cook)
        lilist = bsObj.find('div',
                            class_='product_contextlist bplist').find_all(
                                'li', class_='product_box')
        for li in lilist:
            attrS = li.find('li', class_='pp_name').find('a').attrs
            title = attrS['title']
            link = attrS['href']
            row = [title, link]
            print(row)
            csv_write.writerow(row)
Code Example #17
File: jiangshi.py Project: braimp/pySpider-2
from bs4 import BeautifulSoup
import requests
import csv
import sys
from urllib.error import HTTPError
sys.path.append("..")
import mytemp

url = 'http://www.jiangshi.org/search/kw_NULL_order_1_costmin_0_costmax_0_area_0_page_1.html'

mytemp.getObj(url)
Code Example #18
f2 = open('company_final.csv', 'a+', encoding='gb18030', newline='')
csv_write2 = csv.writer(f2)
f3 = open('company_error1.csv', 'w+', encoding='gb18030', newline='')
csv_write3 = csv.writer(f3)
for line in csv.reader(open('company_error.csv', 'r', encoding='gb18030')):
    url = line[1]
    print(url)
    bsObj = mytemp.getObj(url, True, cook).find('div', class_='card-context')
    try:
        ullist = bsObj.find_all('ul')
    except:
        time.sleep(2)
        bsObj = mytemp.getObj(url, True, cook).find('div',
                                                    class_='card-context')
        try:
            ullist = bsObj.find_all('ul')
        except:
            csv_write3.writerow(line)
            continue
    name = ''
    tele = ''
    for ul in ullist:
        text = ul.get_text()
Code Example #19
File: toy.py Project: braimp/pySpider-2
import csv
import sys
from urllib.error import HTTPError

sys.path.append("..")
import mytemp

# lt=['https://jinbaohong.1688.com/page/offerlist.htm?spm=a2615.7691456.newlist.98.7feb58b2CWNKTL&showType=windows&tradenumFilter=false&sampleFilter=false&sellerRecommendFilter=false&videoFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=wangpu_score&pageNum=',]
# url_root1='https://dmqxtoy.1688.com/page/offerlist_96911367.htm?spm=a2615.7691456.newlist.101.29ea157aMi6TDa&tradenumFilter=false&sampleFilter=false&sellerRecommendFilter=false&videoFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=wangpu_score&pageNum='
# url_root1='https://dmqxtoy1480648284896.1688.com/page/offerlist_90101145.htm?tradenumFilter=false&sampleFilter=false&sellerRecommendFilter=false&videoFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=wangpu_score&pageNum='
url_root1 = 'https://dmqxtoy.1688.com/page/offerlist.htm?spm=a2615.7691456.newlist.167.67e35eeaiIPXBE&tradenumFilter=false&sampleFilter=false&sellerRecommendFilter=false&videoFilter=false&mixFilter=false&privateFilter=false&mobileOfferFilter=%24mobileOfferFilter&groupFilter=false&sortType=wangpu_score&pageNum='
cook = '_csrf_token=1537943057961; cna=h9/yEVZzcBsCAXrNB8OEFnAP; JSESSIONID=t2xYdgd-GQSaYYwNgVts1vi3uB-8KTUj4R-xUw9; cookie2=13979f139a3eb43841d37d6bc89a466b; hng=CN%7Czh-CN%7CCNY%7C156; t=8464a4877acc365c142612d5c3535d73; _tb_token_=e76e553eeabb5; __cn_logon__=false; _tmp_ck_0=8DDX%2FdG7Qp4rD7%2BxIUDnt9IQQps0gXg0HxVorQnE1U9h1%2BJEDnSwTlDtAa6rnpNYZgyqZ04NLkFhoSZa62xDsmigmkh2C96Zwc3vUnWDAhJ43V6s20UdSFlrW8%2FNWLYvHlj66I9JrtQip0ZR%2BRGLKKCEOXBffuleBNW%2FlS0fLs8OnUFoyVr41%2ByFFe5lUUpWm0PqHiBd%2BiTQ%2BKt8xZ%2B19oW%2B3O9I5Fgowiwd5ki0VANNbNmhmFrqb%2Fjp9ubGsMV6YsxqeJspp5sJ2OoGdhC1HaX3ei%2BzZp7K%2F2ZEmTBCHj6Lr%2Frb4LupMxN1ER%2FNrE1Rayz%2Bf5KJZR%2FhvH7p5zFUPNE1n6nNZD6KoWyAUEbhjPQYWxDmAE23JAQ0E6hEqETG; UM_distinctid=166148de0821ae-0ff23d8e76a24-b353461-100200-166148de08414f; alicnweb=touch_tb_at%3D1537948661631; isg=BEBALzn_D6g1YfOsTPGJT6ajEc7YuyZOwdqqALrREdvqNeBfYt1bIxZPSN1Qptxr'

f1 = open('dmqxtoy_f.csv', 'w+', newline='', encoding='gb18030')
csv_write = csv.writer(f1)
page_error = []
for i in range(1, 10000):
    print(i)
    bs_obj = mytemp.getObj(url_root1 + str(i), True, cook)

    common = bs_obj.find('div', class_='common-column-230')
    if common == None:
        try:
            # a "no content" page marks the end of the listing
            common = bs_obj.find('div', class_='no-content').get_text()
            print(page_error)
            break
        except:
            page_error.append(i)
            continue  # this page failed to load; skip it rather than crash below
    li_list = common.find('ul', class_='offer-list-row').find_all(
        'li', class_='offer-list-row-offer')
    for li in li_list:
        attrS = li.find('a', class_='title-link').attrs
        title = attrS['title']
        link = attrS['href']
Code Example #20
File: attractions.py Project: braimp/pySpider-2
def get_gps():
    fRead = open('attractions_msg_final.csv', 'r', encoding='gb18030')
    f_attractions_msg = open('attractions_msg_final1.csv',
                             'a+',
                             newline='',
                             encoding='gb18030')
    csv_write = csv.writer(f_attractions_msg)
    headrow = [
        '城市', '城市链接', '景点', '景点链接', '缩略图链接', '评分', '简介', '百度百科', '开放时间',
        '门票信息', '游玩时长', '最佳季节', '官网', '电话', '地址', '交通', 'gps_链接'
    ]
    # csv_write.writerow(headrow)
    for line in csv.reader(fRead):
        for i in range(8, 12):
            t = line[i]
            if t != '':
                # these fields are stored as their text repeated twice; keep the first half
                line[i] = t[:int(len(t) / 2)]

        if line[14] != '':
            continue
        line = line[:16] + ['']
        newUrl = line[3]
        print(newUrl)
        for i in range(3):
            bsObj = mytemp.getObj(newUrl)
            # req=util.build_proxy_request(newUrl)
            # bsObj=BeautifulSoup(req.text,'html.parser')
            div = bsObj.find('div', class_='c-list')
            if div != None:
                break
        if div == None:
            csv_write.writerow(line)
            continue
        divlist2 = div.find_all('div', class_='c-gap-inner-top c-flexbox')
        for d in divlist2:
            dlist = d.find_all('div')
            title = dlist[0].get_text().strip()
            if title != '电话':
                continue
            try:
                text = d.find(
                    'div', class_='c-span12').find('a').attrs['href'].replace(
                        'Tel:', '')
            except:
                text = d.find('div', class_='c-span12').get_text().strip()
            line[13] = '_' + text
        try:
            gps_link = bsObj.find(
                'div',
                class_='c-img c-img-item c-img-v').find('img').attrs['src']

        except:
            gps_link = ''
            print('gps_link_error')
        try:
            address = bsObj.find('div', class_='map-address c-gap-top').find(
                'p', class_='map-address-text').get_text()

        except:
            address = ''
            print('address_error')
        try:
            traffic = bsObj.find('div', class_='map-traffic c-gap-top').find(
                'div',
                class_='map-traffic-container').get_text().replace('交通',
                                                                   '').strip()

        except:
            traffic = ''
            print('traffic_error')
        if line[15] == '':
            line[15] = traffic
        line[14] = address
        line[16] = gps_link
        csv_write.writerow(line)
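
Note: the halving at the top of get_gps assumes fields 8-11 always hold their text twice in a row. A slightly safer variant (undouble is a hypothetical helper, not in attractions.py) truncates only when the two halves actually match:

def undouble(t):
    # keep the first half only when the value really is the text repeated twice
    half = len(t) // 2
    if t and t[:half] == t[half:]:
        return t[:half]
    return t

The loop body would then be line[i] = undouble(line[i]).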
Code Example #21
File: attractions.py Project: braimp/pySpider-2
def get_error_link1():
    # # names=['城市','城市链接','景点','景点链接','缩略图链接','评分','简介'],
    # data1=pd.read_csv('attractions_msg_first.csv',header=None,encoding='gb18030')
    # data2=pd.read_table('attractions_get_error_link1.txt',header=None,encoding='gb18030')
    # data=pd.merge(data1,data2,left_on=3,right_on=0)
    # # find('div',class_='c-color c-gap-bottom c-line-clamp3')
    # data.to_csv('error.csv', sep='*',encoding='gb18030')
    f_attractions_msg = open('attractions_msg_final.csv',
                             'a+',
                             newline='',
                             encoding='gb18030')
    csv_write = csv.writer(f_attractions_msg)
    headrow = [
        '城市', '城市链接', '景点', '景点链接', '缩略图链接', '评分', '简介', '百度百科', '开放时间',
        '门票信息', '游玩时长', '最佳季节', '官网', '电话', '地址'
    ]
    f_error3 = open('error3.csv', 'w+', encoding='gb18030', newline='')
    f_error2 = open('error4.txt', 'w+')
    # was 'csv.write1 = ...', which stored the writer as an attribute on the csv module
    csv_write1 = csv.writer(f_error3)

    data1 = pd.read_csv('attractions_msg_first.csv',
                        header=None,
                        encoding='gb18030')
    for attractions_link in open('attractions_get_error_link1.txt', 'r'):
        error = 0
        r = data1[data1[3] == attractions_link.strip()].values[0][:6]
        row = list(r) + ['' for i in range(20)]
        newUrl = attractions_link
        # newUrl='http://m.baidu.com/sf?openapi=1&dspName=iphone&from_sf=1&pd=jingdian_detail&resource_id=4616&word=%E9%9D%92%E6%B5%B7%E8%97%8F%E6%96%87%E5%8C%96%E9%A6%86&title=%E9%9D%92%E6%B5%B7%E8%97%8F%E6%96%87%E5%8C%96%E9%A6%86&lid=9159143487664320820&ms=1&frsrcid=31132&frorder=6'
        print(newUrl)
        for i in range(3):
            try:
                bsObj = mytemp.getObj(newUrl)
                # req=util.build_proxy_request(newUrl)
                # bsObj=BeautifulSoup(req.text,'html.parser')
                div = bsObj.find('div', class_='c-list')
                try:
                    bdbk = div.find(
                        'div',
                        class_='c-list-item c-line-bottom c-list-border'
                    ).get_text().replace('百度百科', '')
                except:
                    try:
                        bdbk = div.find(
                            'div', class_='c-color c-gap-bottom c-line-clamp3'
                        ).get_text().replace('百度百科', '')
                    except:
                        bdbk = ''
                break
            except:
                time.sleep(2)
                # pass

            if i == 2:  # was 'i == 4', never true since range(3) stops at i == 2
                print('error1')
                error = 1
                csv_write1.writerow(r)
        if error == 1:
            continue
        try:
            divlist1 = div.find_all(
                'div',
                class_=
                'c-gap-inner-top c-flexbox c-line-bottom c-gap-inner-bottom')
            divlist2 = div.find_all('div', class_='c-gap-inner-top c-flexbox')
            # print(divlist2)
            for d in divlist1:
                dlist = d.find_all('div')
                title = dlist[0].get_text().strip()
                text = d.find('div', class_='c-span12').get_text().strip()
                try:
                    t = headrow.index(title)
                    row[t] = text
                except:
                    headrow.append(title)
                    t = headrow.index(title)
                    row[t] = text
            for d in divlist2:
                dlist = d.find_all('div')
                title = dlist[0].get_text().strip()
                try:
                    text = d.find(
                        'div',
                        class_='c-span12').find('a').attrs['href'].replace(
                            'Tel:', '')
                except:
                    text = d.find('div', class_='c-span12').get_text().strip()
                try:
                    t = headrow.index(title)
                    # print(t)
                    row[t] = text
                except:
                    headrow.append(title)
                    print(headrow)
                    t = headrow.index(title)
                    row[t] = text
        except:
            print('error2')
            f_error2.write(str(r) + '\n')  # r is an array slice; str() avoids a write() TypeError
        # print(row)
        csv_write.writerow(row)
        # break
    print(headrow)
Code Example #22
File: attractions.py Project: braimp/pySpider-2
def get_detail():
    fRead = open('attractions_msg_first.csv', 'r', encoding='gb18030')
    f_attractions_msg = open('attractions_msg_final.csv',
                             'a+',
                             newline='',
                             encoding='gb18030')
    csv_write = csv.writer(f_attractions_msg)
    headrow = [
        '城市', '城市链接', '景点', '景点链接', '缩略图链接', '评分', '简介', '百度百科', '开放时间',
        '门票信息', '游玩时长', '最佳季节', '官网', '电话', '地址'
    ]
    # csv_write.writerow(headrow)
    # f_error1=open('attractions_get_error_link1.csv','w+')
    # csv_write1=csv.writer(f_error1)
    # f_error2=open('attractions_get_error_link2.csv','w+')
    # csv_write1=csv.writer(f_error2)

    for line in csv.reader(fRead):
        error = 0
        line = line[:7]
        # print(line)
        row = line + ['' for i in range(14)]
        # try:
        newUrl = line[3]
        # newUrl='http://m.baidu.com/sf?openapi=1&dspName=iphone&from_sf=1&pd=jingdian_detail&resource_id=4616&word=%E9%9D%92%E6%B5%B7%E8%97%8F%E6%96%87%E5%8C%96%E9%A6%86&title=%E9%9D%92%E6%B5%B7%E8%97%8F%E6%96%87%E5%8C%96%E9%A6%86&lid=9159143487664320820&ms=1&frsrcid=31132&frorder=6'
        print(line[0], line[2])
        for i in range(3):
            bsObj = mytemp.getObj(newUrl)
            # req=util.build_proxy_request(newUrl)
            # bsObj=BeautifulSoup(req.text,'html.parser')
            div = bsObj.find('div', class_='c-list')
            if div != None:
                break
        if div == None:
            csv_write.writerow(line)
            continue
        try:
            bdbk = div.find('div',
                            class_='c-list-item c-line-bottom c-list-border'
                            ).get_text().replace('百度百科', '')
        except:
            try:
                bdbk = div.find('div',
                                class_='c-color c-gap-bottom c-line-clamp3'
                                ).get_text().replace('百度百科', '')
            except:
                bdbk = ''
        row[7] = bdbk

        divlist1 = div.find_all(
            'div',
            class_='c-gap-inner-top c-flexbox c-line-bottom c-gap-inner-bottom'
        )
        divlist2 = div.find_all('div', class_='c-gap-inner-top c-flexbox')
        for d in divlist1:
            dlist = d.find_all('div')
            title = dlist[0].get_text().strip()
            text = d.find('div', class_='c-span12').get_text().strip()
            try:
                t = headrow.index(title)
                row[t] = text
            except:
                headrow.append(title)
                t = headrow.index(title)
                row[t] = text
        for d in divlist2:
            dlist = d.find_all('div')
            title = dlist[0].get_text().strip()
            try:
                text = d.find(
                    'div', class_='c-span12').find('a').attrs['href'].replace(
                        'Tel:', '')
            except:
                text = d.find('div', class_='c-span12').get_text().strip()
            try:
                t = headrow.index(title)
                # print(t)
                row[t] = text
            except:
                headrow.append(title)
                print(headrow)
                t = headrow.index(title)
                row[t] = text
        csv_write.writerow(row)
        # break
    print(headrow)
Code Example #23
import requests
import csv
import time
import json
import re
import sys
from urllib.error import HTTPError
sys.path.append("..")
import mytemp

url='https://huggies.tmall.com/category-1038993307.htm?spm=a1z10.3-b-s.w4011-14533183956.292.787976ceiOvHAT&type=p&newHeader_b=s_from&searcy_type=item&from=.shop.pc_2_searchbutton&catId=1038993307&keyword=%BA%C3%C6%E6&pageNo=2&tsearch=y#anchor'
cook='cna=h9/yEVZzcBsCAXrNB8OEFnAP; hng=CN%7Czh-CN%7CCNY%7C156; _m_h5_tk=64705fff8ba6e9dd50853f5f5b9929cb_1539873398765; _m_h5_tk_enc=5301f3c7015e41bb1026840101c64f24; t=8464a4877acc365c142612d5c3535d73; _tb_token_=753361be35e5a; cookie2=1a3195a39b1ad24bf437640212c57659; pnm_cku822=; cq=ccp%3D1; isg=BFRUCI2p88yHkGdJey2xbUrjJZLMozARvabeTO41QF9Z2fUjFr4_J6Mf3ZFkIbDv'
# header is assembled but never used below: getObj is called with the raw cookie string
header={
    'Cookie':cook,
    # 'Referer': 'https://detail.tmall.com/item.htm?spm=a1z10.3-b-s.w4011-14466283798.50.43547440H0DfBt&id=40545566432&rn=6fe19d846dcf80351798b26fa5bfdf95&abbucket=20&skuId=3854687709703'
}
bsObj=mytemp.getObj(url,False,cook)
print(bsObj)
# print(bsObj.find('div',class_='J_TItems'))