Example #1
def getObj(url, proxyIs=False, cook=None, encode='utf-8'):
    # Fetch url (optionally through a proxy and/or with cookie-based headers)
    # and return a BeautifulSoup object; retry up to 5 times on HTTPError.
    html = None
    for i in range(5):
        try:
            if cook is None:
                if not proxyIs:
                    html = requests.get(url, timeout=15)
                else:
                    html = util.build_proxy_request(url)
            else:
                if not proxyIs:
                    html = requests.get(url,
                                        headers=getHeaders(cook),
                                        verify=False)
                else:
                    header = getHeaders(cook)
                    html = util.build_proxy_request(url, None, header, None)
            html.encoding = encode
            break
        except HTTPError:
            time.sleep(10)
            print('HTTPError, retrying')
    if html is None:
        return None
    try:
        bsObj = BeautifulSoup(html.text, 'html.parser')
    except AttributeError:
        print('AttributeError')
        return None
    return bsObj
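
The getHeaders helper referenced above is not shown in this listing; a minimal sketch of what it might look like, assuming cook is a raw Cookie header string captured from a browser session:

def getHeaders(cook):
    # Hypothetical sketch only; the real getHeaders is not included in these examples.
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Cookie': cook,
    }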
Example #2
def main2():
    root2 = 'http://www.mafengwo.cn'
    f_error1 = open('logtime.txt', 'a+')   # travelogues with no time info
    f_error2 = open('logplace.txt', 'a+')  # travelogues with no attractions
    f1 = open('qd1.csv', 'r', encoding='gb18030', newline='')
    f = open('qd2.csv', 'a+', newline='', encoding='gb18030')
    csv_write = csv.writer(f)
    # Strip NUL bytes from each input line before csv.reader parses it.
    for info in csv.reader(line.replace('\0', '') for line in f1):
        newUrl = root2 + info[0]
        html = util.build_proxy_request(newUrl)
        obj = BeautifulSoup(html.text, 'html.parser')
        place = set()
        for p in obj.find_all('a', class_='_j_anchor'):
            place.add(p.get_text().strip())
        if not place:
            print(newUrl + ' has no attractions')
            f_error2.write(newUrl)
            continue
        try:
            time = obj.find('li', class_='time').get_text().split('/')[1]
            day = obj.find('li', class_='day').get_text().split('/')[1]
        except (AttributeError, IndexError):
            # find() returned None or the text had no '/' part
            print(newUrl + ' has no time info')
            f_error1.write(newUrl)
            continue

        row = [newUrl, info[1], time, day, ' '.join(place)]
        print(row)
        csv_write.writerow(row)
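
The generator expression passed to csv.reader above strips NUL bytes, which would otherwise make the csv module raise an error on malformed lines. The same pattern in isolation (the file name is a placeholder):

import csv

with open('example.csv', 'r', encoding='gb18030', newline='') as src:
    # Strip NUL bytes from each line before csv.reader parses it.
    for row in csv.reader(line.replace('\0', '') for line in src):
        print(row)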
Example #3
def main3():
    # Handling of the entries without time info
    # f_error3 = open('logconfplace.txt', 'w+')
    # f_error4 = open('notime.txt', 'w+')
    f1 = open('logplace.txt', 'r', encoding='gb18030', newline='')
    f = open('qd2.csv', 'a+', newline='', encoding='gb18030')
    csv_write = csv.writer(f)
    # The error log has no separators, so split the whole text on 'http'.
    text = f1.read()
    textlist = text.split('http')
    # print(textlist)
    print(len(textlist))
    for i in range(1, len(textlist)):
        newUrl = 'http' + textlist[i]
        # newUrl = 'http://www.mafengwo.cn/i/1373402.html'
        # print(newUrl)
        html = util.build_proxy_request(newUrl)
        obj = BeautifulSoup(html.text, 'html.parser')
        try:
            time = obj.find('li', class_='time').get_text().split('/')[1]
            day = obj.find('li', class_='day').get_text().split('/')[1]
        except (AttributeError, IndexError):
            time = ''
            day = ''
        place = set()
        for p in obj.find_all('a', class_='_j_anchor'):
            place.add(p.get_text().strip())
        for p in obj.find_all('a', class_='_j_keyword_list'):
            place.add(p.get_text().strip())
        if not place:
            print(newUrl + ' has no attractions')
            # f_error3.write(newUrl)
            continue
        row = [newUrl, time, day, ' '.join(place)]
        print(row)
        csv_write.writerow(row)
Example #4
def get_city_again():
    # Re-crawl the URLs listed in city_error1.txt and append results to city_msg.csv.
    f_city_msg = open('city_msg.csv', 'a+', newline='', encoding='gb18030')
    csv_write = csv.writer(f_city_msg)
    f_error = open('city_error2.txt', 'w+')
    for line in open('city_error1.txt', 'r'):
        # print(line)
        try:
            newUrl = line.strip()
            city = re.search(r'word=(.+?)&', newUrl).group(1)
            req = util.build_proxy_request(newUrl)
            bsObj = BeautifulSoup(req.text, 'html.parser')
            div = bsObj.find('div', class_='ly-city')
            image_link = div.find(
                'div',
                class_='c-row-tile c-gap-bottom ly-city-info-position').find(
                    'img').attrs['src']
            try:
                suit_season = div.find(
                    'div', class_='c-line-clamp1 c-span6').get_text().replace(
                        '适宜季节:', '').replace('建议游玩:', '')
            except AttributeError:
                suit_season = ''
            try:
                bdbk = div.find('span', class_='c-color').get_text()
            except AttributeError:
                bdbk = div.find('div',
                                class_='c-color c-line-clamp3').get_text()
            row = [city, newUrl, image_link, suit_season, bdbk]
            print(row)
            csv_write.writerow(row)
        except Exception:
            print(newUrl)
            f_error.write(newUrl + '\n')
Example #5
def get_city_mag():
    # City overview page URL on Baidu's mobile search, split around the city name
    urlroot1 = 'https://www.baidu.com/sf?openapi=1&dspName=iphone&from_sf=1&pd=city&ms=1&hide=1&apitn=tangram&top=%7B%22sfhs%22%3A2%7D&tfr=redis&resource_id=4324&word='
    urlroot2 = '&title=%E7%9B%AE%E7%9A%84%E5%9C%B0%E6%94%BB%E7%95%A5&city_name=&frsrcid=&frorder=&lid=&ext=%7B%22sf_tab_name%22%3A%22%E6%A6%82%E8%A7%88%22%7D&sa=sf_tab1'

    fRead = open('city.csv', 'r', encoding='gb18030')
    f_city_msg = open('city_msg.csv', 'w+', newline='', encoding='gb18030')
    f_error = open('city_error.txt', 'w+')
    csv_write = csv.writer(f_city_msg)
    # Columns: city, link, thumbnail link, best season, Baidu Baike summary.
    csv_write.writerow(['城市', '链接', '缩略图链接', '适宜季节', '百度百科', '', ''])
    for line in csv.reader(fRead):
        try:
            newUrl = urlroot1 + line[0] + urlroot2
            req = util.build_proxy_request(newUrl)
            bsObj = BeautifulSoup(req.text, 'html.parser')
            div = bsObj.find('div', class_='ly-city')
            image_link = div.find(
                'div',
                class_='c-row-tile c-gap-bottom ly-city-info-position').find(
                    'img').attrs['src']
            suit_season = div.find(
                'div', class_='c-line-clamp1 c-span6').get_text().replace(
                    '适宜季节:', '')
            bdbk = div.find('span', class_='c-color').get_text()
            row = [line[0], newUrl, image_link, suit_season, bdbk]
            print(row)
            csv_write.writerow(row)
        except Exception:
            print(newUrl)
            f_error.write(newUrl + '\n')
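
get_city_mag builds each request URL by concatenating the city name from city.csv between the two fragments. A sketch of that assembly with placeholder values; if city.csv stores unencoded names, percent-encoding them first may be necessary:

from urllib.parse import quote

urlroot1 = 'https://www.baidu.com/sf?...&word='  # placeholder stand-ins for the
urlroot2 = '&title=...&sa=sf_tab1'               # fragments defined in get_city_mag
city = '北京'                                    # placeholder city name from city.csv
newUrl = urlroot1 + quote(city) + urlroot2
print(newUrl)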
Example #6
def main1():
    f = open('qdtime.csv', 'w+', newline='', encoding='gb18030')
    csv_write = csv.writer(f)
    # 708
    for i in range(0, 720):
        newUrl = urlRoot + str(i) + '.html'
        # print('Crawling page ' + str(i) + '...')
        lilist = []
        # Retry up to 5 times per list page.
        for j in range(5):
            try:
                html = util.build_proxy_request(newUrl)
                obj = BeautifulSoup(html.text, 'html.parser')
                lilist = obj.find('div', class_='post-list').find_all(
                    'li', class_='post-item clearfix')
                break
            except Exception:
                # print('Attempt ' + str(j))
                if j == 4:
                    print('Could not fetch ' + newUrl)
                # time.sleep(5)

        print(len(lilist))
        for li in lilist:
            a = li.find('h2').find_all('a')[-1]
            time = li.find('span', class_='comment-date').get_text().split(' ')[0]
            if 'href' not in a.attrs:
                print('position error')
            else:
                wr = [a['href'], time]
                # print(wr)
                csv_write.writerow(wr)
Example #7
def getlink():
    f = open('ebay_toys.csv', 'w+', newline='', encoding='gb18030')
    csv_write = csv.writer(f)
    f1 = open('log.txt', 'w+')
    for i in range(1, 100):
        newUrl = url + str(i)
        lilist = []
        for j in range(5):
            try:
                req = util.build_proxy_request(newUrl)
                bsObj = BeautifulSoup(req.text, 'html.parser')
                lilist = bsObj.find(
                    'ul', class_='b-list__items_nofooter').find_all('li')
                break
            except Exception:
                pass
            if j == 4:
                # this page could not be fetched after 5 attempts
                f1.writelines(str(i) + '\n')
                lilist = []

        for li in lilist:
            name = ''
            the_url = ''
            watch_sold = ''
            location = ''
            try:
                name = li.find('h3', class_='s-item__title').get_text()
                the_url = li.find('a', class_='s-item__link').attrs['href']
            except (AttributeError, KeyError):
                pass
            try:
                watch_sold = li.find('span', class_='NEGATIVE').get_text()
            except AttributeError:
                pass
            try:
                location = li.find(
                    'span',
                    class_='s-item__location s-item__itemLocation').get_text()
            except AttributeError:
                pass
            row = [name, the_url, watch_sold, location]
            print(row)
            csv_write.writerow(row)
Example #8
def getAttractions():
    # Attractions page URL on Baidu's mobile search, split around the city name
    urlroot3 = 'https://www.baidu.com/sf?openapi=1&dspName=iphone&from_sf=1&pd=city&ms=1&hide=1&apitn=tangram&top=%7B%22sfhs%22%3A2%7D&tfr=redis&resource_id=4336&word='
    urlroot4 = '&title=%E7%9B%AE%E7%9A%84%E5%9C%B0%E6%94%BB%E7%95%A5&city_name=&frsrcid=&frorder=&lid=&ext=%7B%22sf_tab_name%22%3A%22%E6%99%AF%E7%82%B9%22%7D&sa=sf_tab1'
    fRead = open('city.csv', 'r', encoding='gb18030')
    f_attractions_msg = open('attractions_msg_first.csv',
                             'w+',
                             newline='',
                             encoding='gb18030')
    f_error = open('attractions_get_error.txt', 'w+')
    csv_write = csv.writer(f_attractions_msg)
    # Columns: city, city link, attraction, attraction link, thumbnail link, score, introduction.
    csv_write.writerow(
        ['城市', '城市链接', '景点', '景点链接', '缩略图链接', '评分', '简介', '', ''])
    for line in csv.reader(fRead):
        rowlist = []
        try:
            newUrl = urlroot3 + line[0] + urlroot4
            req = util.build_proxy_request(newUrl)
            bsObj = BeautifulSoup(req.text, 'html.parser')
            divlist = bsObj.find(
                'div', class_='sfc-ly-scene-list-wrap').find_all(
                    'div',
                    class_='sfc-ly-scene-list-item c-container WA_LOG_SF')
            for div in divlist:
                name = div.find(
                    'span', class_='sfc-ly-scene-list-item-name').get_text()
                link = div.attrs['data-href']
                img_link = div.find(
                    'div', class_='c-img c-img-z').find('img').attrs['src']
                score = div.find(
                    'span', class_='sfc-ly-scene-list-item-score').get_text()
                introduction = div.find(
                    'p', class_='c-line-clamp2 sfc-ly-scene-list-item-desc'
                ).get_text()
                row = [
                    line[0], newUrl, name, link, img_link, score, introduction
                ]
                rowlist.append(row)
            for r in rowlist:
                csv_write.writerow(r)
            print(line[0] + ' written')
        except Exception:
            print(newUrl)
            f_error.write(newUrl + '\n')
Example #9
def getObj(url, proxyIs=False, header=None, encode='utf-8'):
    # Fetch url directly or through a proxy and return a BeautifulSoup
    # object; retry up to 5 times on HTTPError.
    html = None
    for i in range(5):
        try:
            if not proxyIs:
                html = requests.get(url, headers=header)
            else:
                html = util.build_proxy_request(url)
            html.encoding = encode
            break
        except HTTPError:
            # time.sleep(3)
            print('HTTPError, retrying')
    if html is None:
        return None
    try:
        bsObj = BeautifulSoup(html.text, 'html.parser')
    except AttributeError:
        print('AttributeError')
        return None
    return bsObj
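
A minimal usage sketch for this variant of getObj; the URL and header values are placeholders:

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
bsObj = getObj('http://www.mafengwo.cn', proxyIs=False, header=headers)
if bsObj is not None:
    print(bsObj.title)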
Example #10
def crawl(url):
    f3 = open('logrwid.txt', 'w+')
    review = []
    body = None
    for j in range(5):
        try:
            req = util.build_proxy_request(url)
            bsObj = BeautifulSoup(req.text, 'html.parser')
            # print(bsObj)
            body = bsObj.find('div', {'id': 'BottomPanelDF'})
            break
        except Exception:
            pass
        if j == 4:
            # this page could not be fetched after 5 attempts
            f3.writelines(url + '\n')
    try:
        divlist = body.find('div', {
            'id': 'rwid'
        }).find_all('div', class_=' ebay-review-section')
    except AttributeError:
        return []

    for div in divlist:
        title = ''
        content = ''
        try:
            title = div.find(
                'p', class_='review-item-title wrap-spaces').get_text()
        except AttributeError:
            pass
        try:
            content = div.find(
                'p', class_='review-item-content wrap-spaces').get_text()
        except AttributeError:
            pass
        review.append(title + ':::' + content)
    return review
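
crawl returns one 'title:::content' string per review; a minimal usage sketch with a placeholder item URL:

reviews = crawl('https://www.ebay.com/itm/123456789')
for r in reviews:
    title, _, content = r.partition(':::')
    print(title, '->', content)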