Example #1
import jd_spider      # project-local spider module
import html_analysis  # project-local HTML parsing helpers


def get_urls(url, file_name):
    spider = jd_spider.getSpider()
    html_data = spider.get_html(url)
    # Total number of result pages in this listing.
    page_num = int(html_analysis.get_page_count(html_data))
    out_file = open(file_name, "w", encoding='utf-8')
    for i in range(1, page_num + 1):
        # Fetch each results page and collect the item URLs on it.
        html_data = spider.get_html(url + "&page=" + str(i))
        url_list = html_analysis.get_items_url(html_data)
        print('page:%s' % i)
        print(url_list)
        for item_url in url_list:  # was also named i, shadowing the page counter
            out_file.write(item_url + '\n')
    out_file.close()
    del_duplicate(file_name)  # project helper; see the sketch below
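
The del_duplicate helper is defined elsewhere in this project. A minimal sketch of what it presumably does, assuming one URL per line and that the original order should be preserved:

def del_duplicate(file_name):
    # Read every line, keep only the first occurrence of each, rewrite the file.
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    seen = set()
    with open(file_name, 'w', encoding='utf-8') as f:
        for line in lines:
            if line not in seen:
                seen.add(line)
                f.write(line)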
Example #2
import jd_spider  # project-local spider module


def crawl_rates(file_path):
    unsolved_file = file_path + 'procedure_files/unsolved_skus.txt'
    solved_file = file_path + 'procedure_files/solved_skus.txt'
    # Drop SKUs already processed in a previous run (project helper;
    # see the sketch below).
    del_solved_item(unsolved_file, solved_file)

    in_file = open(unsolved_file, "r", encoding='utf-8')
    count = 1
    for each_line in in_file:
        sku = each_line.strip("\n")
        if len(sku) <= 0:
            break
        print(count)
        count += 1
        spider = jd_spider.getSpider()
        # Fetch and store the rating data for this SKU.
        spider.get_rate(file_path, sku)
    in_file.close()
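
del_solved_item is likewise a project helper, shared with Example #4. A sketch under the assumption that it rewrites the unsolved file without the entries already listed in the solved file:

def del_solved_item(unsolved_file, solved_file):
    # Collect already-solved entries, then rewrite the unsolved file without them.
    with open(solved_file, 'r', encoding='utf-8') as f:
        solved = {line.strip('\n') for line in f}
    with open(unsolved_file, 'r', encoding='utf-8') as f:
        remaining = [line for line in f if line.strip('\n') not in solved]
    with open(unsolved_file, 'w', encoding='utf-8') as f:
        f.writelines(remaining)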
Example #3
import re

from bs4 import BeautifulSoup as bs

import jd_spider  # project-local spider module


def get_shop_info(sku, shop_list):
    try:
        url = 'https://item.m.jd.com/product/' + sku + '.html'
        spider = jd_spider.getSpider()
        html_data = spider.get_html(url)
        soup = bs(html_data, 'html.parser')
        scripts = soup.find_all('script')
        found = 0
        shop_id = ''
        # Scan the inline scripts for a 'shopId' field and cut out its value.
        for script in scripts:
            temp = str(script)
            if found == 1:
                break
            index = temp.find('shopId')
            if index >= 0:
                index2 = -1
                for j in range(index, index + 20):
                    if temp[j] == ',':
                        index2 = j
                        found = 1
                        break
                if index2 < 0:
                    continue  # no delimiter in range; try the next script
                shop_id = temp[index + 8:index2]  # skip past the key and delimiter
                if shop_id in shop_list:
                    return shop_list[shop_id]  # shop already cached
        if len(shop_id) == 0:
            return -1
        # Fetch the shop page and pull out its name and follower count.
        url = 'https://shop.m.jd.com/?shopId=' + shop_id
        html_data = spider.get_html(url)
        soup = bs(html_data, 'html.parser')
        div = soup.find('div', class_='cell shop-info')
        shop_name = div.find('span', class_='ui-flex shop-name').find('em').get_text()
        follow_num = div.find('span', class_='ui-flex shop-other').find('em').get_text()
        count = float(re.findall(r"\d+\.?\d*", follow_num)[0])
        if follow_num.find('万') > 0:  # '万' means ten thousand
            count = count * 10000
        count = int(count)
        return shop_id, shop_name, str(count), sku
    except Exception as e:
        print('get shop_info fail, err: ' + str(e))
        return -1
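
A usage sketch showing how the shop_list cache fits in; the SKU is made up, and storing the result back into the cache is an assumption about the caller:

shop_cache = {}
info = get_shop_info('100012345678', shop_cache)  # hypothetical SKU
if info != -1:
    shop_id, shop_name, follower_count, sku = info
    shop_cache[shop_id] = info  # later calls for this shop return the cached tuple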
Example #4
import jd_spider      # project-local spider module
import html_analysis  # project-local HTML parsing helpers


def get_some_comments(file_path):
    # Drop URLs already processed in a previous run (project helper,
    # sketched under Example #2).
    del_solved_item(file_path + 'procedure_files/unsolved_urls.txt',
                    file_path + 'procedure_files/solved_urls.txt')
    in_file = open(file_path + 'procedure_files/unsolved_urls.txt',
                   "r",
                   encoding='utf-8')
    for each_line in in_file:
        url = each_line.strip("\n")
        if len(url) <= 0:
            break
        spider = jd_spider.getSpider()
        html_data = spider.get_html(url)  # fetch the product detail page's HTML
        if html_data == -1:
            continue
        number = html_analysis.get_number(html_data)
        if number == -1:
            continue
        spider.get_comments(file_path, str(number))
        out_file = open(file_path + 'procedure_files/solved_urls.txt',
                        "a",
                        encoding='utf-8')
        out_file.write(url + '\n')  # record this URL as processed
        out_file.close()
    in_file.close()
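
A usage sketch, assuming file_path ends with a slash and procedure_files/ already holds unsolved_urls.txt and solved_urls.txt (the path is illustrative):

get_some_comments('./data/')  # appends each processed URL to solved_urls.txt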