Example #1
0
def get_list(url):
    """Return the total result count shown on the listing page at *url*.

    Fetches the page, finds the result-summary container (whose class
    attribute appears both single-quoted and unquoted in the wild) and
    parses the "of N results" figure, stripping thousands separators.
    Returns 0 whenever the container or the count cannot be found.
    """
    html = get_html.get_html(url)
    # '? makes the quote optional, collapsing the original two
    # near-identical quoted/unquoted branches into one search.
    list_info = re.search(
        r"<div class='?result-summary-container'?>(.*?)</div>", html, re.S)
    if list_info is None:
        return 0
    list_num = re.search(r'of[\s]([0-9]?\,?[0-9]+)[\s]results',
                         list_info.group(1), re.S)
    if list_num is None:
        return 0
    # "1,234" -> "1234" before converting to int.
    return int(list_num.group(1).replace(',', ''))
Example #2
0
def handle(list_url):
    """Crawl every result page for one category line and record item ids.

    *list_url* is a tab-separated line "<result_count>\t...\t<url>".
    Walks the paginated listing (40 items per page, capped at 1000
    results), pulls the item ids out of the "displayOrder" JSON blob on
    each page and appends them to the shared result_file.  Any failure
    is printed and swallowed (best-effort crawl).
    """
    try:
        url_info = list_url.split("\t")
        url = url_info[-1].replace("\n", "")
        list_num = int(url_info[0])
        # Example:
        # http://www.walmart.com/browse/books/top-200-books/1229749_1044270?page=2&cat_id=1229749_1044270&facet=retailer:Walmart.com&min_price=0&max_price=5
        items_url_info = url.split("?")
        # The site only exposes the first 1000 results.
        if list_num > 1000:
            list_num = 1000
        # Ceiling division, 40 items per page.  Must stay an int: under
        # Python 3 the original "/" produced a float and range() raised.
        page_num = (list_num + 39) // 40
        for page in range(1, page_num + 1):
            items_url = (items_url_info[0] + '?page=' + str(page)
                         + '&' + items_url_info[1])
            html = get_html.get_html(items_url)
            itemsId_info = re.search(r'"displayOrder":\[(.*?)\]', html, re.S)
            itemsId_all = itemsId_info.group(1).replace('"', '')
            for itemsId in itemsId_all.split(','):
                result_file.write(itemsId + '\n')
                result_file.flush()
                print(itemsId)
    except Exception as e:
        print(e)
def get_all_url(page_num, base_url):
    """Fetch pages base_url+1 .. base_url+page_num and cache each page's
    html under ./html/<page_number>.

    Prints a success/error line per page.  The ./html directory must
    already exist.
    """
    for i in range(1, page_num + 1):
        url = base_url + str(i)
        print(url)
        html = get_html.get_html(url)
        if html:
            # "with" guarantees the handle is closed (the original
            # leaked it).
            with open('./html/' + str(i), 'w') as ff:
                ff.write(html)
            print('write html succeed')
        else:
            print('write html error')
Example #4
0
def get_json(info):
    """Fetch the deal-record feed for one item and append its sale stats
    to the shared result_file.

    *info* is "<basic_info><url>" where the url starts with "http" and
    everything before it is descriptive data for the output row.  Uses
    the shared selenium *driver* to render the page, extracts the
    remark-list url and member id, fetches the JSON feed and writes one
    tab-separated row under *lock*.  Failures are swallowed
    (best-effort crawl).
    """
    driver.implicitly_wait(8)
    try:
        info_url = 'http' + info.split('http')[1]
        basic_info = info.split('http')[0]
        driver.get(info_url)
        html = driver.page_source
        formor_url_list = re.findall(
            r'<div id="mod-detail-dealrecord"(.*?)"isTgcSKUOffer"', html, re.S)
        formor = str(formor_url_list).split('"remarkListUrl":"')[1]
        formor_url = formor.split('","')[0]
        print(formor_url)
        memberId_list = re.findall(
            r'<input type="hidden" id="feedbackUid"(.*?)/>', html, re.S)
        memberId = str(memberId_list).split('value="')[1]
        memberId = memberId.split('"')[0]

        memberId_list_1 = re.findall(r'var WolfSmoke={(.*?)}', html, re.S)
        memberId_1 = str(memberId_list_1).split('member_id:"')[1]
        memberId_1 = memberId_1.split('"')[0]
        print(memberId_1)

        json_url = formor_url + '&currentPage=1&memberId=' + memberId
        print(json_url)
        data_prime = get_html.get_html(json_url)
        print(data_prime)
        finally_file = []
        offerSaleRecordStat = data_prime.split('offerSaleRecordStat":')[1]
        offerSaleRecordStat = offerSaleRecordStat.split(',"currentPage"')[0]
        # e.g. {"repeatBuyCount":6.5,"buyerTotal":123,"saleRecordTotal":594,...}
        numlist = re.findall(r'":(.*?),"', str(offerSaleRecordStat), re.S)
        print(numlist)
        finally_file.append(basic_info)
        # BUG FIX: the original append()ed the list itself, which made
        # "\t".join() raise TypeError (hidden by the bare except) so no
        # row was ever written; extend() flattens the numbers into the row.
        finally_file.extend(numlist)
        lock.acquire()
        try:
            result_file.write("\t".join(finally_file) + "\n")
            result_file.flush()
        finally:
            # Release even if the write fails, or every other worker
            # deadlocks on the shared lock.
            lock.release()
        driver.implicitly_wait(5)

        driver.close()
    except Exception:
        # Deliberate best-effort: skip items whose page doesn't match.
        pass
Example #5
0
def get_items_html(items_file):
    """Fetch the html of every item url listed in *items_file* and hand
    each page to get_info().

    Opens ./Hidden Cameras.xls for append as the shared items_info
    output file and writes the header row first.
    """
    global lock, pool, items_info
    title_list = ['itemsId', 'price', 'ship', 'stock', 'brand', 'title',
                  'img1', 'img2', 'img3', 'detail1', 'detail2', 'detail3',
                  'detail4', 'detail5', 'Specification']
    file_name = './Hidden Cameras.xls'
    # 'aw' is not a valid mode in Python 3; append is what was intended.
    items_info = open(file_name, 'a')
    items_info.write('\t'.join(title_list) + '\n')
    items_info.flush()

    # Close the input file deterministically (original leaked it).
    with open(items_file, 'r') as source:
        items_list = source.readlines()

    for it in items_list:
        print(it)
        html = get_html.get_html(it)
        get_info(html)

    items_info.close()
Example #6
0
def handle(items):
    """Fetch the product-detail page for item id *items* and print the
    html (or 'error'); failures are appended to an error file."""
    # NOTE(review): 'amazom.com' looks like a typo for 'amazon.com' —
    # left unchanged because changing the target host changes behavior.
    url = 'https://www.amazom.com/dp/' + items
    print(url)
    try:
        # Fetch the product detail page.
        html = get_html.get_html(url)

        if html:
            print(html)
        else:
            print('error')

    except Exception as e:
        # BUG FIX: the original wrote items + e, which raises TypeError
        # (str + Exception) in Python 3; coerce the exception to text.
        with open('./Result/asin_is_exist/error.txt', 'a') as fail_url:
            fail_url.write(items + str(e) + '\n')
Example #7
0
def get_bookurl_by_cateurl(url):
    """Collect every book url listed under the category page *url*.

    Reads the page count from page 1, writes its book urls, then walks
    pages 2..pages-1 (retrying each until the served page number matches
    the requested one), appending urls to the shared result_file under
    *lock*.  Blocked pages go to not_crawl_file; errors are logged.
    """
    try:
        html = get_html.get_html(url)
        if html and -1 == html.find("Security Measure"):
            pages = re.findall('<b>Page 1</b> of (.*?)<br>', html)
            if pages:
                pages = int(pages[0].replace(",", ""))
                book_urls = re.findall(
                    '<div style="float:left;" itemscope="itemscope" itemtype="http://schema.org/SearchResultsPage"><b><a href="(.*?)">',
                    html, re.S)
                for book_url in book_urls:
                    lock.acquire()
                    result_file.write(book_url + '\n')
                    result_file.flush()
                    lock.release()
            url = url + 'QQpgZ'
            for page in range(2, pages):
                bookurl = url + str(page)
                print(bookurl)
                # Retry until the site serves the page we asked for
                # (it sometimes redirects back to another page).
                while True:
                    # BUG FIX: the original called get_html(bookurl),
                    # but get_html is a module everywhere else in this
                    # file — the fetch function is get_html.get_html.
                    html = get_html.get_html(bookurl)
                    if html:
                        real_page = re.findall('<b>Page (.*?)</b> of', html)
                        if real_page:
                            if str(page) == real_page[0]:
                                book_urls = re.findall(
                                    '<div style="float:left;" itemscope="itemscope" itemtype="http://schema.org/SearchResultsPage"><b><a href="(.*?)">',
                                    html, re.S)
                                for book_url in book_urls:
                                    lock.acquire()
                                    result_file.write(book_url + '\n')
                                    result_file.flush()
                                    lock.release()
                                break
                        elif -1 != html.find("Try again..."):
                            break
        else:
            lock.acquire()
            not_crawl_file.write(url + "\n")
            not_crawl_file.flush()
            lock.release()
    except Exception as e:
        print(e)
        logger2.error(str(e))
Example #8
0
def get_newegg_url(page_num, new_url):
    """Fetch listing pages new_url+1 .. new_url+page_num and append every
    item number found (CompareItem checkboxes) to ./items_new.txt."""
    for i in range(1, page_num + 1):
        url = new_url + str(i)
        print(url)
        html = get_html.get_html(url)

        if html:
            # <input id="CompareItem_9SIA97A4TG0647" ...
            #  value="CompareItem_9SIA97A4TG0647"> — capture the item
            # number after the prefix.
            items = re.findall(r'value="CompareItem_(.*?)"', html)
            print(len(items))
            # 'aw' is not a valid mode in Python 3; append and close the
            # handle per page (the original leaked one handle per page).
            with open('./items_new.txt', 'a') as result_file:
                for item in items:
                    result_file.write(item + '\n')
        else:
            print('write html error')
Example #9
0
def get_result(itemsId, i):
    """Query the Walmart items API for *itemsId* with the i-th API key
    and return the decoded JSON response as a dict."""
    import json  # local import: the top-of-file import block is outside this view
    key_list = [
        'asfas', '', 'z2pqv4dtuwhxe3hkesx9kqpv', 'fmwnnrwf53d6c5sw7b4pu2q3',
        'jqpyjz92jmaruene4mpbe8pc', 'nz2gzu5byp9dbnm6jee69jkp',
        'sfpw74s5yte8dj9r9atzyc5m', '2tk5sghn56mnth5uabspkdt6',
        'favxs8n4kvmrtebrn6uymjdy', 'vffn6p33smtby3tytugp8zjt',
        'ctztjz3gfm273husdu7apwwh', 'fwx3gf2qdqhx782h9espqajz',
        'f7fqv4jzcdr7ccfb2b339cv9', '7sd4rpjfmdurwuwzgvpbffd2'
    ]
    key = key_list[i]
    items_url = ('http://api.walmartlabs.com/v1/items/' + str(itemsId)
                 + '?apiKey=' + key + '&format=json')
    print(items_url)
    info = get_html.get_html(items_url)
    # SECURITY FIX: the response is untrusted network data — parse it
    # with json.loads instead of eval.  The original's true/null/false
    # local shims existed only to make eval accept JSON literals.
    return json.loads(info)
Example #10
0
def get_asin(base_url, page):
    """Walk *page* listing pages built from the *base_url* template
    (placeholders [page] and [start], 25 items per page) and append each
    product url, prefixed with the frys.com host, to
    ./Result/items_url.txt.
    """
    # Electronics : Computers & Accessories : Monitors : Prime Eligible : New
    for i in range(0, page):  # page index
        start_num = i * 25
        url = base_url.replace("[page]",
                               str(i)).replace('[start]', str(start_num))
        print(url)
        html = get_html.get_html(url)

        url_list_re = re.findall(r'<td colspan="2">(.*?)</td>', html, re.S)
        print(url_list_re)
        url_list = re.findall(r'<A HREF="(.*?)">', str(url_list_re), re.S)
        print(url_list)

        print(len(url_list))
        # 'aw' is not a valid mode in Python 3; open once per page in
        # append mode instead of once per url.
        with open("./Result/items_url.txt", "a") as f:
            for goods_url in url_list:
                f.write('http://www.frys.com/' + goods_url + "\n")
                print(goods_url)
Example #11
0
def handle(itemsId):
    """Fetch the Newegg product page for *itemsId* and hand the html to
    get_info(); failed fetches and exceptions are appended to
    ./get_html_fail.txt and ./except.txt respectively.
    """
    try:
        # Strip surrounding whitespace/newline from the id line.
        itemsId = itemsId.strip()
        # Product detail page.
        url = 'http://www.newegg.com/Product/Product.aspx?Item=' + itemsId
        html = get_html.get_html(url)

        if html:
            # Parse the page, passing the html and the item id.
            get_info(html, itemsId)
        else:
            # 'aw' is not a valid mode in Python 3; append was intended.
            with open('./get_html_fail.txt', 'a') as h:
                h.write(itemsId + '\n')

    except Exception as e:
        print(itemsId, ":", e)
        with open('./except.txt', 'a') as f:
            f.write(itemsId + '\n')
Example #12
0
def get_asin():
    """Walk 221 Amazon search-result pages (page number substituted for
    the [i] placeholders) and append every product-detail url found to
    ./Result/items_url.txt, sleeping 2s between requests.
    """
    # Electronics : Computers & Accessories : Monitors : Prime Eligible : New
    base_url = '''https://www.amazon.com/s/ref=sr_pg_[i]?fst=as%3Aoff&rh=n%3A172282%2Cn%3A!493964%2Cn%3A172541%2Cn%3A12097478011%2Cp_85%3A2470955011%2Cp_n_condition-type%3A2224371011&page=[i]&bbn=12097478011&ie=UTF8&qid=1479085629'''
    for i in range(1, 222):  # page numbers
        url = base_url.replace("[i]", str(i))
        print(url)
        time.sleep(2)  # throttle requests
        html = get_html.get_html(url)

        url_list = re.findall(
            r'<a class="a-link-normal s-access-detail-page .*? href="(.*?)">',
            html, re.S)
        print(len(url_list))

        for goods_url in url_list:
            # 'aw' is not a valid mode in Python 3; append was intended.
            # (The original also computed an items_asin list here that
            # was never used — removed as dead code.)
            with open("./Result/items_url.txt", "a") as f:
                f.write(goods_url + "\n")
                print(goods_url)
Example #13
0
def get_book_html(line):
    """Fetch one half.ebay.com book page and cache it on disk.

    *line* is a url (with trailing newline).  On success the html is
    written to ./result/<opt[0]>/book/<book_id>.html; on failure the
    line is appended to the two fail lists for later retry.
    """
    print('---------------------get_book_html begin----------------------')
    url = line.replace('\n', '')
    book_id = re.findall('http://product.half.ebay.com/.*?/(.*?)&.*?tg=info',
                         url)[0]
    html = get_html.get_html(url)
    if html:
        with open('./result/' + opt[0] + '/book/' + book_id + '.html',
                  'w') as ff:
            ff.write(html)
        print('success:', url)
    else:
        # 'aw' is not a valid mode in Python 3; append was intended.
        with open('./result/' + opt[0] + '/get_book_html_fail.txt',
                  'a') as fff:
            fff.write(line)
        with open('./result/' + opt[0] + '/get_book_info_fail.txt',
                  'a') as ffff:
            ffff.write(line.replace('\n', '') + '.html' + '\n')
        print('fail:', url)
    print('---------------------get_book_html end----------------------')
Example #14
0
def handle(itemsurl):
    """Fetch the product page at *itemsurl* and hand the html to
    get_info(); failed fetches go to ./Result/get_html_fail.txt and
    exceptions to ./Result/fail_url.txt.
    """
    try:
        # Fetch the product detail page.
        html = get_html.get_html(itemsurl)

        if html:
            # Parse the page.
            get_info(html)
            print(html)
        else:
            # 'aw' is not a valid mode in Python 3; append was intended.
            with open('./Result/get_html_fail.txt', 'a') as h:
                h.write(itemsurl + '\n')

    except Exception as e:
        with open('./Result/fail_url.txt', 'a') as fail_url:
            fail_url.write(itemsurl + '\n')
def get_book_info(isbn):
    """Crawl the half.ebay.com listing for *isbn* and emit purchase and
    listing rows.

    Per condition block on the page: collects ISBN-10/13, weight, author
    and up to two qualifying prices (excluding known resellers), filters
    items with missing/zero/over-60oz weight or sub-$1 prices into
    delete_file, and writes qualifying rows to result_file plus a
    ready-to-list row to onshelf_file.  Blocked pages go to
    not_crawl_file and unlisted isbns to not_list_file.  All shared
    output files are guarded by *lock*.
    """
    url = "http://search.half.ebay.com/" + isbn.replace(
        "\n", "") + "_W0QQmZbooksQQ_trksidZp2919Q2em1447Q2el2686"
    print(url)
    time.sleep(2.5)  # throttle requests
    html = get_html.get_html(url)

    if html and -1 == html.find("No products found for"):
        if -1 == html.find("Security Measure"):
            try:
                isbn13 = ''
                isbn = re.findall(
                    r'ISBN-10:</b>\s+<span class=""><a href=.*? class="pdplinks">(.*?)</a>',
                    html, re.S)
                if isbn:
                    isbn = isbn[0]
                else:
                    isbn = ''
                isbn_13 = re.findall(
                    r'ISBN-13:</b>\s+<span class=""><a href=.*? class="pdplinks">(.*?)</a>',
                    html, re.S)
                if isbn_13:
                    isbn13 = isbn_13[0]
                weight = ""
                we = re.findall(
                    r'Weight:</td><td width="80%" valign="top">(.*?)</td>',
                    html, re.S)
                try:
                    auther_list = re.findall(
                        r'<span class="pdplinks">(.*?)</a>', html, re.S)
                    print(auther_list)
                    auther = str(auther_list).split('class="pdplinks">')[1]
                    print(auther)
                    auther = auther.split('<')[0]
                    # BUG FIX: str.replace returns a new string; the
                    # original discarded the result, leaving "']" in
                    # the author field.
                    auther = auther.replace("']", '')
                except Exception:
                    # Fallback layout: author inside a closed span.
                    auther_list = re.findall(
                        r'<span class="pdplinks">(.*?)</span>', html, re.S)
                    auther = re.findall(r'>(.*?)</a>', str(auther_list), re.S)
                    auther = str(auther).replace("']", '')

                if we:
                    weight = we[0]
                if weight == "":  # no weight listed — filter out
                    delete_file.write(isbn13 + "\n")
                    delete_file.flush()
                else:
                    weight_float = float(weight.replace("oz", "").strip())
                    if weight_float > 60 or weight_float == 0:
                        # weight over 60oz or zero — filter out
                        delete_file.write(isbn13 + "\n")
                        delete_file.flush()
                    else:
                        # One (condition title, offer table) pair per
                        # condition block on the page.
                        results = re.findall(
                            '<h2 class="PDP_itemConditionTitle">(.*?)</h2>.*?<table cellpadding="0" cellspacing="0" class="PDP_itemList">(.*?)</table>',
                            html, re.S)
                        if results:
                            # Condition title -> internal condition code.
                            condition = {
                                "Brand New": "11",
                                "Like New": "1",
                                "Very Good": "2",
                                "Good": "3",
                                "Acceptable": "4"
                            }
                            for i in range(len(results)):
                                book_info = []
                                shelf_info = []  # listing (on-shelf) row

                                book_info.append(isbn)
                                book_info.append(isbn13)
                                book_info.append(weight)
                                book_info.append(auther)
                                book_info.append(results[i][0])

                                # Listing SKU: isbn_conditionCode_O_MM
                                sku = (isbn + "_" + condition[results[i][0]]
                                       + "_O_MM")
                                shelf_info.append(sku)
                                shelf_info.append(isbn)
                                shelf_info.append("1")
                                # (price, seller) pairs; known resellers
                                # are excluded below.
                                prices = re.findall(
                                    '<td><span class="PDP_itemPrice">(.*?)</span></td>.*?<td><a class="PDP_sellerName" href=".*?">(.*?)</a></td>',
                                    str(results[i]), re.S)
                                exclude_seller = [
                                    'alibris_books_01', 'alibris_books_02',
                                    'alibris_books_03', 'alibris_books_04',
                                    'alibris_books_05', 'alibris_books_06',
                                    'alibris_books_07', 'alibris_books_08',
                                    'alibris_books_09', 'alibris',
                                    'alibris_movies', 'labsbooks11'
                                ]
                                price_float = 0.0  # purchase price

                                if len(prices) == 1:
                                    # Single offer under this condition.
                                    price_float = float(prices[0][0].replace(
                                        "$", "").replace(",", ""))
                                    if prices[0][1] not in exclude_seller:
                                        book_info.append(prices[0][0])
                                    else:
                                        break
                                elif len(prices) == 2:
                                    # Two offers under this condition.
                                    count = 0
                                    for j in range(2):
                                        if prices[j][1] not in exclude_seller:
                                            book_info.append(prices[j][0])
                                            # highest qualifying price
                                            orig_price = prices[j][0]
                                            count += 1
                                    if count == 0:
                                        break
                                    else:
                                        # purchase price
                                        price_float = float(
                                            orig_price.replace(
                                                "$", "").replace(",", ""))

                                else:
                                    # Three or more offers: keep the
                                    # first two qualifying ones.
                                    count = 0
                                    for j in range(len(prices)):
                                        if prices[j][1] not in exclude_seller:
                                            book_info.append(prices[j][0])
                                            # highest qualifying price
                                            orig_price = prices[j][0]
                                            count += 1
                                        if count == 2:
                                            break
                                    if count == 0:
                                        break
                                    else:
                                        # purchase price
                                        price_float = float(
                                            orig_price.replace(
                                                "$", "").replace(",", ""))

                                # Listing price: the larger of a 50%
                                # markup over (price + 3.99) and a flat
                                # +23.99 margin.
                                if price_float > 1:
                                    p1 = (price_float + 3.99) * 1.5
                                    p2 = price_float + 23.99

                                    price = str(max(p1, p2))
                                    shelf_info.append(price)
                                    shelf_info.append(price)
                                    shelf_info.append("8888800")
                                    shelf_info.append(condition[results[i][0]])
                                    shelf_info.append("1")
                                    shelf_info.append("1")
                                    shelf_info.append("N")
                                    shelf_info.append("5")
                                else:
                                    # Too cheap to list — record and skip
                                    # the rest of this page.
                                    lock.acquire()
                                    delete_file.write("\t".join(book_info) +
                                                      "\n")
                                    delete_file.flush()
                                    lock.release()
                                    break

                                lock.acquire()
                                result_file.write("\t".join(book_info) + "\n")
                                result_file.flush()
                                onshelf_file.write("\t".join(shelf_info) +
                                                   "\n")
                                onshelf_file.flush()
                                lock.release()
                            print("success:", isbn)
                            lock.acquire()
                            success_isbn_file.write(isbn13 + "\n")
                            success_isbn_file.flush()
                            lock.release()
                        else:
                            print("offshelf:", isbn)
                            lock.acquire()
                            offshelf_isbn_file.write(isbn13 + "\n")
                            offshelf_isbn_file.flush()
                            lock.release()
            except BaseException as e:
                print(isbn, e)
                logger2.exception(str(e))
        else:
            # Blocked by the site's security page.
            lock.acquire()
            not_crawl_file.write(isbn)
            not_crawl_file.flush()
            lock.release()
    else:
        # No listing for this isbn.
        lock.acquire()
        not_list_file.write(isbn)
        not_list_file.flush()
        lock.release()
Example #16
0
def get_book_isbn(url):
    """Crawl one half.ebay.com book page at *url* and write one row per
    condition block (ISBN-10, ISBN-13, condition, up to two qualifying
    prices) to the shared result_file under *lock*.

    Known resellers are excluded from the prices.  Blocked pages go to
    not_crawl_file; pages with no condition blocks go to
    offshelf_isbn_file.
    """
    try:
        time.sleep(2.5)  # throttle requests
        html = get_html.get_html(url.replace("\n", ""))
        if html and -1 == html.find("Security Measure"):
            try:
                isbn13 = ''
                isbn = re.findall(
                    'ISBN-10:</b>\s+<span class=""><a href=.*? class="pdplinks">(.*?)</a>',
                    html)
                if isbn:
                    isbn = isbn[0]
                else:
                    isbn = ''
                isbn_13 = re.findall(
                    'ISBN-13:</b>\s+<span class=""><a href=.*? class="pdplinks">(.*?)</a>',
                    html)
                if isbn_13:
                    isbn13 = isbn_13[0]
                results = re.findall(
                    '<h2 class="PDP_itemConditionTitle">(.*?)</h2>.*?<table cellpadding="0" cellspacing="0" class="PDP_itemList">(.*?)</table>',
                    html, re.S)
                if results:
                    for i in range(len(results)):
                        book_info = []
                        book_info.append(isbn)
                        book_info.append(isbn13)
                        book_info.append(results[i][0])
                        # (price, seller) pairs; known resellers are
                        # excluded below.
                        prices = re.findall(
                            '<td><span class="PDP_itemPrice">(.*?)</span></td>.*?<td><a class="PDP_sellerName" href=".*?">(.*?)</a></td>',
                            str(results[i]), re.S)
                        exclude_seller = [
                            'alibris_books_01', 'alibris_books_02',
                            'alibris_books_03', 'alibris_books_04',
                            'alibris_books_05', 'alibris_books_06',
                            'alibris_books_07', 'alibris_books_08',
                            'alibris_books_09', 'alibris', 'alibris_movies'
                        ]
                        # BUG FIX: the inner loops below reused the loop
                        # variable i, clobbering the outer condition-block
                        # index and corrupting/skipping later iterations;
                        # renamed to j.
                        if len(prices) == 1:
                            if prices[0][1] not in exclude_seller:
                                book_info.append(prices[0][0])
                        elif len(prices) == 2:
                            for j in range(2):
                                if prices[j][1] not in exclude_seller:
                                    book_info.append(prices[j][0])
                        else:
                            count = 0
                            for j in range(len(prices)):
                                if prices[j][1] not in exclude_seller:
                                    book_info.append(prices[j][0])
                                    count += 1
                                if count == 2:
                                    break

                        lock.acquire()
                        result_file.write("\t".join(book_info) + "\n")
                        result_file.flush()
                        lock.release()
                    print("success:", isbn)
                    lock.acquire()
                    success_url_file.write(url)
                    success_url_file.flush()
                    lock.release()
                else:
                    print("offshelf:", isbn)
                    lock.acquire()
                    offshelf_isbn_file.write(isbn + "\n")
                    offshelf_isbn_file.flush()
                    lock.release()
            except BaseException as e:
                print(isbn, e)
                logger2.exception(str(e))
        else:
            # Blocked by the site's security page.
            lock.acquire()
            not_crawl_file.write(url)
            not_crawl_file.flush()
            lock.release()
    except Exception as e:
        print('error:', str(e))
        logger2.error(str(e))
Example #17
0
def get_info(html, itemsId):

    items_json = {
        'registered_land':
        '',
        'isbn':
        '',
        'description':
        '',
        'weight':
        '0.0000',
        'ean':
        '',
        'mpn':
        '',
        'key_name':
        ' ',
        'price':
        '',
        'height':
        '0.0000',
        'currency':
        'USD',
        'brand':
        '',
        'length_class':
        '',
        'product_id':
        '',
        'category':
        '',
        'jan':
        '',
        'seller_id':
        '',
        'name':
        '',
        'keyword':
        '',
        'weight_class':
        'kg',
        'url':
        '',
        'key_attribute':
        '',
        'detail': {},
        'shipping':
        '',
        'orders':
        0,
        'reviews':
        '0',
        'width':
        '0.0000',
        'length':
        '0.0000',
        'location':
        '',
        'attributes': [{
            'price': '',
            'variation_id': '',
            'dictory': 'Ships From',
            'attributes': {
                'Ships From': ''
            },
            'image': [],
            'quantity': 99
        }],
        'category_id_path':
        '',
        'category_id':
        '',
        'upc':
        '',
        'image': []
    }

    #读取价格字段存入
    # price = re.findall("<meta itemprop='price' content='(.*?)'", html)[0]
    # items_json['price'] = price

    # print '----------------isbn--------------'
    items_json['isbn'] = itemsId

    # print '----------------price--------------'
    url = 'http://www.newegg.com/Product/MappingPrice2012.aspx?Item=' + itemsId
    html_price = get_html.get_html(url)
    price = re.findall(
        '<span class="price-was-data" style="display: none">(.*?)</span>',
        html_price)
    if price:
        price = price[0]
    else:
        price = 'None'
    items_json['price'] = price

    # print '----------------url--------------'
    items_json['url'] = url

    # print '----------------shipping--------------'
    ship = '0'
    ship = re.search(r'product_default_shipping_cost:\[(.*?)\]', html)
    if ship:
        ship = ship.group(1).replace("'", "")
    items_json['shipping'] = ship

    # print '----------------brand--------------'
    brand = ''
    brand = re.search(r'product_manufacture:\[(.*?)\]', html)
    if brand:
        brand = brand.group(1).replace("'", "")
    items_json['brand'] = brand

    # print '----------------name--------------'
    name = ''
    name_info = re.search(r'product_title:\[(.*?)\]', html)
    if name_info:
        name = name_info.group(1).replace("'", "").replace(
            '&amp;',
            '').replace("#34;",
                        "''").replace("#40;",
                                      "(").replace('#41;',
                                                   ')').replace("#47;", "\\")
        name = name.decode("ascii").encode("utf-8")
    items_json['name'] = name

    # print '----------------weight--------------'
    weight_number = 0.0000
    items_json['weight'] = weight_number

    # print '----------------weight_class--------------'
    weight_class = 'kg'
    items_json['weight_class'] = weight_class

    # print '----------------height--------------'
    height_number = 0.0000
    items_json['height'] = height_number

    # print '----------------width--------------'
    width = 0.0000
    items_json['width'] = width

    # print '----------------length_class--------------'
    length_class = 'cm'
    items_json['length_class'] = length_class

    # print '----------------product_id--------------'
    items_json['product_id'] = itemsId

    # print '----------------reviews--------------
    reviews = '0'
    items_json['reviews'] = reviews

    # print '----------------upc--------------'
    upc = ''
    items_json['upc'] = upc

    # print '----------------seller_id--------------'
    sellerId = ''
    items_json['seller_id'] = sellerId

    # print '----------------detail----------详描----'
    Specification = {}
    spct_info = re.search(r'<div id="Specs" class=.*?>(.*?)</div>', html, re.S)
    if spct_info is not None:
        spct_info = spct_info.group(1)
        spct_list = re.findall(r'<dl><dt>(.*?)</dt><dd>(.*?)</dd></dl>',
                               spct_info, re.S)
        if spct_list:
            for spct in spct_list:
                temp1 = re.sub(r'<[^>]+>', '', spct[0], re.S)
                temp2 = re.sub(r'<[^>]+>', '', spct[1], re.S)
                Specification[temp1] = temp2
    items_json['detail'] = Specification

    # Specification = str(Specification).replace('{', '').replace('}', '').replace("'", '').replace(',','').replace('<br>','')
    # if len(items_info['Specification']) >= 2000:
    #     items_info['Specification'] = items_info['Specification'][0:1999]

    image = []
    image_info = re.search(r'"imageSetImageList":"(.*?)"', html, re.S)
    image_list = ''
    if image_info is not None:
        image_list = image_info.group(1)
        image_all = image_list.split(',')
        for images in image_all:
            images = 'http://images17.newegg.com/is/image/newegg/' + images
            image.append(images)
    if image_info is None:
        image_info = re.search(r'"imageNameList":"(.*?)"\}', html, re.S)
        if image_info is not None:
            image_list = image_info.group(1)
            image_all = image_list.split(',')
            for images in image_all:
                if images != '"dfis360ImgFlag":"':
                    images = images.split('"')[0]
                    images = 'http://images10.newegg.com/ProductImage/' + images
                    image.append(images)
    items_json['image'] = image

    category_dict = {
        'Computer Systems': 'ID-CS-503',
        'Components': 'ID-C-504',
        'Electronics': 'ID-E-505',
        'Gaming': 'ID-G-506',
        'Networking': 'ID-N-507',
        'Office Solutions': 'ID-OS-508',
        'Software Services': 'ID-SS-509',
        'Automotive Industrial': 'ID-AI-510',
        'Home Tools': 'ID-HT-511',
        'Health Sports': 'ID-HS-512',
        'Apparel Accessories': 'ID-AA-513',
        'Hobbies Toys': 'ID-HT-514'
    }

    # print '----------------category--------------'
    category_string_html = re.findall(
        r'<div id="baBreadcrumbTop" style="max-width:1420px; margin:0px auto;">(.*?)</div>',
        html, re.S)
    category_html_list = re.findall(r'title="(.*?)"',
                                    str(category_string_html), re.S)
    print(category_html_list)
    category_string = ''
    for category_s in category_html_list[2:]:
        category_string = category_string + category_s + '>'
    items_json['category'] = category_string[:-1]

    # print '----------------category_id_path--------------'
    category_url_list = re.findall(r'href="(.*?)"', str(category_string_html),
                                   re.S)
    category_id_path = ''
    for url_path in category_url_list:
        url_path = url_path.split('?')[0]
        category_id_path = category_id_path + url_path.split('/')[-1] + '>'
    items_json['category_id_path'] = str(
        category_dict[category_html_list[2]]) + category_id_path[5:-1]

    # print '----------------category_id----------------'
    items_json['category_id'] = category_id_path.split('>')[-2]

    # print '----------------attributes----------------'
    items_json['attributes'][0]['price'] = str(price)
    items_json['attributes'][0]['variation_id'] = itemsId + '_' + itemsId
    items_json['attributes'][0]['image'] = image[0]

    # print '----------------description-------短描-------'
    details = []
    description_str = ''
    detail_info = re.search(r'<ul class="itemColumn">(.*?)</ul>', html, re.S)
    if detail_info is not None:
        detail_info = detail_info.group(1)
        detail_info = detail_info.replace('\r\n', '').replace('\t', '')
        detail = re.findall(r'<li.*?>(.*?)</li>', detail_info, re.S)
        for i in detail:
            i = i.strip()
            i = i.decode("ascii").encode("utf-8")
            details.append(i)
    #函数get_feature_dict
    for description in details:
        description_str = description_str + description + ';'
    items_json['description'] = description_str

    # print '----------------写入文件----------------'
    json_file = json.dumps(items_json)
    result_file.write(json_file + '\n')
    print('=============')
    print(items_json)
    result_file.flush()
Example #18
0
def get_info(items_info):  # extract the fields for one item
    """Build a flat result dict for one Walmart API item record.

    *items_info* is a decoded item record from the Walmart product API.
    Fetches the item's product page to fill fields the API record may
    lack (brand fallback, specification table), then returns the result
    dict merged with the feature dict derived from the short description.
    """
    items_result = {
        'itemsId': '',
        'price': '',
        'ship': '',
        'stock': '',
        'image1': '',
        'brand': '',
        'title': '',
        'product_attr': '',
        'Specification': ''
    }

    price = ''
    if 'salePrice' in items_info:
        price = items_info['salePrice']
    items_result['price'] = price

    sku = ''
    if 'itemId' in items_info:
        sku = items_info['itemId']
    items_result['itemsId'] = sku

    items_url = 'http://www.walmart.com/ip/' + str(sku)
    # BUGFIX: the fallback was bound to `item_html` while everything
    # below reads `items_html`, so a failed fetch raised NameError.
    items_html = ''
    try:
        items_html = get_html.get_html(items_url)
    except Exception as e:
        print(e)

    stock = ''
    if 'stock' in items_info:
        stock_info = items_info['stock']
        if stock_info == 'Not Available' or stock_info == 'Not available':
            stock = 'out of stock'
        if stock_info == 'Available':
            stock = 'in stock'
    items_result['stock'] = stock

    ship = '0'
    if 'standardShipRate' in items_info:
        ship = items_info['standardShipRate']
    items_result['ship'] = ship

    feature = ''
    if 'shortDescription' in items_info:
        feature = items_info['shortDescription']
    items_result['feature'] = feature

    title = ''
    if 'name' in items_info:
        title = items_info['name']
    items_result['title'] = title

    brand = ''
    if 'brandName' in items_info:
        brand = items_info['brandName']
    else:
        # Fall back to scraping the product page when the API record
        # carries no brand name.
        brand_info = re.search(r'<span itemprop=brand>(.*?)</span>',
                               items_html, re.S)
        if brand_info is not None:
            brand = brand_info.group(1)
    items_result['brand'] = brand

    reviews = ''
    if 'numReviews' in items_info:
        reviews = items_info['numReviews']
    items_result['reviews'] = reviews

    product_attr = ''
    if 'attributes' in items_info:
        product_attr = items_info['attributes']
    items_result['product_attr'] = product_attr

    # Short description -> HTML-unescaped, tag-stripped sentence list.
    feature_list = []
    if 'shortDescription' in items_info:
        short_description_info = items_info['shortDescription']
        html = HTMLParser.HTMLParser()
        short_description_html = html.unescape(short_description_info)
        short_description = re.sub(r'<[^>]+>', '', short_description_html)
        feature_list = short_description.split('.')
    feature_dic = get_feature_dict(feature_list)

    # Specification table scraped from the product page.
    Specifications = {}
    Specifications_list = re.findall(
        r'<tr class=js-product-specs-row>[\s]*<td>(.*?)</td>[\s]*<td>(.*?)</td>[\s]*</tr>',
        items_html, re.S)
    for Specifications_info in Specifications_list:
        # BUGFIX: re.sub's 4th positional argument is `count`, so passing
        # re.S there silently capped the number of substitutions; the flag
        # must go by keyword (it is a no-op for this pattern anyway, since
        # [^>] already matches newlines).
        tmp1 = re.sub(r'<[^>]+>', '', Specifications_info[0], flags=re.S)
        tmp2 = re.sub(r'<[^>]+>', '', Specifications_info[1], flags=re.S)
        Specifications[tmp1] = tmp2
    if len(str(Specifications)) > 1000:
        Specifications = str(Specifications)[:1000]
    items_result['Specification'] = Specifications

    # BUGFIX: `image` was only assigned inside the guarded branch, so a
    # record without 'largeImage' raised NameError on the write below.
    image = ''
    if 'largeImage' in items_info:
        image = items_info['largeImage']
    # NOTE(review): the template dict declares 'image1' but this writes
    # 'img1'; downstream readers appear to use 'img1', so it is kept.
    items_result['img1'] = image

    # Merge in the feature dict (Py2/Py3-compatible form; the original
    # dict(a.items() + b.items()) only works on Python 2).
    items_dic = dict(items_result)
    items_dic.update(feature_dic)
    return items_dic
Example #19
0
#             data = gzipper.read()
#             html = data
#         return html
#     except Exception as e:
#         print 'error:', str(e)
#         logger2.error(str(e))
#         get_html(url)
'''
  获取书籍品类url
'''
if __name__ == '__main__':
    # Scrape the Half.com books landing page and write one
    # "url|category" line per top-level book category.
    if not os.path.exists('./result'):
        os.mkdir('./result')
    logger1.info('get category url begin')
    # BUGFIX: 'aw' is not a valid open() mode (ValueError on Python 3,
    # platform-dependent on Python 2); plain 'w' both creates and
    # truncates the file, so the explicit truncate(0) is unnecessary.
    with open('./result/cate_url.txt', 'w') as f:
        try:
            base_category_url = 'http://books.half.ebay.com/'
            html = get_html.get_html(base_category_url)
            separator = re.findall(
                '<div class="separator">.*?<ul class="metascateitems">(.*?)</ul>',
                html,
                re.S)[0].replace('\n', '').replace('\t', '').replace('\r', '')
            cate_urls = re.findall('<li><a href="(.*?)">(.*?)</a></li>',
                                   separator)
            for cate_url in cate_urls:
                url = cate_url[0]
                # Undo the one HTML entity that appears in category names.
                cate = cate_url[1].replace('&amp;', '&')
                f.write(url + '|' + cate + '\n')
            print ('-----------get category url successfuly--------------')
        except Exception as e:
            print ('error:', str(e))
            logger2.error(str(e))
    logger1.info('get category url success')
Example #20
0
def get_info(html, itemsId):
    """Parse a Newegg product page and append one TSV row to result_file.

    *html* is the product page markup, *itemsId* the Newegg item id.
    Also fetches the item's mapping-price page for the pre-sale price.
    Writes one tab-separated value per column in `titles` (then a
    newline), each write guarded by `lock`.  Returns None.
    """
    items_info = {
        'itemsId': itemsId,
        'price': '',
        'Original_price': '',
        'ship': '',
        'stock': '',
        'brand': '',
        'title': '',
        'Specification': ''
    }

    # -------- price --------
    # BUGFIX: findall(...)[0] raised IndexError when the meta tag was
    # missing; keep '' in that case.
    price_list = re.findall("<meta itemprop='price' content='(.*?)'", html)
    items_info['price'] = price_list[0] if price_list else ''

    # -------- original (pre-sale) price, from the mapping-price page ----
    url = 'http://www.newegg.com/Product/MappingPrice2012.aspx?Item=' + itemsId
    html_price = get_html.get_html(url)
    orgin_price = re.findall(
        '<span class="price-was-data" style="display: none">(.*?)</span>',
        html_price)
    items_info['Original_price'] = orgin_price[0] if orgin_price else 'None'

    # -------- shipping cost --------
    # BUGFIX: the default '0' was overwritten by the raw re.search
    # result, so a miss wrote None instead of '0'.
    ship = '0'
    ship_info = re.search(r'product_default_shipping_cost:\[(.*?)\]', html)
    if ship_info:
        ship = ship_info.group(1).replace("'", "")
    items_info['ship'] = ship

    # -------- stock --------
    stock = ''
    stock_info = re.search(r'product_instock:\[(.*?)\]', html)
    if stock_info:
        stock_info = stock_info.group(1).replace("'", "")
        if int(stock_info) == 1:
            stock = 'In stock'
        else:
            stock = 'out of stock'
    items_info['stock'] = stock

    # -------- brand --------
    # BUGFIX: same default-clobbering pattern as `ship` above.
    brand = ''
    brand_info = re.search(r'product_manufacture:\[(.*?)\]', html)
    if brand_info:
        brand = brand_info.group(1).replace("'", "")
    items_info['brand'] = brand

    # -------- title --------
    name = ''
    name_info = re.search(r'product_title:\[(.*?)\]', html)
    if name_info:
        name = name_info.group(1).replace("'", "").replace(
            '&amp;',
            '').replace("#34;",
                        "''").replace("#40;",
                                      "(").replace('#41;',
                                                   ')').replace("#47;", "\\")
        # Py2-only round trip; raises UnicodeDecodeError on non-ASCII.
        name = name.decode("ascii").encode("utf-8")
    items_info['title'] = name

    # -------- specification table --------
    Specification = {}
    spct_info = re.search(r'<div id="Specs" class=.*?>(.*?)</div>', html, re.S)
    if spct_info is not None:
        spct_list = re.findall(r'<dl><dt>(.*?)</dt><dd>(.*?)</dd></dl>',
                               spct_info.group(1), re.S)
        for spct in spct_list:
            # BUGFIX: re.S was being passed as re.sub's `count` argument,
            # silently capping the number of substitutions.
            temp1 = re.sub(r'<[^>]+>', '', spct[0], flags=re.S)
            temp2 = re.sub(r'<[^>]+>', '', spct[1], flags=re.S)
            Specification[temp1] = temp2
    items_info['Specification'] = str(Specification).replace('{', '').replace(
        '}', '').replace("'", '').replace(',', '<br>')
    if len(items_info['Specification']) >= 2000:
        items_info['Specification'] = items_info['Specification'][0:1999]

    # -------- images --------
    image = []
    image_info = re.search(r'"imageSetImageList":"(.*?)"', html, re.S)
    if image_info is not None:
        for images in image_info.group(1).split(','):
            image.append('http://images17.newegg.com/is/image/newegg/' +
                         images)
    else:
        # Fallback image list used by older page templates.
        image_info = re.search(r'"imageNameList":"(.*?)"\}', html, re.S)
        if image_info is not None:
            for images in image_info.group(1).split(','):
                if images != '"dfis360ImgFlag":"':
                    images = images.split('"')[0]
                    image.append(
                        'http://images10.newegg.com/ProductImage/' + images)
    image_dict = get_img_dict(image)

    # -------- bullet-point details --------
    details = []
    detail_info = re.search(r'<ul class="itemColumn">(.*?)</ul>', html, re.S)
    if detail_info is not None:
        detail_text = detail_info.group(1).replace('\r\n', '').replace(
            '\t', '')
        for i in re.findall(r'<li.*?>(.*?)</li>', detail_text, re.S):
            i = i.strip()
            # Py2-only round trip; raises on non-ASCII bullet text.
            i = i.decode("ascii").encode("utf-8")
            details.append(i)
    detail_dict = get_feature_dict(details)

    # Merge all dicts (Py2/Py3-compatible form of items() concatenation).
    items_dict = dict(items_info)
    items_dict.update(image_dict)
    items_dict.update(detail_dict)

    # One tab-separated value per configured column, written under lock.
    for k in titles:
        value = items_dict.get(k, 'None')
        lock.acquire()
        result_file.write(str(value) + "\t")
        result_file.flush()
        lock.release()

    lock.acquire()
    result_file.write('\n')
    result_file.flush()
    lock.release()
Example #21
0
def get_category():
    """Dump the Walmart taxonomy tree to two files under ./result.

    Writes an indented outline (levels 1-4, Roman-numeral prefixes for
    levels 1-3) to ./result/category.txt, and to ./result/category_id.txt
    the id of every level-4 category plus every level-3 category that
    has no children of its own.
    """
    print('')
    global category_file
    category_file = open('./result/category.txt', 'w')
    category_id_file = open('./result/category_id.txt', 'w')
    # Outline prefix per depth (renamed from `file`, which shadowed the
    # Python 2 builtin).
    level_prefix = {
        '1': 'I    ',
        '2': 'II    ',
        '3': 'III    ',
        '4': 'IV    ',
        '5': 'V    '
    }

    # Spare API keys for rotation; only the last one is baked into the
    # request URL below.
    key = [
        'f7fqv4jzcdr7ccfb2b339cv9', 'fmwnnrwf53d6c5sw7b4pu2q3',
        '7sd4rpjfmdurwuwzgvpbffd2', 'jqpyjz92jmaruene4mpbe8pc',
        'z2pqv4dtuwhxe3hkesx9kqpv', 'nz2gzu5byp9dbnm6jee69jkp',
        'sfpw74s5yte8dj9r9atzyc5m ', '2tk5sghn56mnth5uabspkdt6'
    ]
    url = 'http://api.walmartlabs.com/v1/taxonomy?apiKey=2tk5sghn56mnth5uabspkdt6'
    info = get_html.get_html(url)
    # NOTE(review): eval() on a fetched response body is unsafe; the body
    # is JSON, so json.loads would be the safe equivalent — left as-is to
    # avoid changing parsing behavior.
    info = eval(info)
    categories = info['categories']

    print(len(categories))
    for category in categories:
        category1 = category['name']
        category1_id = category['id']
        category_file.write(level_prefix['1'] + category1 + '\t' +
                            category1_id + '\n')
        category_file.flush()
        category2_info = category.get('children', [])
        print(category2_info)
        for category2_all in category2_info:
            category2 = category2_all['name']
            category2_id = category2_all['id']
            category_file.write('\t' + level_prefix['2'] + category2 + '\t' +
                                category2_id + '\n')
            category_file.flush()
            # BUGFIX: this loop previously re-read category['children']
            # (the level-2 list), duplicating level 2 as level 3 under
            # every subcategory; descend into the current subcategory
            # instead.  .get() also guards leaf level-2 nodes.
            for category3_all in category2_all.get('children', []):
                category3 = category3_all['name']
                category3_id = category3_all['id']
                category_file.write('\t\t' + level_prefix['3'] + category3 +
                                    '\t' + category3_id + '\n')
                category_file.flush()
                if 'children' in category3_all:
                    for category4_all in category3_all['children']:
                        category4 = category4_all['name']
                        category4_id = category4_all['id']
                        category_file.write('\t\t\t' + category4 + '\t' +
                                            category4_id + '\n')
                        category_file.flush()
                        category_id_file.write(category4_id + '\n')
                        category_id_file.flush()
                        if 'children' in category4_all:
                            # Level 5 exists but is only logged, never
                            # written to the files.
                            print(category4_all['children'])
                else:
                    # Childless level-3 node: treat it as a leaf id.
                    category_id_file.write(category3_id + '\n')
                    category_id_file.flush()
    category_file.close()
    category_id_file.close()
Example #22
0
def get_info(items_info):  # build the JSON record for one item
    """Convert a Walmart API item record into the standard JSON record.

    *items_info* is a decoded item record from the Walmart product API.
    Fetches the item's product page to scrape the specification table
    and a brand fallback, then returns the filled record serialized
    with json.dumps.
    """
    # Template record; unknown fields keep these defaults.
    items_json = {
        'registered_land': '',
        'isbn': '',
        'description': '',
        'weight': '0.0000',
        'ean': '',
        'mpn': '',
        'key_name': ' ',
        'price': '',
        'height': '0.0000',
        'currency': 'USD',
        'brand': '',
        'length_class': '',
        'product_id': '',
        'category': '',
        'jan': '',
        'seller_id': '',
        'name': '',
        'keyword': '',
        'weight_class': 'kg',
        'url': '',
        'key_attribute': '',
        'detail': {},
        'shipping': '',
        'orders': 0,
        'reviews': '0',
        'width': '0.0000',
        'length': '0.0000',
        'location': '',
        'attributes': [{
            'price': '',
            'variation_id': '',
            'dictory': 'Ships From',
            'attributes': {
                'Ships From': ''
            },
            'image': [],
            'quantity': 99
        }],
        'category_id_path': '',
        'category_id': '',
        'upc': '',
        'image': []
    }

    # isbn and product_id both carry the Walmart item id.
    sku = ''
    if 'itemId' in items_info:
        sku = items_info['itemId']
    items_json['isbn'] = sku
    items_json['product_id'] = sku

    price = ''
    if 'salePrice' in items_info:
        price = items_info['salePrice']
    items_json['price'] = str(price)

    items_url = 'http://www.walmart.com/ip/' + str(sku)
    # BUGFIX: the fallback was bound to `item_html` while the scraping
    # code below reads `items_html`, so a failed fetch raised NameError.
    items_html = ''
    try:
        items_html = get_html.get_html(items_url)
    except Exception as e:
        print(e)

    ship = '0'
    if 'standardShipRate' in items_info:
        ship = items_info['standardShipRate']
    items_json['shipping'] = ship

    title = ''
    if 'name' in items_info:
        title = items_info['name']
    items_json['name'] = title

    brand = 'no brand'
    if 'brandName' in items_info:
        brand = items_info['brandName']
    else:
        # Fall back to scraping the product page for the brand.
        brand_info = re.search(r'<span itemprop=brand>(.*?)</span>',
                               items_html, re.S)
        if brand_info is not None:
            brand = brand_info.group(1)
    items_json['brand'] = brand

    # Category path "A/B/C" -> "A>B>C".
    items_json['category'] = str(items_info['categoryPath']).replace('/', '>')

    items_json['seller_id'] = ''

    reviews = ''
    if 'numReviews' in items_info:
        reviews = items_info['numReviews']
    items_json['reviews'] = reviews

    # Category id path "1_2_3" -> "1>2>3".
    category_id_path = str(items_info['categoryNode']).replace('_', '>')
    # BUGFIX: the old [:-1] slice (copied from code that strips a
    # trailing separator) chopped the last digit of the final id — the
    # converted path has no trailing '>'.
    items_json['category_id_path'] = category_id_path

    # BUGFIX: the underscores were already replaced above, so
    # split('_') returned the whole path instead of the leaf id.
    items_json['category_id'] = category_id_path.split('>')[-1]

    # BUGFIX: a record without 'upc' raised KeyError.
    items_json['upc'] = items_info.get('upc', '')

    # Short description -> HTML-unescaped, tag-stripped, ';'-joined.
    feature_list = []
    if 'shortDescription' in items_info:
        unescaper = HTMLParser.HTMLParser()
        short_description = re.sub(
            r'<[^>]+>', '',
            unescaper.unescape(items_info['shortDescription']))
        feature_list = short_description.split('.')
    description_str = ''
    for description in feature_list:
        description_str = description_str + description + ';'
    items_json['description'] = description_str

    # Physical attributes are unavailable from the API; defaults only.
    items_json['weight'] = 0.0000
    items_json['height'] = 0.0000
    items_json['width'] = 0.000
    items_json['length_class'] = 'cm'
    items_json['weight_class'] = 'kg'

    items_json['url'] = str(items_info['productUrl'])

    # Specification table scraped from the product page.
    Specifications = {}
    Specifications_list = re.findall(
        r'<tr class=js-product-specs-row>[\s]*<td>(.*?)</td>[\s]*<td>(.*?)</td>[\s]*</tr>',
        items_html, re.S)
    for Specifications_info in Specifications_list:
        # BUGFIX: re.S was being passed as re.sub's `count` argument,
        # silently capping the number of substitutions.
        tmp1 = re.sub(r'<[^>]+>', '', Specifications_info[0], flags=re.S)
        tmp2 = re.sub(r'<[^>]+>', '', Specifications_info[1], flags=re.S)
        Specifications[tmp1] = tmp2
    items_json['detail'] = str(Specifications)

    image = ''
    if 'largeImage' in items_info:
        image = items_info['largeImage']
    items_json['image'] = image

    # First (and only) variation row.
    items_json['attributes'][0]['price'] = str(price)
    variation_id = str(items_info.get('parentItemId', sku)) + '_' + str(
        items_info.get('itemId', sku))
    items_json['attributes'][0]['variation_id'] = str(variation_id)
    # BUGFIX: `image` is a URL string here, so image[0] stored only its
    # first character; the template expects a list of image URLs.
    items_json['attributes'][0]['image'] = [image] if image else []

    return json.dumps(items_json)