Example #1
import traceback

import sixlucky_parse_pages  # project-local page parser used by both examples


def sixlucky_parse_list_file(main_cat, sub_cat, list_file, db_name='test.db'):

    print("\n==> Start to parse list file: %s ..." % list_file)

    global commodity_index

    # Read the previously dumped list page; bail out if there is nothing to parse
    html_text = ''
    if list_file != '':
        with open(list_file, 'r') as f:
            html_text = f.read()

    if not html_text:
        return

    ret = sixlucky_parse_pages.parse_list_page(html_text, True, {})

    list_dump = {'list_file': list_file, 'commodities': []}

    # Dump and parse every commodity linked from this list page
    for commodity_url in ret['link']:
        try:
            print('commodity_index: %d' % commodity_index)
            m = {'main_cat': main_cat, 'sub_cat': sub_cat}
            m = sixlucky_dump_and_parse_commodity(commodity_url, main_cat, sub_cat, commodity_index, m)
            list_dump['commodities'].append(m)

            commodity_index += 1
            #if commodity_index >= 3:
            #    break # dump leading 3 only because we're still debugging

        except Exception:
            print("Parse commodity fails ...")
            traceback.print_exc()
        finally:
            pass  # nothing to clean up per commodity
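
A minimal sketch of how this parser might be driven, assuming commodity_index is the module-level counter the function updates; the category names and the dump path below are hypothetical:

commodity_index = 0  # hypothetical module-level counter consumed by the function

# Hypothetical invocation: 'food'/'snacks' and the file path are made up for illustration
sixlucky_parse_list_file('food', 'snacks', 'List/2/page_0.html')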
Example #2
import random
import time

import sixlucky_parse_pages  # project-local page parser used by both examples


def sixlucky_dump_sub_category(url, main_cat, sub_cat):

    print("\n==> Start to parse category url: %s, main_cat: %s, sub_cat: %s ..." % (url, main_cat, sub_cat))

    global sixlucky_working_directory
    i = 0

    # Category with Referer
    #url = 'http://iqc.com.tw/List/2/'

    # Return here for debugging
    #return
    print("List is %d" % url.find("List"))
    sixlucky_working_directory = url[url.find("List"):]

    html_text = sixlucky_dump_list_page(url, main_cat=main_cat, sub_cat=sub_cat, index=i)

    # Parse the html page
    if not html_text:
        print("Cannot get list page html_text, len = 0")
        return

    ret = sixlucky_parse_pages.parse_list_page(html_text, True, ret={})

    print("ret['total_product_counts'] = %d" % ret['total_product_counts'])

    # Calculate the number of pages in this category (18 products per page),
    # capped at 10 pages
    page_count = 0
    if ret['total_product_counts'] > 0:
        page_count = ret['total_product_counts'] // 18 + 1

    if page_count > 10:
        page_count = 10

    # Fetch pages 2 through the end (page 1 was fetched above), sleeping a
    # random interval between requests to avoid hammering the server
    for i in range(1, page_count):
        time.sleep(random.random() * 10)
        url = "http://www.6lucky.com.tw/showroom/" + ret["list_link"][i]
        html_text = sixlucky_dump_list_page(url, main_cat=main_cat, sub_cat=sub_cat, index=i)
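
A minimal usage sketch, assuming a category URL with the same "List/<n>/" shape as the commented-out example inside the function; the category names here are hypothetical:

sixlucky_working_directory = ''  # module-level global that the function overwrites

# Hypothetical invocation; 'food'/'snacks' are made up for illustration
sixlucky_dump_sub_category('http://www.6lucky.com.tw/showroom/List/2/', 'food', 'snacks')

Capping page_count at 10 and sleeping a random 0-10 s between fetches keeps each category crawl bounded and rate-limited.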