import random
import time
import traceback

import sixlucky_parse_pages


def sixlucky_parse_list_file(main_cat, sub_cat, list_file, db_name='test.db'):
    printf("\n==> Start to parse list file: %s ...", list_file)
    global commodity_index
    if list_file == '':
        return
    with open(list_file, 'r') as f:
        html_text = f.read()
    if len(html_text) <= 0:
        return
    ret = {}
    ret = sixlucky_parse_pages.parse_list_page(html_text, True, ret)
    list_dump = {'list_file': list_file}
    list_dump['commodities'] = []
    # Dump and parse every commodity linked from this list page.
    for commodity_url in ret['link']:
        try:
            printf('commodity_index: %d', commodity_index)
            m = {'main_cat': main_cat, 'sub_cat': sub_cat}
            m = sixlucky_dump_and_parse_commodity(commodity_url, main_cat,
                                                  sub_cat, commodity_index, m)
            list_dump['commodities'].append(m)
            commodity_index += 1
            #if i >= 3:
            #    break  # dump leading 3 only because we're still debugging
        except Exception:
            printf("Parse commodity fails ...")
            traceback.print_exc()
        finally:
            pass
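# --- Assumed module context (sketch) ---------------------------------------
# The two functions in this section lean on names defined elsewhere in this
# scraper. The stand-ins below are only a minimal sketch of the assumed
# shapes, not the project's real implementations.

def printf(fmt, *args):
    # Assumed printf-style logging helper used throughout this module.
    print(fmt % args if args else fmt)

commodity_index = 0  # assumed module-level running index over dumped commodities
sixlucky_working_directory = ''  # assumed global, e.g. 'List/2/'

def sixlucky_dump_and_parse_commodity(url, main_cat, sub_cat, index, m):
    # Assumed helper: fetch one commodity page, parse it, and return the
    # enriched dict m (sketch only).
    return m

def sixlucky_dump_list_page(url, main_cat, sub_cat, index):
    # Assumed helper: fetch one list page and return its HTML text
    # (sketch only).
    return ''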
def sixlucky_dump_sub_category(url, main_cat, sub_cat):
    printf("\n==> Start to parse category url: %s, main_cat: %s, sub_cat: %s ...",
           url, main_cat, sub_cat)
    global sixlucky_working_directory
    i = 0
    # Category with Referer
    #url = 'http://iqc.com.tw/List/2/'
    # Return here for debugging
    #return
    printf("List is %d", url.find("List"))
    # Working directory is the 'List/...' tail of the category url.
    sixlucky_working_directory = url[url.find("List"):]
    html_text = sixlucky_dump_list_page(url, main_cat=main_cat, sub_cat=sub_cat, index=i)
    # Parse html page
    if len(html_text) <= 0:
        print("Cannot get list page html_text, len = 0")
        return
    ret = sixlucky_parse_pages.parse_list_page(html_text, True, ret={})
    printf("ret['total_product_counts'] = %d", ret['total_product_counts'])
    # Calculate pages in this category: 18 products per list page, capped at 10 pages.
    page_count = 0
    if ret['total_product_counts'] > 0:
        page_count = ret['total_product_counts'] // 18 + 1
    if page_count > 10:
        page_count = 10
    # Fetch pages 2 ~ end (page 1 was fetched above).
    for i in range(1, page_count):
        time.sleep(random.random() * 10)  # random delay to avoid hammering the site
        url = "http://www.6lucky.com.tw/showroom/" + ret["list_link"][i]
        html_text = sixlucky_dump_list_page(url, main_cat=main_cat, sub_cat=sub_cat, index=i)
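# Example wiring (hypothetical URL and category names) showing how a sub
# category dump is assumed to be kicked off:
if __name__ == '__main__':
    sixlucky_dump_sub_category(
        'http://www.6lucky.com.tw/showroom/List/2/',  # hypothetical list URL
        'example-main-cat',  # hypothetical main category name
        'example-sub-cat')   # hypothetical sub category name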