def cat_parser():
    """Download the category page and save one record per category found.

    The page at ``cat_url`` is fetched with ``http_call`` and its body is
    scanned with ``cat_pattern``; every match is expected to unpack into a
    ``(cat_id, cat_name)`` pair.  Each pair is turned into a
    ``tb_20130112_category_info`` object and saved.

    A failure while building or saving a single category is printed as a
    traceback and does not stop the remaining categories.  A failure while
    fetching the page itself is not caught here and propagates to the caller.
    """
    page_text = read_body(http_call(cat_url))
    for cat_id, cat_name in cat_pattern.findall(page_text):
        try:
            record = tb_20130112_category_info(cat_id=cat_id, cat_name=cat_name)
            print(cat_id)
            record.save()
        except Exception:
            # Best-effort crawl: report the problem and move on.
            print(traceback.format_exc())
def suppliers_outer_parser(outer_text):
    """Extract suppliers from a listing page and save one record for each.

    ``outer_text`` is scanned with ``suppliers_outer_pattern``; every match
    is expected to unpack into ``(supplier_id, url, region)``.  For each
    supplier the page ``url + "cominfo.html"`` is fetched, its body is handed
    to ``suppliers_inner_parser`` (which must return a mapping of additional
    fields), and the combined fields are saved as a
    ``tb_20130112_supplies_info`` object.

    Args:
        outer_text: Text of the supplier listing page.

    Any exception raised while handling one supplier is printed as a
    traceback and the loop continues with the next supplier.
    """
    for supplier_id, url, region in suppliers_outer_pattern.findall(outer_text):
        try:
            resp = http_call(url + "cominfo.html")
            # Start from a fresh dict for every supplier.  Reusing one dict
            # across iterations would let fields parsed for a previous
            # supplier leak into this record whenever the detail page of the
            # current supplier lacks them.
            suppliers_infos = {
                'supplier_id': supplier_id,
                'region': region,
            }
            suppliers_infos.update(suppliers_inner_parser(read_body(resp)))
            suppliers_info = tb_20130112_supplies_info(**suppliers_infos)
            print(url)
            suppliers_info.save()
            # Progress message: "already fetched <supplier_id>".
            print('已经抓取 ' + supplier_id)
        except Exception:
            # Best-effort crawl: report the problem and move on.
            print(traceback.format_exc())
def suppliers_worker():
    """Drain ``suppliers_queue``, crawling one supplier listing page per item.

    For every URL taken from the queue the page is fetched; if
    ``next_page_pattern`` finds a link, the first match is appended to
    ``BASE_URL`` and put back on the queue so that it gets crawled as well.
    The page text is then handed to ``suppliers_outer_parser``.

    Exceptions raised while handling a page are printed as a traceback and
    do not stop the worker.  ``task_done()`` is called for every iteration,
    and the worker yields to other greenlets after each one.  The loop ends
    as soon as the queue is observed to be empty.
    """
    while not suppliers_queue.empty():
        try:
            page_url = suppliers_queue.get()
            print(page_url)
            page_text = read_body(http_call(page_url))
            next_links = next_page_pattern.findall(page_text)
            if next_links:
                # Schedule the following listing page before parsing this one.
                page_url = BASE_URL + next_links[0]
                print('Next: ' + page_url)
                suppliers_queue.put(page_url)
            suppliers_outer_parser(page_text)
        except Exception:
            print(traceback.format_exc())
        finally:
            suppliers_queue.task_done()
        # Zero-length sleep: cooperative yield so other greenlets can run.
        gevent.sleep(0.0)
def products_outer_parser(outer_text):
    """Extract products from a listing page and save one record for each.

    ``outer_text`` is scanned with ``products_outer_pattern``; every match
    is expected to unpack into
    ``(products_id, url, name, cas_id, suppliers_url_id)``.  For each product
    the page ``BASE_URL + url`` is fetched, its body is handed to
    ``products_inner_parser`` (which must return a mapping of additional
    fields), and the combined fields are saved as a
    ``tb_20130112_products_info`` object.

    Args:
        outer_text: Text of the product listing page.

    Any exception raised while handling one product is printed as a
    traceback and the loop continues with the next product.
    """
    for products_id, url, name, cas_id, suppliers_url_id in \
            products_outer_pattern.findall(outer_text):
        try:
            resp = http_call(BASE_URL + url)
            # Start from a fresh dict for every product.  Reusing one dict
            # across iterations would let fields parsed for a previous
            # product leak into this record whenever the detail page of the
            # current product lacks them.
            products_infos = {
                'products_id': products_id,
                'name': name,
                'cas_id': cas_id,
                'suppliers_url_id': suppliers_url_id,
            }
            products_infos.update(products_inner_parser(read_body(resp)))
            product_record = tb_20130112_products_info(**products_infos)
            print(url)
            product_record.save()
            # Progress message: "already fetched <products_id>".
            print('已经抓取 ' + products_id)
        except Exception:
            # Best-effort crawl: report the problem and move on.
            print(traceback.format_exc())