Exemple #1
0
def cat_parser():
    """Fetch the category page and save one record per category found.

    The page at ``cat_url`` is downloaded via ``http_call``/``read_body`` and
    scanned with ``cat_pattern``; every ``(cat_id, cat_name)`` match is passed
    as keyword arguments to ``tb_20130112_category_info`` and ``save()`` is
    called on the result.

    A failure while building or saving one record is printed as a traceback
    and does not stop the remaining categories.  Errors raised while fetching
    the page itself are not caught here.
    """
    page_text = read_body(http_call(cat_url))
    for cat_id, cat_name in cat_pattern.findall(page_text):
        try:
            record = tb_20130112_category_info(cat_id=cat_id,
                                               cat_name=cat_name)
            print(cat_id)
            record.save()
        except Exception:
            # Best effort: report the failure and move on to the next match.
            print(traceback.format_exc())
Exemple #2
0
def suppliers_outer_parser(outer_text):
    """Parse a supplier listing page and save one record per supplier found.

    Every match of ``suppliers_outer_pattern`` yields
    ``(supplier_id, url, region)``.  For each match the supplier's
    ``cominfo.html`` page (``url + "cominfo.html"``) is fetched, parsed by
    ``suppliers_inner_parser``, merged with the listing fields and passed as
    keyword arguments to ``tb_20130112_supplies_info``, whose ``save()`` is
    then called.

    A failure for one supplier is printed as a traceback and does not stop
    the remaining suppliers.

    Args:
        outer_text: content of the listing page to scan.
    """
    for supplier_id, url, region in suppliers_outer_pattern.findall(outer_text):
        try:
            resp = http_call(url + "cominfo.html")
            # Build a fresh dict for every supplier: reusing one dict across
            # iterations would let fields parsed for a previous supplier leak
            # into this record whenever the inner parser omits a key.
            supplier_fields = {
                'supplier_id': supplier_id,
                'region': region,
            }
            # Updated last so values from the detail page take precedence,
            # as in the original merge order.
            supplier_fields.update(suppliers_inner_parser(read_body(resp)))
            supplier_record = tb_20130112_supplies_info(**supplier_fields)
            print(url)
            supplier_record.save()
            print('已经抓取 ' + supplier_id)
        except Exception:
            # Best effort: report the failure and move on to the next match.
            print(traceback.format_exc())
Exemple #3
0
def suppliers_worker():
    """Drain ``suppliers_queue``, scraping each listing page it contains.

    For every URL taken from the queue the page is downloaded, the first
    ``next_page_pattern`` match (if any) is prefixed with ``BASE_URL`` and put
    back on the queue, and the page content is handed to
    ``suppliers_outer_parser``.

    Any exception raised while handling a URL is printed as a traceback; the
    loop then continues.  ``task_done()`` is called once per loop iteration
    and the worker yields with ``gevent.sleep(0.0)`` before checking the
    queue again.  The function returns once the queue reports empty.
    """
    while True:
        if suppliers_queue.empty():
            break
        try:
            page_url = suppliers_queue.get()
            print(page_url)
            page_text = read_body(http_call(page_url))
            next_links = next_page_pattern.findall(page_text)
            if next_links:
                following_url = BASE_URL + next_links[0]
                print('Next: ' + following_url)
                # Enqueue the follow-up page before parsing this one.
                suppliers_queue.put(following_url)
            suppliers_outer_parser(page_text)
        except Exception:
            print(traceback.format_exc())
        finally:
            suppliers_queue.task_done()
            gevent.sleep(0.0)
Exemple #4
0
def products_outer_parser(outer_text):
    """Parse a product listing page and save one record per product found.

    Every match of ``products_outer_pattern`` yields
    ``(products_id, url, name, cas_id, suppliers_url_id)``.  For each match
    the detail page at ``BASE_URL + url`` is fetched, parsed by
    ``products_inner_parser``, merged with the listing fields and passed as
    keyword arguments to ``tb_20130112_products_info``, whose ``save()`` is
    then called.

    A failure for one product is printed as a traceback and does not stop
    the remaining products.

    Args:
        outer_text: content of the listing page to scan.
    """
    matches = products_outer_pattern.findall(outer_text)
    for products_id, url, name, cas_id, suppliers_url_id in matches:
        try:
            resp = http_call(BASE_URL + url)
            # Build a fresh dict for every product: reusing one dict across
            # iterations would let fields parsed for a previous product leak
            # into this record whenever the inner parser omits a key.
            product_fields = {
                'products_id': products_id,
                'name': name,
                'cas_id': cas_id,
                'suppliers_url_id': suppliers_url_id,
            }
            # Updated last so values from the detail page take precedence,
            # as in the original merge order.
            product_fields.update(products_inner_parser(read_body(resp)))
            product_record = tb_20130112_products_info(**product_fields)
            print(url)
            product_record.save()
            print('已经抓取 ' + products_id)
        except Exception:
            # Best effort: report the failure and move on to the next match.
            print(traceback.format_exc())