def PRODUCT_REVIEW_SCRAPE(self):
    """Scrape JD product reviews for every known product sku and store them.

    Reads product skus via get_review_productId(), fetches up to 20 pages
    of review JSON per product, parses each page with reviews_analysis and
    inserts the resulting reviews into the All_Scraper database.
    Exceptions are logged; total elapsed time is always reported.
    """
    # BUG FIX: start_time was assigned inside the try block but read after
    # the except handler, so any early failure raised NameError instead of
    # reporting the time.  Also time.clock() is deprecated (removed in 3.8).
    start_time = time.time()
    try:
        item_ProductId = get_review_productId()
        for product_id in item_ProductId.get_review_ProductId():
            print(product_id[0])
            for page in range(20):
                latestUrl = (
                    "https://sclub.jd.com/comment/productPageComments.action"
                    "?productId=" + product_id[0]
                    + "&score=0&sortType=5&page=" + str(page)
                    + "&pageSize=10"
                )
                html_1 = get_text().get_json_text(latestUrl)
                if not html_1:
                    continue
                jsontext = reviews_analysis().process(html_1, latestUrl)
                if not jsontext:
                    continue
                # Connect to the database (one connection per parsed page).
                db = DB_connection('localhost', 3306, 'root', '123123',
                                   'All_Scraper', 'utf8')
                conn = db.connects()
                try:
                    mapper = DB_operation(conn)
                    for jdreview in jsontext:
                        sql_reviews_insert_1 = reviews_insert_sql_joint(jdreview)
                        print(sql_reviews_insert_1)
                        mapper.insert(sql_reviews_insert_1)
                    conn.commit()
                finally:
                    # BUG FIX: the original leaked the connection whenever an
                    # insert raised; close it on every path.
                    conn.close()
    except Exception as err:
        print(err)
    all_time = time.time() - start_time
    print("total time is:", all_time)
def get_review_ProductId(self):
    """Return all product skus from All_Scraper.jd_keywords.

    Returns:
        List of one-column rows produced by DB_operation.select().
        On any database error the error is printed and an empty list is
        returned (the original fell through and returned None, which
        broke iterating callers).
    """
    Product_Id = []
    try:
        # Connect to the database.
        db = DB_connection('localhost', 3306, 'root', '123123',
                           'All_Scraper', 'utf8')
        conn = db.connects()
        try:
            mapper = DB_operation(conn)
            sql_get_ProductId_url = "SELECT sku FROM All_Scraper.jd_keywords;"
            Product_Id = mapper.select(sql_get_ProductId_url)
            conn.commit()
        finally:
            # BUG FIX: close the connection even when the query raises.
            conn.close()
    except Exception as err:
        print(err)
    return Product_Id
def get_keywords(self):
    """Return up to 100 search keywords from All_Scraper.search_keywords.

    Returns:
        List of one-column rows produced by DB_operation.select().
        On any database error the error is printed and an empty list is
        returned (the original fell through and returned None, which
        broke iterating callers).
    """
    keywords = []
    try:
        # Connect to the database.
        db = DB_connection('localhost', 3306, 'root', '123123',
                           'All_Scraper', 'utf8')
        conn = db.connects()
        try:
            mapper = DB_operation(conn)
            sql_get_keywords = "SELECT keyword FROM All_Scraper.search_keywords limit 100;"
            keywords = mapper.select(sql_get_keywords)
            conn.commit()
        finally:
            # BUG FIX: close the connection even when the query raises.
            conn.close()
    except Exception as err:
        print(err)
    return keywords
def get_sellerpage_Url(self):
    """Return all shop ids from All_Scraper.jd_products.

    Returns:
        List of one-column rows produced by DB_operation.select().
        On any database error the error is printed and an empty list is
        returned (the original fell through and returned None, which
        broke iterating callers).
    """
    sellerpage_Url = []
    try:
        # Connect to the database.
        db = DB_connection('localhost', 3306, 'root', '123123',
                           'All_Scraper', 'utf8')
        conn = db.connects()
        try:
            mapper = DB_operation(conn)
            sql_get_sellerpage_Url = "SELECT shop_id FROM All_Scraper.jd_products;"
            sellerpage_Url = mapper.select(sql_get_sellerpage_Url)
            conn.commit()
        finally:
            # BUG FIX: close the connection even when the query raises.
            conn.close()
    except Exception as err:
        print(err)
    return sellerpage_Url
def SELLER_PAGE_SCRAPE(self):
    """Scrape the JD seller-level page for every known shop id and store it.

    Logs in to JD, then for each shop id builds the
    https://mall.jd.com/shopLevel-<shop_id>.html URL, fetches and parses
    the page with seller_page_analysis, and inserts the parsed rows into
    the All_Scraper database.  Errors are logged; elapsed time is
    reported (the original computed start_time but never printed it).
    """
    # time.clock() is deprecated (removed in 3.8); use time.time().
    start_time = time.time()
    try:
        # Shop ids used to build the seller-page URLs.
        item_sellerpage = get_Seller_Page_Url()
        item_sellerpage_1 = item_sellerpage.get_sellerpage_Url()
        # Log in to JD before fetching seller pages.
        login = login_jd()
        login.login_JD()
        for sellerpage in item_sellerpage_1:
            print(sellerpage[0])
            # BUG FIX: replaced the bare `except: continue` (which silently
            # swallowed every error) and `== None` with an explicit check.
            if sellerpage[0] is None:
                continue
            sellerpage_url = ("https://mall.jd.com/shopLevel-"
                              + sellerpage[0] + ".html")
            # Fetch the page source.
            html_1 = get_text().get_html_text_1(sellerpage_url)
            if not html_1:
                continue
            # Parse the page.
            processor = seller_page_analysis()
            sellerpage_text = processor.process(html_1, sellerpage_url,
                                                sellerpage[0])
            if not sellerpage_text:
                continue
            # Connect to the database.
            db = DB_connection('localhost', 3306, 'root', '123123',
                               'All_Scraper', 'utf8')
            conn = db.connects()
            try:
                mapper = DB_operation(conn)
                for row in sellerpage_text:
                    # Insert into the jd_seller_page table.
                    sql_sellerpage_insert_1 = sellerpage_insert_sql_joint(row)
                    print(sql_sellerpage_insert_1)
                    mapper.insert(sql_sellerpage_insert_1)
                conn.commit()
            finally:
                # BUG FIX: close the connection even when an insert raises.
                conn.close()
    except Exception as err:
        print(err)
    # Consistency with the other *_SCRAPE entry points: report elapsed time.
    all_time = time.time() - start_time
    print("total time is:", all_time)
def PRODUCTS_SCRAPE(self):
    """Scrape the JD product detail page for every known sku and store it.

    Logs in to JD, fetches https://item.jd.com/<sku>.html for each sku,
    parses it with products_analysis and inserts one jd_products row plus
    one jd_product_image row PER image into the All_Scraper database.
    Per-product errors are logged and the loop continues; total elapsed
    time is always reported.
    """
    # BUG FIX: start_time was assigned inside the try block but read after
    # the except handler, so an early failure raised NameError instead of
    # reporting the time.  time.clock() is also deprecated (removed in 3.8).
    start_time = time.time()
    try:
        # Skus from the jd_keywords table, used to build product URLs.
        item_product_sku = get_product_url()
        item_product_sku_1 = item_product_sku.get_Product_Url()
        # Log in to JD before fetching product pages.
        login = login_jd()
        login.login_JD()
        for product_sku in item_product_sku_1:
            print(product_sku[0])
            url = "https://item.jd.com/" + str(product_sku[0]) + ".html"
            # Fetch the page source for this sku.
            html_1 = get_text().get_html_text_1(url)
            try:
                if not html_1:
                    continue
                # Parse the product page.
                processor = products_analysis()
                data = processor.process(html_1, product_sku[0])
                if not data:
                    continue
                # Connect to the database.
                db = DB_connection('localhost', 3306, 'root', '123123',
                                   'All_Scraper', 'utf8')
                conn = db.connects()
                try:
                    mapper = DB_operation(conn)
                    for data_1 in data:
                        # One row in jd_products per parsed product.
                        sql_jd_products = products_insert_sql_joint(data_1)
                        print(sql_jd_products)
                        mapper.insert(sql_jd_products)
                        # BUG FIX: the original built the image SQL inside
                        # this loop but executed mapper.insert() after it,
                        # so only the LAST image of each product was stored.
                        # Insert every image row inside the loop.
                        for img in data_1['image_url']:
                            img_url = "https:" + img
                            sql_jd_product_image = products_image_insert_sql_joint(
                                data_1, img_url)
                            print(sql_jd_product_image)
                            mapper.insert(sql_jd_product_image)
                    conn.commit()
                finally:
                    # BUG FIX: close the connection even when an insert raises.
                    conn.close()
            except Exception as err:
                # Log and move on to the next sku.
                print(err)
    except Exception as err:
        print(err)
    all_time = time.time() - start_time
    print("total time is:", all_time)