Esempio n. 1
0
    def PRODUCT_REVIEW_SCRAPE(self):
        """Scrape JD.com reviews for every known product id and store them.

        For each product id returned by get_review_productId(), fetches the
        first 20 pages of the JSON review endpoint, parses each page with
        reviews_analysis, and inserts every parsed review into the database.
        Errors are reported to stdout; total elapsed time is printed at the end.
        """
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # recommended replacement for elapsed-time measurement.
        start_time = time.perf_counter()
        try:
            item_product_id = get_review_productId()
            product_ids = item_product_id.get_review_ProductId()
            for product_id in product_ids:
                print(product_id[0])
                for page in range(20):
                    latest_url = (
                        "https://sclub.jd.com/comment/productPageComments.action"
                        "?productId=" + product_id[0]
                        + "&score=0&sortType=5&page=" + str(page)
                        + "&pageSize=10"
                    )
                    html = get_text()
                    html_1 = html.get_json_text(latest_url)
                    if not html_1:
                        continue
                    processor = reviews_analysis()
                    jsontext = processor.process(html_1, latest_url)
                    if not jsontext:
                        continue
                    # Connect to the database
                    db = DB_connection('localhost', 3306, 'root',
                                       '123123', 'All_Scraper', 'utf8')
                    conn = db.connects()
                    try:
                        mapper = DB_operation(conn)
                        for jdreview in jsontext:
                            sql_reviews_insert = reviews_insert_sql_joint(jdreview)
                            print(sql_reviews_insert)
                            mapper.insert(sql_reviews_insert)
                        conn.commit()
                    finally:
                        # Close the connection even if an insert fails,
                        # so a single bad row cannot leak connections.
                        conn.close()
        except Exception as err:
            print(err)
        all_time = time.perf_counter() - start_time
        print("total time is:", all_time)
Esempio n. 2
0
 def get_review_ProductId(self):
     """Return all product SKUs from the All_Scraper.jd_keywords table.

     Returns the row sequence produced by DB_operation.select(), or None
     if the database access fails (the error is printed to stdout).
     """
     try:
         # Connect to the database
         db = DB_connection('localhost', 3306, 'root', '123123', 'All_Scraper', 'utf8')
         conn = db.connects()
         try:
             mapper = DB_operation(conn)
             # Pull the SKUs used to build review URLs.
             sql_get_ProductId_url = "SELECT sku FROM All_Scraper.jd_keywords;"
             product_ids = mapper.select(sql_get_ProductId_url)
             conn.commit()
             return product_ids
         finally:
             # Always release the connection, even when select() raises.
             conn.close()
     except Exception as err:
         print(err)
Esempio n. 3
0
 def get_keywords(self):
     """Return up to 100 search keywords from All_Scraper.search_keywords.

     Returns the row sequence produced by DB_operation.select(), or None
     if the database access fails (the error is printed to stdout).
     """
     try:
         # Connect to the database
         db = DB_connection('localhost', 3306, 'root', '123123', 'All_Scraper', 'utf8')
         conn = db.connects()
         try:
             mapper = DB_operation(conn)
             # Pull the keywords that drive the search scraper.
             sql_get_keywords = "SELECT keyword FROM All_Scraper.search_keywords limit 100;"
             keywords = mapper.select(sql_get_keywords)
             conn.commit()
             return keywords
         finally:
             # Always release the connection, even when select() raises.
             conn.close()
     except Exception as err:
         print(err)
Esempio n. 4
0
 def get_sellerpage_Url(self):
     """Return all shop ids from the All_Scraper.jd_products table.

     The shop ids are later interpolated into
     https://mall.jd.com/shopLevel-<shop_id>.html by the seller-page scraper.
     Returns the row sequence produced by DB_operation.select(), or None
     if the database access fails (the error is printed to stdout).
     """
     try:
         # Connect to the database
         db = DB_connection('localhost', 3306, 'root', '123123', 'All_Scraper', 'utf8')
         conn = db.connects()
         try:
             mapper = DB_operation(conn)
             sql_get_sellerpage_Url = "SELECT shop_id FROM All_Scraper.jd_products;"
             sellerpage_url_rows = mapper.select(sql_get_sellerpage_Url)
             conn.commit()
             return sellerpage_url_rows
         finally:
             # Always release the connection, even when select() raises.
             conn.close()
     except Exception as err:
         print(err)
Esempio n. 5
0
 def SELLER_PAGE_SCRAPE(self):
     """Scrape the JD shop-level page for every known shop id and store it.

     Logs in to JD, then for each shop id from get_sellerpage_Url() fetches
     https://mall.jd.com/shopLevel-<shop_id>.html, parses it with
     seller_page_analysis, and inserts the rows into jd_seller_page.
     Errors are reported to stdout.
     """
     try:
         # shop_ids used to build https://mall.jd.com/shopLevel-<shop_id>.html
         item_sellerpage = get_Seller_Page_Url()
         shop_id_rows = item_sellerpage.get_sellerpage_Url()
         # Log in to JD first; the shop page requires an authenticated session.
         login = login_jd()
         login.login_JD()
         for sellerpage in shop_id_rows:
             try:
                 print(sellerpage[0])
                 if sellerpage[0] is None:
                     continue
             except Exception:
                 # Malformed row (e.g. not subscriptable) — skip it.
                 continue
             # Build the seller-page URL for this shop id.
             sellerpage_url = ("https:" + "//mall.jd.com/shopLevel-"
                               + sellerpage[0] + ".html")
             # Fetch the page source.
             html = get_text()
             html_1 = html.get_html_text_1(sellerpage_url)
             if not html_1:
                 continue
             # Parse the page and extract the seller fields.
             processor = seller_page_analysis()
             sellerpage_text = processor.process(html_1, sellerpage_url,
                                                 sellerpage[0])
             if not sellerpage_text:
                 continue
             # Connect to the database
             db = DB_connection('localhost', 3306, 'root',
                                '123123', 'All_Scraper', 'utf8')
             conn = db.connects()
             try:
                 mapper = DB_operation(conn)
                 for row in sellerpage_text:
                     # Insert into the jd_seller_page table.
                     sql_sellerpage_insert = sellerpage_insert_sql_joint(row)
                     print(sql_sellerpage_insert)
                     mapper.insert(sql_sellerpage_insert)
                 conn.commit()
             finally:
                 # Always close the connection, even when an insert fails.
                 conn.close()
     except Exception as err:
         print(err)
Esempio n. 6
0
 def PRODUCTS_SCRAPE(self):
     """Scrape the JD product detail page for every known SKU and store it.

     Logs in to JD, then for each SKU from get_Product_Url() fetches
     https://item.jd.com/<sku>.html, parses it with products_analysis, and
     inserts the product row plus one row PER image URL into the database.
     Per-SKU failures are printed and skipped so one bad page cannot abort
     the whole run; total elapsed time is printed at the end.
     """
     # time.clock() was removed in Python 3.8; perf_counter() is the
     # recommended replacement for elapsed-time measurement.
     start_time = time.perf_counter()
     try:
         # SKUs from jd_keywords, used to build the product-page URLs.
         item_product_sku = get_product_url()
         sku_rows = item_product_sku.get_Product_Url()
         # Log in to JD first; product pages require an authenticated session.
         login = login_jd()
         login.login_JD()
         for product_sku in sku_rows:
             print(product_sku[0])
             url = "https://item.jd.com/" + str(product_sku[0]) + ".html"
             # Fetch the page source for this SKU.
             html = get_text()
             html_1 = html.get_html_text_1(url)
             try:
                 if not html_1:
                     continue
                 # Parse the page into product records.
                 processor = products_analysis()
                 data = processor.process(html_1, product_sku[0])
                 if not data:
                     # Open the connection only when there is something to
                     # insert; the original leaked a connection here.
                     continue
                 # Connect to the database
                 db = DB_connection('localhost', 3306, 'root', '123123',
                                    'All_Scraper', 'utf8')
                 conn = db.connects()
                 try:
                     mapper = DB_operation(conn)
                     for data_1 in data:
                         # Insert the product row into jd_products.
                         sql_jd_products = products_insert_sql_joint(data_1)
                         print(sql_jd_products)
                         mapper.insert(sql_jd_products)
                         for img in data_1['image_url']:
                             img_url = "https:" + img
                             sql_jd_product_image = products_image_insert_sql_joint(
                                 data_1, img_url)
                             # BUG FIX: insert inside the loop — the original
                             # inserted only the LAST image of each product
                             # (and raised NameError when image_url was empty).
                             print(sql_jd_product_image)
                             mapper.insert(sql_jd_product_image)
                     conn.commit()
                 finally:
                     # Always close the connection, even when an insert fails.
                     conn.close()
             except Exception as err:
                 print(err)
     except Exception as err:
         print(err)
     all_time = time.perf_counter() - start_time
     print("total time is:", all_time)