def temp(table):
    """Ensure every shop id referenced by ``computer`` has a row in ``shop``.

    Reads all non-null ``shop_id`` values from the ``computer`` table and, for
    each one that has no matching row in ``shop``, inserts a row carrying only
    that id.

    Args:
        table: Not used by this function; the source table name is hard-coded
            to ``computer``.
    """
    query = 'select shop_id from computer where shop_id is not null'
    # The second element of the search result is used as the row collection.
    rows = list(database_util.search_sql(query, None))[1]
    for row in rows:
        shop_id = row[0]
        lookup = database_util.search_sql(
            'select count(*) from shop where shop_id=%s', shop_id)
        existing = list(lookup[1])[0][0]
        if existing != 0:
            continue
        database_util.update_sql(
            'insert into shop(shop_id) values(%s)', shop_id)
def get_shop_id(thread_name, queue, table):
    """Worker loop: resolve the shop id of each queued SKU and store it.

    Runs until the module-level ``EXIT_FLAG`` becomes truthy. On every pass it
    takes one SKU from the queue (guarded by ``QUEUE_LOCK``), downloads the
    mobile product page, extracts the shop id and writes it to ``table``.

    Args:
        thread_name: Label printed in front of progress messages.
        queue: Queue the SKUs are taken from. Emptiness is checked on the
            module-level ``WORK_QUEUE``, so this is presumably the same object.
        table: Name of the product table whose ``shop_id`` column is updated.
    """
    while not EXIT_FLAG:
        sku = None
        got_item = False
        QUEUE_LOCK.acquire()
        try:
            if not WORK_QUEUE.empty():
                sku = queue.get()
                got_item = True
        except Exception as err:
            print(err)
        finally:
            # Always give the lock back, even if taking the item failed;
            # otherwise every other worker would block forever.
            QUEUE_LOCK.release()
        if got_item:
            try:
                _save_shop_id(thread_name, sku, table)
            except Exception as err:
                print(err)
        # NOTE(review): the pause is assumed to apply to every iteration, not
        # only to idle ones - confirm against the intended crawl rate.
        time.sleep(1)


def _save_shop_id(thread_name, sku, table):
    """Fetch one product page, extract its shop id and persist it."""
    url = 'https://item.m.jd.com/product/' + sku + '.html'
    print(thread_name, url)
    html_data = jd_spider.Spider().get_html(url)
    if html_data[0] == -1:
        # Nothing was downloaded, so there is nothing to parse. Returning here
        # prevents a result left over from another SKU from being stored.
        return
    result = html_analysis.get_shop_id(html_data[1])
    if result[0] == -1:
        return
    shop_id = result[1]
    print("%s: shop_id %s" % (thread_name, shop_id))
    sql = 'update ' + table + ' set shop_id=%s where sku=%s '
    database_util.update_sql(sql, [shop_id, sku])
    # Register the shop if it is not known yet.
    lookup = database_util.search_sql(
        'select count(*) from shop where shop_id=%s', shop_id)
    if list(lookup[1])[0][0] == 0:
        database_util.update_sql(
            'insert into shop(shop_id) values(%s)', shop_id)
def get_brand(table):
    """Append brand names found in ``table`` to the training dictionary file.

    Every distinct brand of the form ``name(alias)`` is split into the text
    before the first ``(`` and the text between that ``(`` and the following
    ``)``; both parts are appended, one per line, to
    ``FILE_PATH + 'train_files/dictionary.txt'``. Afterwards
    ``file_util.del_duplicate`` is run on that file.

    Brands that are NULL, empty, or contain no ``(`` are skipped, so one bad
    row no longer aborts the export and no previously parsed pair is written
    a second time.

    Args:
        table: Name of the table whose ``brand`` column is read.
    """
    dictionary = FILE_PATH + 'train_files/dictionary.txt'
    with open(dictionary, "a", encoding='utf-8') as out:
        try:
            sql = 'select distinct brand from ' + table
            result = list(database_util.search_sql(sql, None))
            # A first element of -1 is treated as "no usable rows".
            if result[0] != -1:
                for row in result[1]:
                    brand = row[0]
                    if not brand:
                        continue
                    line = brand.strip()
                    if '(' not in line:
                        continue
                    parts = line.split('(')
                    brand1 = parts[0]
                    brand2 = parts[1].split(')')[0]
                    out.write(brand1 + '\n' + brand2 + '\n')
                    print(brand1, brand2)
        except Exception as err:
            print('tran_util get_brand err: %s' % (str(err)))
    # The file is closed at this point, so it can be rewritten safely.
    file_util.del_duplicate(dictionary)
def update_shop_info(table):
    """Refresh stale shops, then copy follower counts into ``brand_hot``.

    First, the ids of all shops whose ``update_time`` is at least one day old
    are queued and the ``update_shop_info`` threads are started. Then, for
    each ``(brand, follow)`` row selected from ``table``, ``brand_hot`` is set
    to that ``follow`` value for all rows of the brand.

    Args:
        table: Name of the product table to update.
    """
    stale_sql = 'SELECT shop_id FROM shop where TO_DAYS(NOW()) - TO_DAYS(update_time) >=1'
    outcome = database_util.search_sql(stale_sql, None)
    # A first element of -1 is treated as "no usable rows".
    shop_ids = [row[0] for row in outcome[1]] if outcome[0] != -1 else []
    thread_queue.fill_queue(shop_ids)
    thread_queue.use_threading(['update_shop_info', table])

    brand_sql = 'select brand,follow from ' + table + ' group by brand order by follow'
    outcome = database_util.search_sql(brand_sql, None)
    if outcome[0] == -1:
        return
    update_stmt = 'update ' + table + ' set brand_hot=%s where brand=%s'
    for row in list(outcome[1]):
        brand = row[0]
        follow = row[1]
        database_util.update_sql(update_stmt, [follow, brand])
def get_sku(table):
    """Derive the SKU of every row in ``table`` from its product URL.

    The SKU is the last path segment of the URL with a trailing ``.html``
    removed, e.g. ``https://item.jd.com/12345.html`` gives ``12345``. The
    value is written to the row's ``sku`` column.

    ``str.strip`` is deliberately not used with the URL prefix or suffix: it
    removes a *set of characters* from both ends, which would also eat leading
    or trailing SKU characters such as ``t``, ``m`` or ``l``.

    Args:
        table: Name of the table holding the ``url``, ``id`` and ``sku``
            columns.
    """
    suffix = '.html'
    result = database_util.search_sql('select url,id from ' + table, None)
    # A first element of -1 is treated as "no usable rows".
    if result[0] == -1:
        return
    update_stmt = 'update ' + table + ' set sku=%s where id=%s'
    for row in list(result[1]):
        url = row[0]
        row_id = row[1]
        if url is None:
            # No URL to derive a SKU from; keep going with the other rows.
            continue
        sku = url.strip().rsplit('/', 1)[-1]
        if sku.endswith(suffix):
            sku = sku[:-len(suffix)]
        database_util.update_sql(update_stmt, [sku, row_id])
def get_sentiment_score(table):
    """Queue SKUs for sentiment scoring and start the worker threads.

    Selects the SKU of every row of ``table`` whose ``update_unreal_time`` is
    not null, fills the work queue with them and launches the
    ``get_sentiment_score`` threads.

    Args:
        table: Name of the product table to read from.
    """
    query = 'select sku from ' + table + ' where update_unreal_time is not null'
    outcome = database_util.search_sql(query, None)
    # A first element of -1 is treated as "no usable rows": queue stays empty.
    pending = [row[0] for row in outcome[1]] if outcome[0] != -1 else []
    thread_queue.fill_queue(pending)
    thread_queue.use_threading(['get_sentiment_score', table])
def unify_brand(table):
    """Rename the brand ``360手机`` to ``360`` for every matching product.

    Each affected SKU is updated individually and printed afterwards.

    Args:
        table: Name of the product table to update.
    """
    select_stmt = 'select sku,brand from ' + table + ' where brand=%s'
    outcome = database_util.search_sql(select_stmt, '360手机')
    # A first element of -1 is treated as "no usable rows".
    if outcome[0] == -1:
        return
    update_stmt = 'update ' + table + ' set brand=%s where sku=%s'
    for row in list(outcome[1]):
        database_util.update_sql(update_stmt, ['360', row[0]])
        print(row[0])
def get_param(table):
    """Purge low-follower products, then queue the rest for parameter fetching.

    Rows of ``table`` whose shop has ``follow`` below 10000 are deleted first.
    The URLs of all remaining rows with a null ``update_time`` are then put on
    the work queue and the ``get_param`` threads are started.

    Args:
        table: Name of the product table to clean up and read from.
    """
    # Delete the products that belong to shops with few followers.
    purge_stmt = 'delete from '+table+' where sku in (select a.sku from (select a.sku from '+table+' a,shop b where a.shop_id=b.shop_id and b.follow<10000) a)'
    database_util.update_sql(purge_stmt,None)

    pending_stmt = 'SELECT url FROM '+table+' where update_time is null'
    rows = list(database_util.search_sql(pending_stmt, None)[1])
    thread_queue.fill_queue([row[0] for row in rows])
    thread_queue.use_threading(['get_param',table])
def get_shop_id(table):
    """Queue every SKU that has no shop id yet and start the worker threads.

    Rows whose ``sku`` itself is null are reported with a message and left out
    of the queue.

    Args:
        table: Name of the product table to read from.
    """
    query = 'SELECT sku FROM ' + table + ' where shop_id is null'
    outcome = database_util.search_sql(query, None)
    pending = []
    # A first element of -1 is treated as "no usable rows": queue stays empty.
    if outcome[0] != -1:
        for row in list(outcome[1]):
            if row[0] is None:
                print("sku is null")
            else:
                pending.append(row[0])
    thread_queue.fill_queue(pending)
    thread_queue.use_threading(['get_shop_id', table])
def get_shop_info():
    """Queue shops that were never updated and start the worker threads.

    Selects the ``shop_id`` of every ``shop`` row with a null ``update_time``;
    null ids are reported with a message and skipped.
    """
    query = 'SELECT shop_id FROM shop where update_time is null'
    outcome = database_util.search_sql(query, None)
    pending = []
    # A first element of -1 is treated as "no usable rows": queue stays empty.
    if outcome[0] != -1:
        for row in list(outcome[1]):
            if row[0] is None:
                print("shop_id is null")
            else:
                pending.append(row[0])
    thread_queue.fill_queue(pending)
    # NOTE(review): ``table`` is neither a parameter nor a local of this
    # function; unless a module-level ``table`` exists this raises NameError.
    thread_queue.use_threading(['update_shop_info',table])
def del_file(table):
    """Report comment files whose SKU no longer exists in ``table``.

    Walks the ``item_comments`` and ``useful_comments`` folders below
    ``DATA_PATH + table``. The SKU of each file is the part of its name before
    the first ``.``. When the lookup succeeds but returns no rows, a message
    is printed. Despite the function name and the message text, no file is
    removed here.

    Args:
        table: Product table name, also used as the data sub-folder name.
    """
    folders = [
        DATA_PATH + table + '/item_comments/',
        DATA_PATH + table + '/useful_comments/',
    ]
    query = 'select shop_name from ' + table + ' where sku=%s'
    for folder in folders:
        for file_name in os.listdir(folder):
            # partition() keeps the whole name when it contains no dot;
            # slicing with find() would silently drop the last character.
            sku = file_name.partition('.')[0]
            result = database_util.search_sql(query, sku)
            if result[0] != -1 and len(result[1]) == 0:
                print('deleted sku:%s' % (sku))
def get_comment(table):
    """Queue SKUs whose comments were never fetched and start the workers.

    Selects every ``sku`` of ``table`` with a null ``update_comment_time``;
    null SKUs are reported with a message and skipped.

    Args:
        table: Name of the product table to read from.
    """
    query = 'SELECT sku FROM ' + table + ' where update_comment_time is null'
    outcome = database_util.search_sql(query, None)
    pending = []
    # A first element of -1 is treated as "no usable rows": queue stays empty.
    if outcome[0] != -1:
        for row in list(outcome[1]):
            if row[0] is None:
                print("sku is null")
            else:
                pending.append(row[0])
    thread_queue.fill_queue(pending)
    # The third list entry is the number of comment pages to fetch.
    thread_queue.use_threading(['get_comment', table, 100])
def update_price(table):
    """Queue products whose price data is at least a day old.

    For every matching row a dict with the keys ``sku``, ``max_price``,
    ``min_price``, ``avg_price`` (floats) and ``price_times`` (int) is put on
    the work queue, then the ``update_price`` threads are started.

    Args:
        table: Name of the product table to read from.
    """
    query = 'SELECT sku,max_price,min_price,avg_price,price_times FROM ' + table + ' where TO_DAYS(NOW()) - TO_DAYS(update_price_time) >=1'
    outcome = database_util.search_sql(query, None)
    prices = []
    # A first element of -1 is treated as "no usable rows": queue stays empty.
    if outcome[0] != -1:
        for row in list(outcome[1]):
            prices.append({
                'sku': row[0],
                'max_price': float(row[1]),
                'min_price': float(row[2]),
                'avg_price': float(row[3]),
                'price_times': int(row[4]),
            })
    thread_queue.fill_queue(prices)
    thread_queue.use_threading(['update_price', table])
def update_img(table):
    """Rewrite thumbnail image URLs of ``table`` to the ``n7/jfs`` variant.

    Example of the rewrite:
        https://img11.360buyimg.com/n5/s54x54_jfs/t5773/143/1465870132/216483/4bbce005/592692d8Nbcc8f248.jpg
        becomes a URL shaped like
        https://img10.360buyimg.com/n7/jfs/t18772/89/1863054684/170815/d28ecae1/5adca3deN76bb61cb.jpg

    Every row is written back, whether or not its URL changed; the old and the
    new URL are printed for each row.

    Args:
        table: Name of the product table to update.
    """
    outcome = database_util.search_sql('select img,sku from ' + table, None)
    # A first element of -1 is treated as "no usable rows".
    if outcome[0] == -1:
        return
    update_stmt = 'update ' + table + ' set img=%s where sku=%s'
    for row in list(outcome[1]):
        old_img = row[0]
        sku = row[1]
        print(old_img)
        new_img = old_img.replace('n5/s54x54_jfs', 'n7/jfs')
        print(new_img + '\n')
        database_util.update_sql(update_stmt, [new_img, sku])
def insert_url(thread_name, queue, table):
    """Worker loop: insert each queued URL into ``table`` unless present.

    Runs until the module-level ``EXIT_FLAG`` becomes truthy. One URL is taken
    from the queue per pass (guarded by ``QUEUE_LOCK``); a row is inserted
    only when no row with that URL exists yet.

    Args:
        thread_name: Worker label; not used inside this function.
        queue: Queue the URLs are taken from. Emptiness is checked on the
            module-level ``WORK_QUEUE``, so this is presumably the same object.
        table: Name of the table the URLs are inserted into.
    """
    while not EXIT_FLAG:
        url = None
        got_item = False
        QUEUE_LOCK.acquire()
        try:
            if not WORK_QUEUE.empty():
                url = queue.get()
                got_item = True
        except Exception as err:
            print('thread_queue insert_url err:' + str(err))
        finally:
            # Always give the lock back, even if taking the item failed;
            # otherwise every other worker would block forever.
            QUEUE_LOCK.release()
        if got_item:
            try:
                lookup = database_util.search_sql(
                    'select count(*) url from ' + table + ' where url=%s',
                    url)
                if list(lookup[1])[0][0] == 0:
                    sql = 'insert into ' + table + ' set url=%s'
                    database_util.update_sql(sql, url)
            except Exception as err:
                print('thread_queue insert_url err:' + str(err))
        # NOTE(review): the pause is assumed to apply to every iteration, not
        # only to idle ones - confirm against the intended throughput.
        time.sleep(1)
def update_score(thread_name, queue, table, para):
    """Worker loop: recompute the weighted score of each queued SKU.

    Runs until the module-level ``EXIT_FLAG`` becomes truthy. For every SKU
    taken from the queue the score is

        rate * 100 * w_rate + follow * w_follow + comment * w_comment
        + sentiment * w_sentiment + brand_hot * w_brand

    rounded to two decimals and written to the ``score`` column.

    Args:
        thread_name: Worker label; not used inside this function.
        queue: Queue the SKUs are taken from. Emptiness is checked on the
            module-level ``WORK_QUEUE``, so this is presumably the same object.
        table: Name of the product table to read from and update.
        para: Mapping with the weights ``w_rate``, ``w_follow``,
            ``w_comment``, ``w_sentiment`` and ``w_brand``.
    """
    while not EXIT_FLAG:
        sku = None
        got_item = False
        QUEUE_LOCK.acquire()
        try:
            if not WORK_QUEUE.empty():
                sku = queue.get()
                got_item = True
        except Exception as err:
            print('thread_queue update_score err:' + str(err))
        finally:
            # Always give the lock back, even if taking the item failed;
            # otherwise every other worker would block forever.
            QUEUE_LOCK.release()
        if got_item:
            try:
                # Weights are read per item so that a missing key is reported
                # through the handler below instead of ending the thread.
                w_rate = para['w_rate']
                w_follow = para['w_follow']
                w_comment = para['w_comment']
                w_sentiment = para['w_sentiment']
                w_brand = para['w_brand']
                sql = 'select sku,rate,follow,comment,sentiment,brand_hot from ' + table + ' where sku=%s'
                result = database_util.search_sql(sql, sku)
                # A first element of -1 is treated as "no usable rows".
                if result[0] != -1:
                    update_stmt = 'update ' + table + ' set score=%s where sku=%s'
                    for row in list(result[1]):
                        row_sku = row[0]
                        rate = float(row[1]) * 100
                        follow = int(row[2])
                        comment = int(row[3])
                        sentiment = int(row[4])
                        brand_hot = int(row[5])
                        score = round(
                            (rate * w_rate + follow * w_follow +
                             comment * w_comment + sentiment * w_sentiment +
                             brand_hot * w_brand), 2)
                        database_util.update_sql(update_stmt, [score, row_sku])
            except Exception as err:
                print('thread_queue update_score err:' + str(err))
        # NOTE(review): the pause is assumed to apply to every iteration, not
        # only to idle ones - confirm against the intended throughput.
        time.sleep(1)