def add_user():
    if request.method == 'POST':
        # 'RechargeMoney' appears twice on purpose here: the same value is
        # used to fill two columns (presumably recharge amount and balance).
        key_list = ['AccountName', 'Password', 'UserName', 'RechargeMoney',
                    'RechargeMoney', 'Mobile', 'IDCard', 'Star']
        item = [request.form[key] for key in key_list]
        print(MysqlManager.insert_user(item))
    return "ok"
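
# This handler reads request.form, so it must be reachable via POST. A
# minimal registration sketch -- the '/add_user' path and the `app` instance
# are assumptions, not taken from the source:
from flask import Flask, request

app = Flask(__name__)
app.add_url_rule('/add_user', view_func=add_user, methods=['GET', 'POST'])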
def process_item(self, item, spider):
    # FILE_PATH is assumed to be defined at module scope in the pipeline file.
    if 'image_urls' in item:
        dir_path = '{0}/../../LofterImageSets'.format(FILE_PATH)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for image_url in item['image_urls']:
            image_name = image_url.split('/')[-1]
            file_path = '%s/%s' % (dir_path, image_name)
            if os.path.exists(file_path):
                continue
            # Database operation: strip the '#' markers from the category
            # tags and record the photo's metadata before downloading.
            category_list = item['image_category']
            category = [cat.strip('#') for cat in category_list]
            category_tag = ','.join(category)
            image_set_name = item['query_url'][0].split('/')[-1]
            description = item['description'][0].encode('utf-8')
            # The field tuple is doubled, presumably to fill repeated
            # placeholders (e.g. INSERT ... ON DUPLICATE KEY UPDATE).
            field_item = [(image_set_name, image_name, category_tag,
                           description, image_url, item['query_url'][0]) * 2]
            MysqlManager.insert_items_into_photos(field_item)
            with open(file_path, 'wb') as handle:
                response = requests.get(image_url, stream=True)
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
    return item
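
# process_item(self, item, spider) is the standard Scrapy item-pipeline hook,
# so the class would be enabled in the project's settings.py. A minimal
# sketch -- the 'lofter.pipelines.LofterImagePipeline' path is an assumed
# name, not taken from the source:
ITEM_PIPELINES = {
    'lofter.pipelines.LofterImagePipeline': 300,  # lower value runs earlier
}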
def get_server_stream():
    result = MysqlManager.get_trade_stream(["server"])
    return ujson.dumps(result, ensure_ascii=False)
def chat(ws):
    users[ws.id] = ws
    print(ws.id)
    while True:
        msg = ws.receive()
        if msg is not None:
            print(msg)
            typex, obj, content = "", "", ""
            try:
                typex, obj, content = msg.split(',')
            except ValueError:
                pass  # malformed message: fall through with empty defaults
            if typex == '1' and obj == 'user':
                # Bind this socket to a user and push the account totals.
                global user_match_dict
                user_match_dict[ws.id] = content
                user = pass_users[content]
                total_trade = [user[0], user[2], user[3], user[3] - user[2]]
                msg = ujson.dumps({'total_trade': total_trade},
                                  ensure_ascii=False)
                users[ws.id].send(msg)
            if g_is_open:
                # Types '2' and '3' carried identical bodies in the original,
                # so they are handled together: open a position and log the
                # trade once for the user and once for the server side.
                if typex in ('2', '3'):
                    now_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
                    user_trade_info[ws.id] = [
                        now_time, pass_users[user_match_dict[ws.id]][3],
                        cur_price, cur_price, 0
                    ]
                    for side in ('user', 'server'):
                        trade_stream = [
                            now_time, pass_users[user_match_dict[ws.id]][0],
                            typex, side, cur_price, '1'
                        ]
                        MysqlManager.insert_stream_trade(trade_stream)
                    msg = ujson.dumps({'single_trade': user_trade_info[ws.id]},
                                      ensure_ascii=False)
                    users[ws.id].send(msg)
                if typex == '4':
                    # Close the position, log both sides, then push the
                    # refreshed account totals.
                    MysqlManager.update_item(
                        [user_trade_info[ws.id][-1], user_match_dict[ws.id]])
                    now_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
                    for side in ('user', 'server'):
                        trade_stream = [
                            now_time, pass_users[user_match_dict[ws.id]][0],
                            typex, side, cur_price, '1'
                        ]
                        MysqlManager.insert_stream_trade(trade_stream)
                    user = MysqlManager.get_user_by_name(
                        [user_match_dict[ws.id]])[1:]
                    total_trade = [user[0], user[2], user[3],
                                   user[3] - user[2]]
                    msg = ujson.dumps({'total_trade': total_trade},
                                      ensure_ascii=False)
                    users[ws.id].send(msg)
                    del user_trade_info[ws.id]
        else:
            break
    del users[ws.id]
    if ws.id in user_trade_info:  # fixed: original referenced undefined ws_id
        del user_trade_info[ws.id]
def get_admin():
    return MysqlManager.get_admin()[1:]
import json

from mysql_manager import MysqlManager

mysql = MysqlManager(4)

with open('videos.json', 'r') as f:
    i = 1
    while True:
        print("Parse json: ", i)
        i += 1
        line = f.readline()
        if not line:
            break
        if len(line) < 10:  # skip blank or truncated lines
            continue
        # urls = re.findall('http://v3-dy.ixigua.com[^\"]+', json_str)
        obj = json.loads(line)
        # Video URL lives at aweme_list->[n]->video->play_addr->url_list
        i_url = 0
        for v in obj['aweme_list']:
            try:
                url = v['video']['play_addr']['url_list'][0]
            except Exception:
                print("parse error ", i, " index: ", i_url)
            i_url += 1
            # print(url)
import re
import time
import html

import requests
from lxml import etree

from pic_downloader import pic_downloader
from mysql_manager import MysqlManager

mysql_mgr = MysqlManager(4)


class PostsCrawler:
    domain = 'https://www.newsmth.net'
    pattern = re.compile('<.*?>')

    def get_content(self, topic_url, page):
        querystring = {"ajax": "", "p": str(page)}
        url = self.domain + topic_url
        r = requests.get(url, params=querystring)
        self.html = r.text
        pic_downloader().get_media_files(r.text)
        self.tree = etree.HTML(r.text)
        time.sleep(1)

    def get_max_page(self):
        pages = self.tree.xpath('//ol[@class="page-main"][1]/li')
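
# Minimal usage sketch for PostsCrawler; the topic URL is a made-up example,
# and get_posts() is assumed to be defined further down in the class (the
# runner scripts below call it).
crawler = PostsCrawler()
crawler.get_content('/nForum/article/Python/12345', 1)
page_count = crawler.get_max_page()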
# Imports reconstructed from usage in the class below.
import datetime
import json
import os
import re
import time

import requests

from pic_downloader import pic_downloader
from mysql_manager import MysqlManager


class WeiboCrawler:
    cookie_filename = 'cookie'
    data_dir = './data'
    login_url = "https://passport.weibo.cn/sso/login"
    payload = "username={}&password={}&savestate=1&mainpageflag=1&entry=mweibo&ec=0".format(
        '18600663368', 'Xi@oxiang66')
    login_headers = {
        'origin': "https://passport.weibo.cn",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
        'content-type': "application/x-www-form-urlencoded",
        'accept': "*/*",
        'referer': "https://passport.weibo.cn/signin/login",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        'cache-control': "no-cache"
    }
    post_url = 'https://m.weibo.cn/detail/{}'
    reply_url_0 = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id_type=0'
    reply_url_1 = 'https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type=0'
    comments = []
    pattern = re.compile('<.*?>')

    def __init__(self, limit=500):
        self.reply_limit = limit
        self.mm = MysqlManager(4)

    def cookie_exist(self):
        return os.path.isfile(self.cookie_filename)

    def cookie_valid(self):
        # The cached cookie counts as valid for two days.
        cookie_mtime = os.path.getmtime(self.cookie_filename)
        return cookie_mtime + 86400 * 2 > time.time()

    def load_cookie(self):
        with open(self.cookie_filename, 'r') as f:
            cookie = f.read()
        self.login_headers['cookie'] = cookie
        return cookie

    def do_login(self):
        response = requests.post(self.login_url, data=self.payload,
                                 headers=self.login_headers,
                                 allow_redirects=False)
        cookie = ''
        for k, v in response.cookies.items():  # iteritems() was Python 2 only
            cookie += k + '=' + v + ';'
        cookie = cookie[:-1]
        with open(self.cookie_filename, 'w') as f:
            f.write(cookie)
        self.login_headers['cookie'] = cookie  # fixed: was missing self.

    def login(self):
        # Reuse the cookie if it exists and is still valid
        if self.cookie_exist() and self.cookie_valid():
            self.load_cookie()
            return
        # Otherwise call the login API and save a fresh cookie
        self.do_login()

    def assure_data_dir(self):
        if not os.path.exists(self.data_dir):
            os.makedirs(self.data_dir)

    def cleanup_text(self, text):
        return self.pattern.sub('', text)

    def save_data(self, filename, data):
        self.assure_data_dir()
        with open(self.data_dir + '/{}.json'.format(filename), 'w') as f:
            f.write(data)

    def extract_var(self, html):
        return re.findall(
            r'var\s\$render_data\s=\s(\[[\s\S]*\])\[0\]\s\|\|\s\{\}\;',
            html)[0]

    # Wed Jan 16 00:00:52 +0800 2019  ->  2019-01-16 00:00:52
    def convert_time_format(self, ts):
        return datetime.datetime.strptime(
            ts, "%a %b %d %H:%M:%S %z %Y").strftime('%Y-%m-%d %H:%M:%S')

    def get_post(self, id):
        url = self.post_url.format(id)
        response = requests.get(url, headers=self.login_headers)
        post_data_str = self.extract_var(response.text)
        post_data = json.loads(post_data_str)[0]['status']
        self.post = {}
        print(post_data['created_at'])
        print(self.convert_time_format(post_data['created_at']))
        self.post['id'] = post_data['id']
        self.post['created_at'] = self.convert_time_format(
            post_data['created_at'])
        self.post['text'] = self.cleanup_text(post_data['text'])
        self.post['reposts_count'] = post_data['reposts_count']
        self.post['comments_count'] = post_data['comments_count']
        self.post['attitudes_count'] = post_data['attitudes_count']
        post_data_user = post_data['user']
        self.post['profile_image_url'] = post_data_user['profile_image_url']
        self.post['user_id'] = post_data_user['id']
        self.post['screen_name'] = post_data_user['screen_name']
        self.save_data(self.post['id'], post_data_str)
        self.mm.insert_data('post', self.post)
        post_pics = pic_downloader().get_media_files(post_data['pics'])
        for pic in post_pics:
            p = {'post_id': id, 'url': pic}
            self.mm.insert_data('pic', p)

    def get_comments(self, id, max_id):
        if max_id == 0:
            url = self.reply_url_0.format(id, id)
        else:
            url = self.reply_url_1.format(id, id, max_id)
        response = requests.get(url, headers=self.login_headers)
        reply_json_obj = json.loads(response.text)
        reply_data = reply_json_obj['data']['data']
        for r in reply_data:
            # A fresh dict per reply; the original reused one dict, so every
            # appended entry ended up pointing at the same (last) comment.
            comment = {}
            comment['created_at'] = self.convert_time_format(r['created_at'])
            comment['id'] = r['id']
            comment['post_id'] = id
            comment['text'] = self.cleanup_text(r['text'])
            r_data_user = r['user']
            comment['profile_image_url'] = r_data_user['profile_image_url']
            comment['user_id'] = r_data_user['id']
            comment['screen_name'] = r_data_user['screen_name']
            self.comments.append(comment)
            self.mm.insert_data('comment', comment)
        self.save_data(self.post['id'] + '-{}'.format(max_id), response.text)
        if len(self.comments) >= reply_json_obj['data']['total_number']:
            return
        if self.reply_limit != 0 and len(self.comments) > int(self.reply_limit):
            return
        time.sleep(2)
        self.get_comments(self.post['id'], reply_json_obj['data']['max_id'])
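
# A hypothetical end-to-end driver for WeiboCrawler; the post id below is a
# placeholder, not taken from the source.
crawler = WeiboCrawler(limit=500)
crawler.login()                       # reuses the cached cookie when fresh
crawler.get_post('4326783791460848')  # fetch, save and store the post
crawler.get_comments(crawler.post['id'], 0)  # recursively page the comments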
def modify_user_star():
    if request.method == "POST":
        user_name = request.form["UserName"]
        star = request.form["Star"]
        print(MysqlManager.user_modify_star(user_name, star))
    return "ok"
def delete_user():
    if request.method == 'POST':
        user_name = request.form['UserName']
        MysqlManager.user_delete(user_name)
    return 'ok'
def add_user_money():
    if request.method == 'POST':
        user_name = request.form['UserName']
        add_money = request.form['Money']
        print(MysqlManager.user_add_money(user_name, add_money))
    return 'ok'
def get_users():
    # Map user name (column 1) to the remaining columns, dropping column 0
    # (presumably the row id).
    return {item[1]: item[1:] for item in MysqlManager.get_users()}
# Imports and the opening of the headers dict were truncated in this excerpt;
# both are restored from usage and from the matching snippet below.
import hashlib

import requests

from mysql_manager import MysqlManager

headers = {
    'host': "v3-dy.ixigua.com",
    'connection': "keep-alive",
    'cache-control': "no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'accept-encoding': "gzip, deflate",
    'accept-language': "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    'range': "bytes=524288-524288",
    'if-range': "\"9056E63E897E90A3BFB2619B86481603\""
}

dirname = './dy_videos/'
mysql = MysqlManager(4)

# cur_url = 'http://v3-dy.ixigua.com/41774b984e4022ca52a4795be488dec9/5bb667a3/video/m/2200fd0bcf7114241858464e7ee8e62a2ef115bf46e00004b570145264e/'


def download_video(index, url):
    # Name the file after the last path segment, hashed to a stable filename.
    file_name = url[url.rindex('/', 0, -1) + 1:-1]
    file_name = hashlib.md5(file_name.encode('utf8')).hexdigest() + '.mp4'
    print('Start downloading ', file_name)
    r = requests.get(url, stream=True)
    # Stream the body to disk in 1 MiB chunks.
    with open(dirname + file_name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            if chunk:
                f.write(chunk)
# Imports reconstructed from usage.
import hashlib

import requests

from mysql_manager import MysqlManager

headers = {
    'host': "v3-dy.ixigua.com",
    'connection': "keep-alive",
    'cache-control': "no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    'accept-encoding': "gzip, deflate",
    'accept-language': "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
    'range': "bytes=524288-524288",
    'if-range': "\"9056E63E897E90A3BFB2619B86481603\""
}

dirname = './dy_videos/'
mysql = MysqlManager(4)

# cur_url = 'http://v3-dy.ixigua.com/41774b984e4022ca52a4795be488dec9/5bb667a3/video/m/2200fd0bcf7114241858464e7ee8e62a2ef115bf46e00004b570145264e/'


async def download_coroutine(index, url):
    file_name = url[url.rindex('/', 0, -1) + 1:-1]
    # Encode before hashing: hashlib.md5 requires bytes on Python 3.
    file_name = hashlib.md5(file_name.encode('utf8')).hexdigest() + '.mp4'
    print('Start downloading ', file_name)
    # Note: requests is blocking, so this coroutine stalls the event loop
    # while downloading; aiohttp would be the non-blocking equivalent.
    r = requests.get(url, stream=True)
    with open(dirname + file_name, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024 * 1024):
            if chunk:
                f.write(chunk)
    msg = 'Finished downloading %s' % (file_name)
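
# Minimal sketch of scheduling download_coroutine on an event loop; `urls`
# is a placeholder for the play_addr url_list values parsed from videos.json.
import asyncio


async def main():
    urls = []  # placeholder
    await asyncio.gather(
        *(download_coroutine(i, u) for i, u in enumerate(urls)))

asyncio.run(main())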
import re
import time

import requests
from lxml import etree

import global_var
from mysql_manager import MysqlManager

mysql_mgr = MysqlManager(4)


class BoardsCrawler:
    # No trailing slash on the domain: base_url supplies its own.
    domain = 'http://www.newsmth.net'
    base_url = domain + '/nForum/section/{}?ajax'

    def __init__(self, interval=1):
        self.interval = interval

    def get_board_of_section(self, section_idx):
        url = self.base_url.format(section_idx)
        response = requests.get(url, headers=global_var.newsmth_headers)
        time.sleep(self.interval)
        self.content = response.text
        self.tree = etree.HTML(self.content)

    def get_board_list(self, etr_obj=None):
        if etr_obj is None:
            etr_obj = self.tree
        elements = etr_obj.xpath(
            '//table[@class="board-list corner"]/tbody/tr')
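
# Minimal usage sketch; section index 1 is an arbitrary example, and
# get_board_list() is assumed to return the parsed rows (its body is
# truncated above).
bc = BoardsCrawler(interval=1)
bc.get_board_of_section(1)
boards = bc.get_board_list()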
def __init__(self, limit=200):
    self.reply_limit = limit
    self.mm = MysqlManager(4)
    self.post = {}
import re
import time
import html
from threading import Thread

import requests
from lxml import etree

from mysql_manager import MysqlManager
from crawler import PostsCrawler

max_threads = 10
wait_duration = 20
mysql_mgr = MysqlManager(10)


def post_crawl_task(topic):
    # Get 1st page of this topic
    post_crawler = PostsCrawler()
    post_crawler.get_content(topic['url'], 1)
    posts = post_crawler.get_posts()
    # Get number of pages of this topic
    page_count = post_crawler.get_max_page()
    # Get the rest of the posts of this topic
    if page_count > 1:
        for i in range(2, page_count + 1):
            post_crawler.get_content(topic['url'], i)
            posts += post_crawler.get_posts()
            break  # stops after the first extra page; looks like a debug leftover
    # Insert posts of a topic
import re
import time
from threading import Thread

import requests
from lxml import etree

from crawler import PostsCrawler
from mysql_manager import MysqlManager

max_threads = 10
interval = 20
mysql_mgr = MysqlManager(max_threads)


def post_crawl_task(topic):
    # Get 1st page of this topic
    post_crawler = PostsCrawler()
    post_crawler.get_content(topic['url'], 1)
    posts = post_crawler.get_posts()
    # Get number of pages of this topic
    page_count = post_crawler.get_max_page()
    print(topic['url'])
    print('page count', page_count)
    # Get the rest of the posts of this topic
    if page_count > 1:
        for i in range(2, page_count + 1):
            post_crawler.get_content(topic['url'], i)
            posts += post_crawler.get_posts()