def get_gacha_record_to_sql(num=5):
    """Crawl sample gacha records ``num`` times and store the valid items in MySQL.

    Each round fetches one batch with ``get_sample_gacha_record()``. A batch
    whose MD5 (over its JSON dump) was already seen during this call is
    skipped. Otherwise every item accepted by ``is_valid_data`` is inserted
    and the round is committed. The pause before the next round is
    ``60 * (5 - valid_times / 40)`` seconds, so it gets shorter the more
    valid items a round produced.

    Args:
        num: Number of crawl rounds to attempt.
    """
    # Keyword arguments work on old and new PyMySQL alike; recent releases
    # reject positional connection arguments.
    db = pymysql.connect(host=host, user=username, password=password, database=database)
    # A set gives O(1) duplicate checks; only membership is ever needed.
    seen_hashes = set()
    try:
        print("一共进行{}次爬取".format(str(num)))
        for i in range(num):
            valid_times = 0
            started_at = time.time()
            print("正在进行第{}次爬取..".format(str(i+1)))
            sample_gacha_record = get_sample_gacha_record()
            record_hash = get_md5(json.dumps(sample_gacha_record))
            if record_hash in seen_hashes:
                # NOTE(review): a duplicate batch moves on to the next round
                # immediately, without sleeping - confirm this is intended.
                continue
            seen_hashes.add(record_hash)

            # TODO: should check whether an item already exists in the
            # database; how do we tell whether two items are the same pull?
            for item in sample_gacha_record:
                if is_valid_data(db, item):
                    valid_times += 1
                    insert_data(db, item)
            db.commit()
            print("第{}次爬取成功,本次共爬取有效数据{}条,用时{}秒".format(str(i+1),str(valid_times),str((time.time()-started_at))))

            # More than 200 valid items would make the delay negative, and
            # time.sleep() raises ValueError on negative values - clamp at 0.
            next_time = max(0, int(60 * ((-valid_times/40)+5)))
            print("下次爬取时间为{}秒后".format(next_time))
            time.sleep(next_time)
    finally:
        # Release the connection even when a round fails half-way.
        db.close()
def do_request(url, oauth, carcel, payload, geocode):
    """Fetch one page of search results and store the tweets posted near ``geocode``.

    For every status in the response the author's profile image is
    downloaded. Statuses that carry ``geo`` coordinates and whose
    ``calc_distance`` to ``geocode`` is below 1.1 are passed to
    ``insert_data`` (unit of the threshold is whatever ``calc_distance``
    returns - TODO confirm).

    Args:
        url: Endpoint to request.
        oauth: Auth object handed to ``requests.get``.
        carcel: Jail name stored with every inserted tweet.
        payload: Query parameters, or ``None`` to request ``url`` as is.
        geocode: Comma-separated string whose first two fields are the
            latitude and longitude of the reference point.

    Returns:
        The ``next_results`` value from the response's ``search_metadata``,
        or the string ``"None"`` when the response has none.
    """
    # ``params=None`` is the same as omitting it, so one call covers both cases.
    r = requests.get(url=url, auth=oauth, params=payload)
    data = r.json()

    # Only a missing key / unexpected shape means "no further page"; anything
    # else should surface instead of being swallowed by a bare except.
    try:
        next_results = data['search_metadata']['next_results']
    except (KeyError, TypeError):
        print("There are not next_results")
        next_results = "None"
    else:
        print(next_results)

    for status in data['statuses']:
        user = status['user']
        obj = {}
        obj['carcel'] = carcel
        obj['status_id'] = status['id']
        obj['text'] = status['text']
        screen_name = user['screen_name']
        obj['screen_name'] = screen_name

        download_profile_image(user['profile_image_url_https'], screen_name)

        obj['utc_offset'] = user['utc_offset']
        obj['user_id'] = user['id']
        obj['created_at'] = status['created_at']

        geo = status.get('geo')
        if geo is None:
            # Tweets without coordinates cannot be matched to a location.
            continue

        latitude = geo['coordinates'][0]
        longitude = geo['coordinates'][1]
        obj['latitude'] = latitude
        obj['longitude'] = longitude
        obj['retuited'] = "no"
        obj['in_jail'] = "no"

        # Parse the reference point and compute the distance once per tweet.
        center = geocode.split(",")
        distance = calc_distance(latitude, longitude, float(center[0]), float(center[1]))
        if distance < 1.1:
            # Single formatted argument prints identically on Python 2 and 3.
            print("%s %s" % (obj['status_id'], distance))
            insert_data(obj)

    return next_results
def do_request(url, oauth, carcel, payload, geocode):
    """Request one page of search results and insert the tweets close to ``geocode``.

    The profile image of every returned status' author is downloaded. A
    status is inserted through ``insert_data`` only when it has ``geo``
    coordinates and ``calc_distance`` between those coordinates and
    ``geocode`` is below 1.1 (unit defined by ``calc_distance`` - TODO
    confirm).

    Args:
        url: Endpoint to request.
        oauth: Auth object handed to ``requests.get``.
        carcel: Jail name stored with every inserted tweet.
        payload: Query parameters, or ``None`` to request ``url`` as is.
        geocode: Comma-separated string; fields 0 and 1 are the latitude and
            longitude of the reference point.

    Returns:
        ``data['search_metadata']['next_results']`` when present, otherwise
        the string ``"None"``.
    """
    # requests treats ``params=None`` like an absent argument, so the two
    # original branches collapse into a single call.
    response = requests.get(url=url, auth=oauth, params=payload)
    data = response.json()

    # Catch only the lookups that signal "no next page" rather than every
    # possible exception.
    try:
        next_results = data['search_metadata']['next_results']
    except (KeyError, TypeError):
        print("There are not next_results")
        next_results = "None"
    else:
        print(next_results)

    for status in data['statuses']:
        author = status['user']
        obj = {}
        obj['carcel'] = carcel
        obj['status_id'] = status['id']
        obj['text'] = status['text']
        screen_name = author['screen_name']
        obj['screen_name'] = screen_name

        download_profile_image(author['profile_image_url_https'], screen_name)

        obj['utc_offset'] = author['utc_offset']
        obj['user_id'] = author['id']
        obj['created_at'] = status['created_at']

        geo = status.get('geo')
        if geo is not None:
            latitude = geo['coordinates'][0]
            longitude = geo['coordinates'][1]
            obj['latitude'] = latitude
            obj['longitude'] = longitude
            obj['retuited'] = "no"

            # One parse and one distance computation per tweet instead of
            # repeating both for the check and for the log line.
            ref_point = geocode.split(",")
            distance = calc_distance(
                latitude, longitude, float(ref_point[0]), float(ref_point[1]))
            if distance < 1.1:
                # Formatted as one string so Python 2 and 3 print the same.
                print("%s %s" % (obj['status_id'], distance))
                insert_data(obj)

    return next_results
def run(username, flow):
    """Run ``lib.run_action`` on a fixed list of entries and return the results as JSON.

    Args:
        username: Echoed back in the response.
        flow: Echoed back in the response.

    Returns:
        The ``jsonify`` response with the action results under ``data``, the
        given ``username`` and ``flow``, and the current local time under
        ``time``.
    """
    # NOTE(review): an earlier version obtained this list from
    # lib.get_mixedtables(username, flow); the literal below looks like a
    # temporary stand-in - confirm before relying on it.
    data = [(u'news:5', 1), (u'norris:6', 2)]
    print(data)

    results = [lib.run_action(line) for line in data]
    return jsonify(data=results,
                   username=username,
                   flow=flow,
                   time=datetime.datetime.now())
import lib

# Command-line script: fetch one tweet by its status id and insert it into
# the database together with the jail name given on the command line.
if len(sys.argv) < 3:
    print("This script inserts 1 tuit into our database using the status_id value and a string for the jail")
    print("Usage python insert_tuit.py 123208309281908 'Penal Diroes'")
    # Non-zero status so callers can tell a usage error from success.
    sys.exit(1)

oauth = api.get_oauth()

# get tuit data
url = "https://api.twitter.com/1.1/statuses/show.json"
payload = {'id': sys.argv[1].strip()}
r = requests.get(url=url, auth=oauth, params=payload)
data = r.json()

# On Python 2 argv holds byte strings that must be decoded; on Python 3 it is
# already text and is used unchanged.
carcel = sys.argv[2].strip()
if isinstance(carcel, bytes):
    carcel = carcel.decode("utf-8")

obj = {}
obj['carcel'] = carcel
obj['created_at'] = data['created_at']
obj['screen_name'] = data['user']['screen_name']
obj['status_id'] = data['id']
obj['text'] = data['text']
obj['user_id'] = data['user']['id']
obj['utc_offset'] = data['user']['utc_offset']

# A tweet without location has a null ``geo``; stop with a clear message
# instead of failing on the subscript below.
geo = data.get('geo')
if geo is None:
    sys.exit("Tuit %s has no geo coordinates, nothing was inserted" % obj['status_id'])

coords = geo['coordinates']
obj['latitude'] = coords[0]
obj['longitude'] = coords[1]

lib.insert_data(obj)