def urls_add(self):
    """Queue the URLs reported in the request that are not already known.

    Reads ``urls_add`` from the request JSON, de-duplicates it, drops every
    URL already stored in the project's ``queue_*`` or ``parsed_*``
    collection and bulk-inserts the remainder into ``queue_*``.

    :return: None
    """
    add_url_list = list(set(self.__request_json['urls_add']))  # de-duplicate
    if not add_url_list:
        return

    # Hash every URL once; the digests drive both lookups and the insert.
    url_md5_list = [md5(url) for url in add_url_list]
    md5_filter = {'url_md5': {'$in': url_md5_list}}

    # URLs that are already waiting in the queue or were already parsed.
    # A set keeps the membership test below O(1) per URL.
    known_urls = set()
    for prefix in ('queue_', 'parsed_'):
        # TODO: for parsed_* take the age of the record into account so a
        # URL can be crawled again after some time.
        for doc in Mongo.get()[prefix + self.project_name].find(md5_filter, {'url': 1}):
            known_urls.add(doc['url'])

    # Only URLs found in neither collection are queued.
    add_urls_data = []
    for url, url_md5 in zip(add_url_list, url_md5_list):
        if url in known_urls:
            continue
        add_urls_data.append({
            'domain': get_domain(url),
            'url': url,
            'url_md5': url_md5,
            'flag_time': 0,
            'add_time': int(time.time()),
            'slave_ip': self.__request_address[0],
        })

    if add_urls_data:
        Mongo.get()['queue_' + self.project_name].insert(add_urls_data)
def toggle_project(project_id):
    """Flip a project's ``static`` field between its two states.

    :param project_id: string form of the project's ``_id``.
    :return: JSON response with ``success`` and ``msg``; ``success`` is
        False when the id cannot be resolved to a project.
    """
    try:
        project = Mongo.get()["projects"].find_one({"_id": ObjectId(project_id)})
    except Exception:
        # An id that cannot be turned into an ObjectId, or a failed lookup,
        # is reported exactly like a missing record.
        project = None
    if not project:
        return jsonify({"success": False, "msg": "不存在的记录!"})

    # Anything that is not currently "paused" becomes "paused".
    new_static = "抓取中" if project["static"] == "暂停中" else "暂停中"
    Mongo.get()["projects"].update({"_id": ObjectId(project_id)}, {"$set": {"static": new_static}})
    return jsonify({"success": True, "msg": "切换成功!"})
def get_projects():
    """Return a JSON object describing every project, keyed by project name.

    Each entry carries the project's ``_id`` (as a string), ``name``,
    ``static`` and the document counts of its ``queue_*``, ``parsed_*`` and
    ``result_*`` collections.
    """
    summary = {}
    for project in get_project_list():
        entry = {
            '_id': str(project['_id']),
            'name': project['name'],
            'static': project['static'],
        }
        # One count per per-project collection, stored as "<kind>_len".
        for kind in ('queue', 'parsed', 'result'):
            entry[kind + '_len'] = Mongo.get()[kind + '_' + project['name']].count()
        summary[project['name']] = entry
    return jsonify(summary)
def get_projects():
    """Build the project overview and return it as a JSON response.

    The response maps each project name to its ``_id`` (stringified),
    ``name``, ``static`` and the sizes of its queue, parsed and result
    collections.
    """
    overview = {}
    for project in get_project_list():
        info = {}
        info["_id"] = str(project["_id"])
        info["name"] = project["name"]
        info["static"] = project["static"]
        info["queue_len"] = Mongo.get()["queue_" + project["name"]].count()
        info["parsed_len"] = Mongo.get()["parsed_" + project["name"]].count()
        info["result_len"] = Mongo.get()["result_" + project["name"]].count()
        overview[project["name"]] = info
    return jsonify(overview)
def toggle_project(project_id):
    """Switch a project between its two ``static`` states.

    :param project_id: string form of the project's ``_id``.
    :return: JSON response with ``success`` and ``msg``; ``success`` is
        False when no project can be found for the id.
    """
    projects = Mongo.get()['projects']
    try:
        project = projects.find_one({'_id': ObjectId(project_id)})
    except Exception:
        # Unusable id or failed lookup: answer as for a missing record
        # instead of swallowing everything with a bare except.
        project = None
    if not project:
        return jsonify({'success': False, 'msg': '不存在的记录!'})

    # A paused project starts crawling; any other state becomes paused.
    if project['static'] == '暂停中':
        new_static = '抓取中'
    else:
        new_static = '暂停中'
    Mongo.get()['projects'].update({'_id': ObjectId(project_id)}, {'$set': {'static': new_static}})
    return jsonify({'success': True, 'msg': '切换成功!'})
def get_project_tasks(project_name):
    """Return the 100 newest ``parsed_<project_name>`` documents as JSON text.

    ``_id`` is removed from every document and ``slave_ip`` is replaced by
    the result of ``mix_ip``.
    """
    cursor = Mongo.get()['parsed_' + project_name].find().sort('_id', -1).limit(100)
    tasks = []
    for task in cursor:
        task.pop('_id')
        task['slave_ip'] = mix_ip(task['slave_ip'])
        tasks.append(task)
    return json.dumps(tasks)
def toggle_project(project_id):
    """Toggle the ``static`` field of one project.

    :param project_id: string form of the project's ``_id``.
    :return: JSON response with ``success`` and ``msg``; the failure
        response is used when the id does not resolve to a project.
    """
    failure = {'success': False, 'msg': '不存在的记录!'}
    try:
        project = Mongo.get()['projects'].find_one(
            {'_id': ObjectId(project_id)})
    except Exception:
        # Covers an id ObjectId refuses as well as lookup errors; no longer
        # a bare except, so interpreter-exit signals are not swallowed.
        return jsonify(failure)
    if not project:
        return jsonify(failure)

    # Paused -> crawling; every other value -> paused.
    toggled = '抓取中' if project['static'] == '暂停中' else '暂停中'
    Mongo.get()['projects'].update(
        {'_id': ObjectId(project_id)},
        {'$set': {'static': toggled}})
    return jsonify({'success': True, 'msg': '切换成功!'})
def get_project_tasks(project_name):
    """Serialize the latest parsed tasks of a project.

    Reads at most 100 documents from ``parsed_<project_name>``, newest
    ``_id`` first, and returns them as a JSON string.
    """

    def _prepare(doc):
        # Drop the Mongo id and pass the slave address through mix_ip.
        del doc["_id"]
        doc["slave_ip"] = mix_ip(doc["slave_ip"])
        return doc

    latest = Mongo.get()["parsed_" + project_name].find().sort("_id", -1).limit(100)
    return json.dumps([_prepare(doc) for doc in latest])
def save_project():
    """Reject every save request with a fixed "modification forbidden" reply.

    NOTE(review): the first statement returns unconditionally, so all the
    code below it (validation, upsert, queue initialisation, slave restart)
    is unreachable in this version. Confirm whether the guard is a temporary
    lock or whether the dead code should be removed.

    :return: JSON response ``{'success': False, 'msg': ...}``.
    """
    return jsonify({'success': False, 'msg': '禁止修改!'})
    form_data = json.loads(request.data)
    # TODO: the form data still needs to be validated
    name_r = re.compile(r'^[0-9a-zA-Z_-]+$')
    if not name_r.search(form_data['name']):
        return jsonify({'success': False, 'msg': '计划名称必须满足正则规则: ^[0-9a-zA-Z_-]+$ '})
    exists_project = list(Mongo.get()['projects'].find({'name': form_data['name']}, {'_id': 1, 'add_time': 1}).limit(1))
    if 'edit' not in form_data and exists_project:
        return jsonify({'success': False, 'msg': '计划名称已经存在!'})
    # Add a new project or update an existing one
    data = {
        'name': form_data['name'],
        'init_url': form_data['init_url'],
        'desc': form_data['desc'] if 'desc' in form_data else '',
        'code': form_data['code'],
        'static': '暂停中',
        'update_time': int(time.time()),
        'add_time': exists_project[0]['add_time'] if exists_project else int(time.time()),
    }
    Mongo.get()['projects'].update({'name': form_data['name']}, data, True)
    # Initialisation that only applies to a new project
    if 'edit' not in form_data:
        Mongo.get()['queue_' + form_data['name']].insert(
            {
                'url': form_data['init_url'],
                'url_md5': md5(form_data['init_url']),
                'flag_time': 0,
                'add_time': int(time.time()),
                'slave_ip': '0.0.0.0'
            })
        # Creating an index before the collection exists makes MongoDB
        # create the collection and attach the index
        Mongo.get()['parsed_' + form_data['name']].ensure_index('url_md5', unique=True)
        Mongo.get()['queue_' + form_data['name']].ensure_index('url_md5', unique=True)
    # A project was added: schedule a restart of every slave
    restart_slave_list = GlobalHelper.get('restart_slave_list') or []
    for slave_record in Mongo.get()['slave_record'].find():
        restart_slave_list.append(slave_record['ip'])
    GlobalHelper.set('restart_slave_list', list(set(restart_slave_list)))
    return jsonify({'success': True, 'msg': '保存成功!'})
def save_project():
    """Create a project, or update an existing one, from the JSON request body.

    The body is read for ``name``, ``init_url`` and ``code``; ``desc`` is
    optional and the presence of an ``edit`` key marks the request as an
    update. Every save stores ``static`` as paused. For a new project the
    initial URL is queued and unique ``url_md5`` indexes are ensured on the
    project's ``queue_*`` and ``parsed_*`` collections. Finally the ip of
    every known slave is added to the ``restart_slave_list`` global.

    :return: JSON response with ``success`` and ``msg``.
    """
    form_data = json.loads(request.data)
    # TODO: the form data still needs to be validated
    name = form_data["name"]
    # "\Z" rather than "$": "$" also matches just before a trailing newline,
    # which would let "name\n" through into the collection names built below.
    name_r = re.compile(r"^[0-9a-zA-Z_-]+\Z")
    if not name_r.search(name):
        return jsonify({"success": False, "msg": "计划名称必须满足正则规则: ^[0-9a-zA-Z_-]+$ "})

    exists_project = list(Mongo.get()["projects"].find({"name": name}, {"_id": 1, "add_time": 1}).limit(1))
    is_new = "edit" not in form_data
    if is_new and exists_project:
        return jsonify({"success": False, "msg": "计划名称已经存在!"})

    # Add a new project or update an existing one (replace-style upsert).
    now = int(time.time())
    data = {
        "name": name,
        "init_url": form_data["init_url"],
        "desc": form_data["desc"] if "desc" in form_data else "",
        "code": form_data["code"],
        "static": "暂停中",
        "update_time": now,
        # keep the original creation time when the project already exists
        "add_time": exists_project[0]["add_time"] if exists_project else now,
    }
    Mongo.get()["projects"].update({"name": name}, data, True)

    # Initialisation that only applies to a new project.
    if is_new:
        Mongo.get()["queue_" + name].insert(
            {
                "url": form_data["init_url"],
                "url_md5": md5(form_data["init_url"]),
                "flag_time": 0,
                "add_time": int(time.time()),
                "slave_ip": "0.0.0.0",
            }
        )
        # Creating an index before the collection exists makes MongoDB
        # create the collection and attach the index.
        Mongo.get()["parsed_" + name].ensure_index("url_md5", unique=True)
        Mongo.get()["queue_" + name].ensure_index("url_md5", unique=True)

    # Schedule a restart of every known slave.
    restart_slave_list = GlobalHelper.get("restart_slave_list") or []
    for slave_record in Mongo.get()["slave_record"].find():
        restart_slave_list.append(slave_record["ip"])
    GlobalHelper.set("restart_slave_list", list(set(restart_slave_list)))
    return jsonify({"success": True, "msg": "保存成功!"})
def __storage_record(self):
    """Write every in-memory slave record to the ``slave_record`` collection.

    Each record is upserted by ``ip``. When the write result reports an
    insert, the new document id is stored (as a string) under ``_id`` on the
    in-memory record.
    """
    for ip, data in self.slave_record.items():
        # Upsert: update the document for this ip, insert it when missing.
        outcome = Mongo.get().slave_record.update({'ip': ip}, {'ip': ip, 'data': data}, True)
        if outcome['updatedExisting'] or 'upserted' not in outcome:
            continue
        # A new document was inserted.
        self.slave_record[ip]['_id'] = str(outcome['upserted'])
def get_project_tasks(project_name):
    """Return up to 100 of a project's parsed records, newest first, as JSON.

    Every record loses its ``_id`` and has ``slave_ip`` rewritten through
    ``mix_ip`` before serialization.
    """
    collection = Mongo.get()['parsed_' + project_name]
    newest_first = collection.find().sort('_id', -1)
    res = []
    for doc in newest_first.limit(100):
        del doc['_id']
        doc.update(slave_ip=mix_ip(doc['slave_ip']))
        res.append(doc)
    return json.dumps(res)
def __init__(self, project_name, request_json, request_address):
    """Bind the handler to one project and one incoming request.

    :param project_name: name used to look the project up in ``projects``.
    :param request_json: decoded request payload, stored as is.
    :param request_address: request address, stored as is (other methods
        index it with ``[0]``).
    """
    self.project_name = project_name
    # At most one document is fetched; an unknown name yields an empty dict.
    matches = list(Mongo.get()['projects'].find({'name': self.project_name}).limit(1))
    if matches:
        self.project = matches[0]
    else:
        self.project = {}
    self.__request_json = request_json
    self.__request_address = request_address
    self.__slave_record = SlaveRecord.get_instance()
def get_project_list():
    """Load all project documents and keep them on ``get_project_list.cache``.

    TODO: test whether Chinese is supported.

    :return: list with every document of the ``projects`` collection.
    """
    projects = list(Mongo.get().projects.find())
    get_project_list.cache = projects
    return projects
def get_json_results(project_name):
    """Return all documents of ``result_<project_name>`` as a JSON download.

    :param project_name: project whose result collection is exported.
    :return: response with status 200, the documents (without ``_id``) as a
        JSON array, and attachment headers.
    """
    documents = list(Mongo.get()['result_' + project_name].find({}, {'_id': 0}))
    res = make_response(json.dumps(documents), 200)
    # The download name belongs inside Content-Disposition; a standalone
    # "filename" header is not part of that mechanism.
    res.headers['Content-Disposition'] = (
        'attachment; filename="%s.json"' % project_name)
    # Legacy header, kept unchanged in case something already reads it.
    res.headers['filename'] = project_name + '/json'
    return res
def get_slave_tasks(slave_id):
    """Return the latest parsed records handled by one slave, as JSON text.

    For every project the 20 newest ``parsed_*`` documents whose
    ``slave_ip`` equals the slave's ip are collected; the combined list is
    sorted by ``add_time``, newest first.

    :param slave_id: string form of the slave record's ``_id``.
    :return: JSON array; ``[]`` when the id does not resolve to a record.
    """
    res = []
    try:
        slave_record = Mongo.get()['slave_record'].find_one({'_id': ObjectId(slave_id)})
    except Exception:
        # Unusable id or failed lookup: same answer as for a missing record.
        slave_record = None
    if not slave_record:
        return json.dumps(res)

    for project in get_project_list():
        cursor = Mongo.get()['parsed_' + project['name']].find(
            {'slave_ip': slave_record['ip']}).sort('_id', -1).limit(20)
        for doc in cursor:
            del doc['_id']
            doc['slave_ip'] = mix_ip(doc['slave_ip'])
            res.append(doc)
    res.sort(key=lambda x: x['add_time'], reverse=True)
    return json.dumps(res)
def toggle_slave(slave_id):
    """Flip a slave's ``static`` state in MongoDB and in the global record.

    :param slave_id: string form of the slave record's ``_id``.
    :return: JSON response with ``success`` and ``msg``. ``success`` is
        False only when the record cannot be found; a failure while storing
        the new state is printed and success is still reported.
    """
    try:
        slave_record = Mongo.get()['slave_record'].find_one({'_id': ObjectId(slave_id)})
    except Exception:
        # Unusable id or failed lookup: same answer as for a missing record.
        slave_record = None
    if not slave_record:
        return jsonify({'success': False, 'msg': '不存在的记录!'})

    # Paused -> crawling; every other value -> paused.
    new_static = '抓取中' if slave_record['data']['static'] == '暂停中' else '暂停中'
    try:
        Mongo.get()['slave_record'].update({'_id': ObjectId(slave_id)}, {'$set': {'data.static': new_static}})
        # Mirror the change in the shared copy (key spelled 'salve_record').
        global_salve_record = GlobalHelper.get('salve_record')
        global_salve_record[slave_record['ip']]['static'] = new_static
        GlobalHelper.set('salve_record', global_salve_record)
    except Exception:
        # Best effort, as before: log and fall through to the success reply.
        print(traceback.format_exc())
    return jsonify({'success': True, 'msg': '切换成功!'})
def get_results(project_name, page=1):
    """Return one page of a project's results as JSON text.

    Results come from ``result_<project_name>``, newest ``_id`` first, 20 per
    page. The JSON object holds ``result`` (documents without ``_id``) and
    ``render_json`` (the value of ``pagination.render_json(10)``).
    """
    cursor = Mongo.get()['result_' + project_name].find().sort('_id', -1)
    pagination = paginate(cursor, page, 20)
    rows = []
    if pagination:
        for row in pagination.result():
            row.pop('_id')
            rows.append(row)
    # NOTE(review): render_json is called even when ``pagination`` is falsy;
    # if paginate can return None this line raises -- confirm.
    payload = {'result': rows, 'render_json': pagination.render_json(10)}
    return json.dumps(payload)
def get_results(project_name, page=1):
    """Serialize a page (20 entries, newest first) of ``result_<project_name>``.

    :param project_name: project whose results are listed.
    :param page: page number handed to ``paginate``.
    :return: JSON text with the keys ``result`` and ``render_json``.
    """

    def _without_id(doc):
        del doc["_id"]
        return doc

    pagination = paginate(Mongo.get()["result_" + project_name].find().sort("_id", -1), page, 20)
    if pagination:
        res = [_without_id(doc) for doc in pagination.result()]
    else:
        res = []
    # NOTE(review): a falsy ``pagination`` still reaches render_json below;
    # presumably paginate never returns None -- confirm.
    return json.dumps({"result": res, "render_json": pagination.render_json(10)})
def __storage_record(self):
    """Persist the in-memory slave records, one upsert per ip.

    After an upsert that inserted a new document, the document id is copied
    (stringified) into the in-memory record under ``_id``.
    """
    for ip, data in self.slave_record.items():
        # Third argument True: update when present, insert otherwise.
        res = Mongo.get().slave_record.update(
            {'ip': ip},
            {'ip': ip, 'data': data},
            True)
        was_inserted = (not res['updatedExisting']) and 'upserted' in res
        if was_inserted:
            self.slave_record[ip]['_id'] = str(res['upserted'])
def code_ctrl(self):
    """Return ``code``, ``name`` and ``init_url`` of every project.

    :return: list of project documents without ``_id``.
    """
    wanted_fields = {'_id': 0, 'code': 1, 'name': 1, 'init_url': 1}
    return list(Mongo.get().projects.find({}, wanted_fields))
def toggle_slave(slave_id):
    """Toggle one slave between its two ``static`` states.

    The new state is written to the slave's document (``data.static``) and
    to the ``salve_record`` global. Errors during that write are printed and
    do not change the reply.

    :param slave_id: string form of the slave record's ``_id``.
    :return: JSON response with ``success`` and ``msg``.
    """
    try:
        slave_record = Mongo.get()["slave_record"].find_one({"_id": ObjectId(slave_id)})
    except Exception:
        # An id ObjectId refuses, or a lookup error, counts as "not found".
        slave_record = None
    if not slave_record:
        return jsonify({"success": False, "msg": "不存在的记录!"})

    if slave_record["data"]["static"] == "暂停中":
        toggled = "抓取中"
    else:
        toggled = "暂停中"

    try:
        Mongo.get()["slave_record"].update(
            {"_id": ObjectId(slave_id)}, {"$set": {"data.static": toggled}}
        )
        global_salve_record = GlobalHelper.get("salve_record")
        global_salve_record[slave_record["ip"]]["static"] = toggled
        GlobalHelper.set("salve_record", global_salve_record)
    except Exception:
        # Kept best-effort: the failure is logged, success is still returned.
        print(traceback.format_exc())
    return jsonify({"success": True, "msg": "切换成功!"})
def get_slave_tasks(slave_id):
    """List the most recent parsed records of one slave across all projects.

    Up to 20 documents per project are read from ``parsed_*`` (newest
    ``_id`` first, filtered by the slave's ip), stripped of ``_id``, and the
    merged list is ordered by ``add_time`` descending.

    :param slave_id: string form of the slave record's ``_id``.
    :return: JSON array text; ``[]`` when no slave record is found.
    """
    res = []
    try:
        slave_record = Mongo.get()["slave_record"].find_one({"_id": ObjectId(slave_id)})
    except Exception:
        # Treat an unusable id or lookup error like a missing record.
        return json.dumps(res)
    if not slave_record:
        return json.dumps(res)

    slave_filter = {"slave_ip": slave_record["ip"]}
    for project in get_project_list():
        for doc in Mongo.get()["parsed_" + project["name"]].find(slave_filter).sort("_id", -1).limit(20):
            del doc["_id"]
            doc["slave_ip"] = mix_ip(doc["slave_ip"])
            res.append(doc)
    res.sort(key=lambda x: x["add_time"], reverse=True)
    return json.dumps(res)
def get_urls(self):
    """Hand out up to three queued URLs to the requesting slave.

    Picks the oldest queue entries (ascending ``_id``) whose domain is not in
    the slave's ``deny_domains`` and whose ``flag_time`` is more than 300
    seconds in the past, stamps them with the current time and returns their
    URLs.

    :return: list of URL strings, possibly empty.
    """
    # TODO: needs testing
    slave_entry = self.__slave_record.slave_record[self.__request_address[0]]
    deny_domains = [denied['domain'] for denied in slave_entry['deny_domains']]

    selector = {
        'domain': {'$nin': deny_domains},
        # only entries flagged more than 300 seconds ago
        'flag_time': {'$lt': int(time.time() - 300)},
    }
    cursor = Mongo.get()['queue_' + self.project_name].find(selector)
    cursor = cursor.limit(3).sort('_id', pymongo.ASCENDING)

    ids = []
    response_url_list = []
    for doc in cursor:
        ids.append(doc['_id'])
        response_url_list.append(doc['url'])

    if ids:
        Mongo.get()['queue_' + self.project_name].update(
            {'_id': {'$in': ids}},
            {'$set': {'flag_time': int(time.time())}},
            multi=True)
    return response_url_list
def get_slave_tasks(slave_id):
    """Return recent parsed records for one slave as a JSON array string.

    Each project contributes at most its 20 newest ``parsed_*`` documents
    for the slave's ip; the result is sorted by ``add_time``, newest first.

    :param slave_id: string form of the slave record's ``_id``.
    :return: JSON text; an empty array when the slave cannot be found.
    """
    res = []
    slave_record = None
    try:
        slave_record = Mongo.get()['slave_record'].find_one(
            {'_id': ObjectId(slave_id)})
    except Exception:
        # Unusable id or failed lookup; handled below like "not found".
        pass
    if not slave_record:
        return json.dumps(res)

    for project in get_project_list():
        parsed = Mongo.get()['parsed_' + project['name']]
        newest = parsed.find({
            'slave_ip': slave_record['ip']
        }).sort('_id', -1).limit(20)
        for doc in newest:
            del doc['_id']
            doc['slave_ip'] = mix_ip(doc['slave_ip'])
            res.append(doc)
    res.sort(key=lambda x: x['add_time'], reverse=True)
    return json.dumps(res)
def get_results(project_name, page=1):
    """Return a page of project results plus pagination data as JSON text.

    Documents are read from ``result_<project_name>`` in descending ``_id``
    order with a page size of 20, and ``_id`` is removed from each.
    """
    newest_first = Mongo.get()['result_' + project_name].find().sort('_id', -1)
    pagination = paginate(newest_first, page, 20)

    res = []
    documents = pagination.result() if pagination else ()
    for doc in documents:
        del doc['_id']
        res.append(doc)

    # NOTE(review): render_json is also called for a falsy ``pagination``;
    # verify that paginate cannot return None.
    render_json = pagination.render_json(10)
    return json.dumps({'result': res, 'render_json': render_json})
def restart_slave(slave_id):
    """Add one slave's ip to the ``restart_slave_list`` global.

    :param slave_id: string form of the slave record's ``_id``.
    :return: JSON response with ``success`` and ``msg``; ``success`` is
        False when the id does not resolve to a slave record.
    """
    try:
        slave_record = Mongo.get()["slave_record"].find_one({"_id": ObjectId(slave_id)})
    except Exception:
        # Unusable id or failed lookup: same answer as for a missing record.
        slave_record = None
    if not slave_record:
        return jsonify({"success": False, "msg": "不存在的记录!"})

    restart_slave_list = GlobalHelper.get("restart_slave_list") or []
    restart_slave_list.append(slave_record["ip"])
    # Stored de-duplicated.
    GlobalHelper.set("restart_slave_list", list(set(restart_slave_list)))
    return jsonify({"success": True, "msg": "重启中!"})
def restart_slave(slave_id):
    """Mark one slave for restart.

    The slave's ip is appended to the list stored under the
    ``restart_slave_list`` global, which is written back without duplicates.

    :param slave_id: string form of the slave record's ``_id``.
    :return: JSON response with ``success`` and ``msg``.
    """
    not_found = {'success': False, 'msg': '不存在的记录!'}
    try:
        slave_record = Mongo.get()['slave_record'].find_one({'_id': ObjectId(slave_id)})
    except Exception:
        # Covers an id ObjectId refuses as well as lookup errors.
        return jsonify(not_found)
    if not slave_record:
        return jsonify(not_found)

    pending = GlobalHelper.get('restart_slave_list') or []
    pending.append(slave_record['ip'])
    GlobalHelper.set('restart_slave_list', list(set(pending)))
    return jsonify({'success': True, 'msg': '重启中!'})
def urls_parsed(self):
    """Move the URLs a slave finished crawling from the queue to ``parsed_*``.

    For every URL in the request's ``urls_parsed`` list the slave's parsed
    counter is bumped, the URL is removed from ``queue_*`` and a record is
    stored in ``parsed_*``. The records are inserted in bulk first; if that
    fails each record is inserted on its own, so one rejected record no
    longer prevents the remaining ones from being saved.

    :return: None
    """
    slave_ip = self.__request_address[0]
    urls_data = []
    for url in self.__request_json['urls_parsed']:
        self.__slave_record.add_parsed_record(slave_ip)
        urls_data.append({
            'url': url,
            'url_md5': md5(url),
            'add_time': int(time.time()),
            'slave_ip': slave_ip,
        })

    # Remove the finished URLs from the queue (digests are reused, not
    # recomputed).
    Mongo.get()['queue_' + self.project_name].remove(
        {'url_md5': {'$in': [record['url_md5'] for record in urls_data]}},
        multi=True)

    if not urls_data:
        return

    try:
        Mongo.get()['parsed_' + self.project_name].insert(urls_data)
    except Exception:
        # Bulk insert failed: retry record by record. The try sits inside
        # the loop so a single failure is logged and the loop carries on.
        for single_url in urls_data:
            try:
                Mongo.get()['parsed_' + self.project_name].insert_one(single_url)
            except Exception as error:
                print(traceback.format_exc())
                print(error)
                # Message: the record below was crawled and saved again.
                print(u'下面链接重复抓取的并重复保存到parsed_*中的记录')
                print('%s %s' % (single_url, '\r\n\r\n'))
def restart_slave(slave_id):
    """Queue a restart for the slave identified by ``slave_id``.

    :param slave_id: string form of the slave record's ``_id``.
    :return: JSON response with ``success`` and ``msg``; the failure reply
        is used when no slave record is found for the id.
    """
    slave_record = None
    try:
        slave_record = Mongo.get()['slave_record'].find_one(
            {'_id': ObjectId(slave_id)})
    except Exception:
        # Unusable id or failed lookup; handled below like "not found".
        pass
    if not slave_record:
        return jsonify({'success': False, 'msg': '不存在的记录!'})

    restart_slave_list = GlobalHelper.get('restart_slave_list') or []
    restart_slave_list.append(slave_record['ip'])
    # Written back without duplicates.
    GlobalHelper.set('restart_slave_list', list(set(restart_slave_list)))
    return jsonify({'success': True, 'msg': '重启中!'})
def toggle_slave(slave_id):
    """Switch a slave's ``static`` state and publish the new value.

    The value is stored in the slave's document under ``data.static`` and in
    the ``salve_record`` global. Storing is best effort: an error is printed
    and the success reply is still sent.

    :param slave_id: string form of the slave record's ``_id``.
    :return: JSON response with ``success`` and ``msg``.
    """
    not_found = {'success': False, 'msg': '不存在的记录!'}
    try:
        slave_record = Mongo.get()['slave_record'].find_one(
            {'_id': ObjectId(slave_id)})
    except Exception:
        # Covers an id ObjectId refuses as well as lookup errors.
        return jsonify(not_found)
    if not slave_record:
        return jsonify(not_found)

    # Paused -> crawling; anything else -> paused.
    current = slave_record['data']['static']
    updated = '抓取中' if current == '暂停中' else '暂停中'

    try:
        Mongo.get()['slave_record'].update(
            {'_id': ObjectId(slave_id)},
            {'$set': {'data.static': updated}})
        global_salve_record = GlobalHelper.get('salve_record')
        global_salve_record[slave_record['ip']]['static'] = updated
        GlobalHelper.set('salve_record', global_salve_record)
    except Exception:
        print(traceback.format_exc())
    return jsonify({'success': True, 'msg': '切换成功!'})
def api_test():
    """Debug endpoint: print the first page of ``result_cnbeta`` to stdout.

    Paginates the collection with a page size of 30 and prints every row of
    the current page. Any error is printed as a traceback.

    :return: JSON response ``{'fd': 1}`` in every case.
    """
    try:
        # The original also opened an extra MongoClient and built a cursor
        # that were never used or closed; that leak is removed.
        pagination = paginate(Mongo.get()['result_cnbeta'].find(), 1, 30)
        if pagination:
            for row in pagination.current_page():
                print(row)
    except Exception:
        print(traceback.format_exc())
    return jsonify({'fd': 1})
def __init__(self):
    """Set up the record template and load stored slave records.

    Every document of the ``slave_record`` collection is loaded into
    ``self.slave_record`` keyed by ip, with the document's ``_id`` and ``ip``
    copied (as strings) into its ``data`` dict. ``refresh_connect_status``
    is called afterwards.
    """
    # Template of the fields a slave record carries.
    self.__init_format = {
        '_id': '',
        'ip': '',
        'parsed_count': 0,
        'connected_count': 0,
        'last_connected_time': 0,
        'work_time_count': 1,
        'deny_domains': [],
        'error_domains': {},
        'static': '抓取中',
    }
    self.slave_record = {}
    self.deny_urls_temp = {}

    # The dict was created empty just above, so this branch is taken.
    if not self.slave_record:
        for stored in Mongo.get().slave_record.find():
            data = stored['data']
            data['_id'] = str(stored['_id'])
            data['ip'] = str(stored['ip'])
            self.slave_record[stored['ip']] = data

    self.refresh_connect_status()
def __init__(self):
    """Initialise the slave-record cache from the ``slave_record`` collection.

    Stores the default record layout, creates the empty ``slave_record`` and
    ``deny_urls_temp`` dicts, fills ``slave_record`` (ip -> data, with
    stringified ``_id`` and ``ip`` added to the data) and finally calls
    ``refresh_connect_status``.
    """
    self.__init_format = dict(
        _id='',
        ip='',
        parsed_count=0,
        connected_count=0,
        last_connected_time=0,
        work_time_count=1,
        deny_domains=[],
        error_domains={},
        static='抓取中',
    )
    self.slave_record = {}
    self.deny_urls_temp = {}
    if not self.slave_record:
        for item in Mongo.get().slave_record.find():
            record = item['data']
            record.update({'_id': str(item['_id']), 'ip': str(item['ip'])})
            self.slave_record[item['ip']] = record
    self.refresh_connect_status()
def code_ctrl(self):
    """Collect the crawl definition of every project.

    :return: list of dicts holding ``code``, ``name`` and ``init_url``
        (``_id`` is excluded by the projection).
    """
    projection = {'_id': 0, 'code': 1, 'name': 1, 'init_url': 1}
    cursor = Mongo.get().projects.find({}, projection)
    return [project for project in cursor]
def get_json_results(project_name):
    """Export a project's results as a downloadable JSON document.

    :param project_name: project whose ``result_*`` collection is dumped.
    :return: 200 response whose body is the JSON array of all result
        documents (without ``_id``), marked as an attachment.
    """
    cursor = Mongo.get()['result_' + project_name].find({}, {'_id': 0})
    res = make_response(json.dumps(list(cursor)), 200)
    # The file name is carried by Content-Disposition itself; a separate
    # "filename" header does not name the download.
    disposition = 'attachment; filename="%s.json"' % project_name
    res.headers['Content-Disposition'] = disposition
    # Old header left in place for backward compatibility.
    res.headers['filename'] = project_name + '/json'
    return res
def get_project_by_name(name):
    """Return the first project with the given name as a JSON response.

    :param name: value matched against the ``name`` field.
    :return: the project document without ``_id``, or ``{}`` when no project
        matches.
    """
    matches = list(Mongo.get().projects.find({"name": name}, {"_id": 0}))
    return jsonify(matches[0] if matches else {})
def get_json_results(project_name):
    """Send every result document of a project as a JSON attachment.

    :param project_name: name used to pick the ``result_*`` collection.
    :return: response with status 200, a JSON array body (documents without
        ``_id``) and attachment headers.
    """
    body = json.dumps(list(Mongo.get()["result_" + project_name].find({}, {"_id": 0})))
    res = make_response(body, 200)
    # Put the download name where Content-Disposition expects it instead of
    # relying on a standalone "filename" header.
    res.headers["Content-Disposition"] = 'attachment; filename="%s.json"' % project_name
    # Previous header retained unchanged for existing consumers.
    res.headers["filename"] = project_name + "/json"
    return res
def get_project_by_name(name):
    """Look a project up by name and return it as JSON.

    :param name: value matched against the ``name`` field.
    :return: JSON response with the first matching document (``_id``
        excluded) or an empty object when nothing matches.
    """
    found = list(Mongo.get().projects.find({'name': name}, {'_id': 0}))
    try:
        project = found[0]
    except IndexError:
        project = {}
    return jsonify(project)
def result_save(self):
    """Insert the request's ``save`` payload into the project's ``result_*``
    collection.

    :return: None
    """
    collection = Mongo.get()['result_' + self.project_name]
    payload = self.__request_json['save']
    collection.insert(payload)
def save_project():
    """Add a project or update an existing one from the JSON request body.

    Reads ``name``, ``init_url`` and ``code`` from the body; ``desc`` is
    optional and an ``edit`` key marks an update. The project document is
    replaced (or inserted) with ``static`` set to paused. A new project
    additionally gets its initial URL queued and unique ``url_md5`` indexes
    on ``queue_*`` and ``parsed_*``. The ip of every known slave is then
    added to the ``restart_slave_list`` global.

    :return: JSON response with ``success`` and ``msg``.
    """
    form_data = json.loads(request.data)
    # TODO: the form data still needs to be validated
    # Anchor with "\Z": "$" would also accept a name ending in a newline,
    # and the name is used to build collection names below.
    name_r = re.compile(r'^[0-9a-zA-Z_-]+\Z')
    if not name_r.search(form_data['name']):
        return jsonify({
            'success': False,
            'msg': '计划名称必须满足正则规则: ^[0-9a-zA-Z_-]+$ '
        })

    project_name = form_data['name']
    creating = 'edit' not in form_data
    exists_project = list(Mongo.get()['projects'].find(
        {'name': project_name},
        {'_id': 1, 'add_time': 1}).limit(1))
    if creating and exists_project:
        return jsonify({'success': False, 'msg': '计划名称已经存在!'})

    # Add a new project or update an existing one.
    timestamp = int(time.time())
    if exists_project:
        # keep the creation time of the stored project
        add_time = exists_project[0]['add_time']
    else:
        add_time = timestamp
    data = {
        'name': project_name,
        'init_url': form_data['init_url'],
        'desc': form_data['desc'] if 'desc' in form_data else '',
        'code': form_data['code'],
        'static': '暂停中',
        'update_time': timestamp,
        'add_time': add_time,
    }
    Mongo.get()['projects'].update({'name': project_name}, data, True)

    # Initialisation that only applies to a new project.
    if creating:
        Mongo.get()['queue_' + project_name].insert({
            'url': form_data['init_url'],
            'url_md5': md5(form_data['init_url']),
            'flag_time': 0,
            'add_time': int(time.time()),
            'slave_ip': '0.0.0.0'
        })
        # Creating an index before the collection exists makes MongoDB
        # create the collection and attach the index.
        Mongo.get()['parsed_' + project_name].ensure_index('url_md5', unique=True)
        Mongo.get()['queue_' + project_name].ensure_index('url_md5', unique=True)

    # Schedule a restart of every known slave.
    restart_slave_list = GlobalHelper.get('restart_slave_list') or []
    for slave_record in Mongo.get()['slave_record'].find():
        restart_slave_list.append(slave_record['ip'])
    GlobalHelper.set('restart_slave_list', list(set(restart_slave_list)))
    return jsonify({'success': True, 'msg': '保存成功!'})