# Imports assumed by the excerpts below; project-local helpers (MongoDBCli,
# checkJson, getLogInfoList, getRandomLogFileName, task_runSpider,
# PROXY_SPIDER_DIR, PROXY_SPIDER_LOG_DIR) come from this project's own modules.
import datetime
import json
import os
import subprocess
from urllib.parse import urlparse

from django.http import HttpResponse, JsonResponse
from django.shortcuts import render


def proxyList(request):
    db = MongoDBCli()
    spider_list = db.getAllSpider()
    content = {
        "spiderTasks": [],
        "taskList": [],
    }

    def __sortByDatetime(elem):
        return elem["time"]

    for spider in spider_list:
        # Parse the configured URL once and keep only scheme + host.
        parsed = urlparse(spider["config"]["url"])
        content["spiderTasks"].append({
            "name": spider["name"],
            "url": parsed.scheme + "://" + parsed.netloc,
            "description": spider["description"],
            "recentTime": datetime.datetime.now(),
            "spiderName": spider["config"]["name"],
            "useRange": spider["config"]["useRange"],
            "startIndex": spider["startIndex"],
            "endIndex": spider["endIndex"],
        })

    content["taskList"] = getLogInfoList()
    content["taskList"].sort(key=__sortByDatetime, reverse=True)
    return render(request, "proxy/proxyList.html", context=content)
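# getLogInfoList() is project-local and not shown here; the sort above only
# assumes each entry carries a "time" key. A hypothetical stub, useful only
# for reading this view in isolation:
def getLogInfoList_stub():
    return [
        {"file": "jiangxianli-abc123.log", "time": datetime.datetime(2020, 1, 2, 3, 4, 5)},
        {"file": "genericSpider-def456.log", "time": datetime.datetime(2020, 1, 1, 3, 4, 5)},
    ]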
def post(self, request):
    receive_data = checkJson(request)
    if receive_data is False:
        return JsonResponse({
            "flag": False,
            "message": "invalid spider info parameters",
        })
    db = MongoDBCli()
    db_ret = db.setOneSpider(receive_data)
    if db_ret is not None:
        return JsonResponse({"flag": True, "message": "OK"})
    return JsonResponse({"flag": False, "message": "edit failed"})
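# checkJson() is project-local; from its call sites it evidently parses the
# request body as JSON and returns False on malformed input. A minimal sketch
# under that assumption:
def checkJson_sketch(request):
    try:
        return json.loads(request.body)
    except (json.JSONDecodeError, TypeError):
        return False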
def get(self, request, spider_name):
    db = MongoDBCli()
    spider = db.getOneSpiderFromSpiderName(spider_name)
    if spider is None:
        return render(request, "404.html")
    content = {
        "spider": spider,
        "spider_config_json": json.dumps(spider["config"]),
    }
    return render(request, "proxy/editSpiderConfig.html", context=content)
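# Shape of a spider document as this code reads it, inferred from the fields
# accessed across these views; the values here are illustrative only:
EXAMPLE_SPIDER_DOC = {
    "name": "jiangxianli",
    "description": "free proxy list site",
    "startIndex": 1,
    "endIndex": 10,
    "config": {
        "name": "jiangxianli",
        "url": "https://ip.jiangxianli.com/?page=1",
        "useRange": True,
        "oneLine": ["//tbody/tr"],
        "ip": "td[1]/text()",
        "port": "td[2]/text()",
    },
}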
def runSpider(request):
    spider_name = request.POST.get("spiderName")
    if spider_name is None:
        return HttpResponse("bad request")
    db = MongoDBCli()
    spider_config = db.getOneSpiderFromSpiderName(spider_name)
    if spider_config is None:
        return HttpResponse("no such spider")
    result = task_runSpider.delay(
        spider_config["config"]["name"],
        getRandomLogFileName(spider_config["config"]["name"]),
        "-a si={} -a ei={}".format(
            spider_config["startIndex"],
            spider_config["endIndex"],
        ),
    )
    return HttpResponse(result)
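# .delay() returns a celery.result.AsyncResult, so the HttpResponse above
# effectively hands the client the task id; the client could poll it, e.g.:
#
#     from celery.result import AsyncResult
#     AsyncResult(task_id).state   # "PENDING", "STARTED", "SUCCESS", ...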
def task_runSpider(spider_name, log_file=None, param=""):
    if log_file is None:
        log_file = getRandomLogFileName(spider_name + "-beat")
    db = MongoDBCli()
    spider_config = db.getOneSpiderFromSpiderName(spider_name)
    # Only derive the index range from the stored config when the caller did
    # not supply one; the original overwrote the `param` argument unconditionally.
    if not param:
        param = "-a si={} -a ei={}".format(
            spider_config["startIndex"],
            spider_config["endIndex"],
        )
    os.chdir(PROXY_SPIDER_DIR)
    log_file_abs = os.path.join(PROXY_SPIDER_LOG_DIR, log_file)
    cmd = "scrapy crawl genericSpider -a cn={} -s LOG_FILE={} {}".format(
        spider_name, log_file_abs, param)
    # Pass an argument list so Popen works without a shell on POSIX as well as
    # Windows; a bare command string only works there with shell=True.
    subprocess.Popen(cmd.split())
    return log_file
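# task_runSpider is called with .delay() in the view above, so it is
# presumably registered as a Celery task; the "-beat" log suffix suggests a
# periodic trigger as well. A sketch of both (the decorator/app setup is an
# assumption, not shown in this excerpt):
#
#     from celery import shared_task
#
#     @shared_task
#     def task_runSpider(spider_name, log_file=None, param=""):
#         ...
#
#     app.conf.beat_schedule = {
#         "run-spider-hourly": {
#             "task": "tasks.task_runSpider",
#             "schedule": 3600.0,
#             "args": ("jiangxianli",),
#         },
#     }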
def post(self, request, spider_name):
    db = MongoDBCli()
    spider = db.getOneSpiderFromSpiderName(spider_name)
    if spider is None:
        return JsonResponse({
            "flag": False,
            "message": "no such spider",
        })
    r_tmp = checkJson(request)
    if r_tmp is False:
        return JsonResponse({
            "flag": False,
            "message": "invalid spider info parameters",
        })
    db_ret = db.setOneSpider(r_tmp)
    if db_ret is not None:
        return JsonResponse({"flag": True, "message": "OK"})
    return JsonResponse({"flag": False, "message": "edit failed"})
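# One plausible URLconf wiring for the views above; the route strings and the
# class-based view name (SpiderConfigView) are assumptions, not from the source.
from django.urls import path

urlpatterns = [
    path("proxy/", proxyList),
    path("proxy/run/", runSpider),
    path("proxy/edit/<str:spider_name>/", SpiderConfigView.as_view()),
]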
"useRange": True, "header": { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Host": "ip.jiangxianli.com", "Pragma": "no-cache", "Referer": "https://ip.jiangxianli.com/?page=1", "Upgrade-Insecure-Requests": "1", }, "oneLine": ["//tbody/tr", ], "ip": "td[1]/text()", "port": "td[2]/text()" }, ] db = MongoDBCli() SPIDER_INFO_LIST = [] SPIDER_INFO_LIST.extend(db.getAllSpiderConfig()) SPIDER_NAME_LIST = [ x["name"] for x in SPIDER_INFO_LIST ] if __name__ == "__main__": print(SPIDER_NAME_LIST) print(SPIDER_INFO_LIST) print(json.dumps(SPIDER_INFO_LIST1[2]))