def parse(self, response):
    """Callback for a scraped sparkpool API response.

    Collects every worker whose hashrate is <= 0 (considered idle), looks up
    the board port to close for each one, and yields a single item carrying
    the whole result set as a JSON string.

    :param response: Scrapy response; body is the sparkpool JSON payload.
    :yields: SparkpoolItem with 'scan_url' and 'scan_content' (JSON) fields.
    """
    sel = database.conn(self)  # database instance
    jsons = json.loads(response.body)
    content_all = {}   # accumulated per-worker dicts
    dict_index = 0     # running key for content_all
    item = SparkpoolItem()
    for worker in jsons['workers']['data']:
        # A non-positive hashrate marks the rig as idle/offline.
        if worker['hashrate'] <= 0:
            content = {}
            content["computer_name"] = worker['rig']
            content['bar_id'] = self.bar_id[response.url]
            # Parameterized query: computer_name comes from a third-party
            # API response and must never be concatenated into SQL.
            sel.cursor.execute(
                "select bp.id from board_list as b "
                "INNER join board_port_list as bp on b.id = bp.board_id "
                "where bp.close_time>0 and b.bar_id = %s "
                "and FIND_IN_SET(%s, bp.comp_id)",
                (content['bar_id'], content['computer_name']),
            )
            port_id = sel.cursor.fetchone()
            # No matching open port -> record 0 as a sentinel.
            content['port_id'] = 0 if port_id is None else port_id[0]
            content_all[dict_index] = content
            dict_index += 1
    item['scan_url'] = response.url  # URL crawled this pass
    item['scan_content'] = json.dumps(content_all, ensure_ascii=False)  # payload as JSON
    yield item
def __init__(self):
    """Load crawl URLs from the DB and map each bar to its machine names.

    Populates self.start_urls with the crawl targets and self.bar_id with
    {bar_id: [machine_name, ...]} for every bar returned by the URL query.
    """
    self = database.conn(self)  # database instance (conn augments and returns the spider)
    # self.cursor.execute("select url,bar_id from url_list where (url like '%f2pool%' or url like '%vvpool%')")
    self.cursor.execute("select url,bar_id from url_list where id = 14")
    for url, bar_id in self.cursor.fetchall():
        self.start_urls.append(url)  # crawl target
        # Board ids belonging to this bar (parameterized query).
        self.cursor.execute("select id from board_list where bar_id = %s", (bar_id,))
        board_ids = [row[0] for row in self.cursor.fetchall()]
        if not board_ids:
            # The original built "board_id in ()" here, which is invalid SQL.
            self.bar_id[bar_id] = []
            continue
        # All machine names attached to those boards; one %s per id.
        placeholders = ','.join(['%s'] * len(board_ids))
        self.cursor.execute(
            "select comp_id from board_port_list where board_id in (" + placeholders + ")",
            board_ids,
        )
        machine_list = []
        for (comp_id,) in self.cursor.fetchall():
            if comp_id != '':
                # comp_id is itself a comma-separated list of names; flatten it.
                # Lowercased because consumers test computer_name.lower()
                # against this list — mixed-case DB values never matched.
                machine_list.extend(str(comp_id).lower().split(','))
        # bar id -> machine-name list for that bar
        self.bar_id[bar_id] = machine_list
def sparkpool_api_url(url):
    """Rewrite a sparkpool web UI URL into its JSON API equivalent.

    'https://sparkpool.com/#/miner/0xABC' ->
    'https://sparkpool.com/api/page/miner?value=ABC'

    :param url: the URL stored in url_list.
    :returns: the API URL, or None when the URL lacks the '/#' or '0x'
              markers (the original code raised AttributeError on .group()).
    """
    base = re.search(r'.*(?=/#)', url)       # everything before '/#'
    wallet = re.search(r'(?<=0x).*', url)    # everything after '0x'
    if base is None or wallet is None:
        return None
    return base.group() + '/api/page/miner?value=' + wallet.group()


def __init__(self):
    """Query the DB for sparkpool URLs and build start_urls / bar_id map."""
    self = database.conn(self)  # database instance
    self.cursor.execute("select url,bar_id from url_list where url like '%sparkpool%'")
    for url, bar_id in self.cursor.fetchall():
        api_url = sparkpool_api_url(url)
        if api_url is None:
            continue  # malformed row; skip instead of crashing the spider
        self.start_urls.append(api_url)
        # Key by the API URL: parse() looks up self.bar_id[response.url],
        # and response.url is the rewritten API URL, not the original one.
        self.bar_id[api_url] = bar_id
def parse(self, response):
    """Callback for a scraped f2pool workers page.

    Walks the workers table, maps each computer name back to its bar,
    looks up the port to close when the worker appears inactive, and
    yields one item carrying the whole result set as JSON.

    :param response: Scrapy HTML response for the f2pool page.
    :yields: F2poolItem with 'scan_url' and 'scan_content' (JSON) fields.
    """
    sel = database.conn(self)
    content_all = {}   # accumulated per-worker dicts
    dict_index = 0     # running key for content_all
    # Created once, outside the loop: the original built it per-row, so an
    # empty table left `item` unbound and raised NameError below.
    item = F2poolItem()
    for vo in response.xpath('//table[@id="workers"]/tbody/tr'):  # extract rows via xpath
        content = {}
        computer_name = vo.xpath('td[1]')
        content["computer_name"] = computer_name.xpath('string(.)').extract()[0]
        # Default bar id 0; scan every bar's machine list for this name.
        content["bar_id"] = 0
        for idx in self.bar_id:
            if content["computer_name"].lower() in self.bar_id[idx]:
                content["bar_id"] = idx
                break
        default_24_min = vo.xpath('td[4]')
        content["default_24_min"] = default_24_min.xpath('string(.)').extract()[0]
        # AK hashrate lives in column 5
        content["time_local"] = vo.xpath('td[5]/span[1]/script').re(r'\d+\.?\d*')
        # ETH hashrate lives in column 6
        content["time_local6"] = vo.xpath('td[6]/span[1]/script').re(r'\d+\.?\d*')
        # NOTE(review): due to precedence this fires when either hashrate
        # list is NON-empty OR default_24_min is empty — possibly the intent
        # was the inverse. Kept as-is; confirm against the close-port rule.
        if content['time_local'] or content["time_local6"] or content["default_24_min"] == '':
            # Look up the port id to close. Parameterized: the computer
            # name is scraped from a third-party page (untrusted).
            sel.cursor.execute(
                "select bp.id from board_list as b "
                "INNER join board_port_list as bp on b.id = bp.board_id "
                "where bp.close_time>0 and b.bar_id = %s "
                "and FIND_IN_SET(%s, bp.comp_id)",
                (content['bar_id'], content['computer_name']),
            )
            port_id = sel.cursor.fetchone()
            print(port_id)
            content['port_id'] = 0 if port_id is None else port_id[0]
        else:
            content['port_id'] = 0
        content_all[dict_index] = content
        dict_index += 1
    item['scan_url'] = response.url  # URL crawled this pass
    item['scan_content'] = json.dumps(content_all, ensure_ascii=False)  # payload as JSON
    yield item
def parse(self, response):
    """Callback for a scraped uupool online-machine list.

    Maps every listed computer name back to its bar, looks up the port to
    close for each, and yields one item with the result set as JSON.

    :param response: Scrapy HTML response for the uupool page.
    :yields: UupoolItem with 'scan_url' and 'scan_content' (JSON) fields.
    """
    sel = database.conn(self)
    content_all = {}   # accumulated per-machine dicts
    dict_index = 0     # running key for content_all
    # Created once, outside the loop: the original built it per-row, so an
    # empty list left `item` unbound and raised NameError below.
    item = UupoolItem()
    for vo in response.xpath('//*[@id="online-list"]/tr'):  # extract rows via xpath
        content = {}
        computer_name = vo.xpath('td[2]')
        content["computer_name"] = computer_name.xpath('string(.)').extract()[0]
        # Default bar id 0; scan every bar's machine list for this name.
        content["bar_id"] = 0
        for idx in self.bar_id:
            if content["computer_name"].lower() in self.bar_id[idx]:
                content["bar_id"] = idx
                break
        # Look up the port id to close. Parameterized: the computer name
        # is scraped from a third-party page (untrusted).
        sel.cursor.execute(
            "select bp.id from board_list as b "
            "INNER join board_port_list as bp on b.id = bp.board_id "
            "where bp.close_time>0 and b.bar_id = %s "
            "and FIND_IN_SET(%s, bp.comp_id)",
            (content['bar_id'], content['computer_name']),
        )
        port_id = sel.cursor.fetchone()
        content['port_id'] = 0 if port_id is None else port_id[0]
        content_all[dict_index] = content
        dict_index += 1
    item['scan_url'] = response.url  # URL crawled this pass
    item['scan_content'] = json.dumps(content_all, ensure_ascii=False)  # payload as JSON
    yield item
def process_item(self, item, spider):
    """Pipeline stage: persist close-port records extracted by the spiders.

    For f2pool/sparkpool/uupool items, walks the scanned content and writes
    one close_list row per distinct non-zero port id, then commits (rolling
    back on error). vvpool items are currently ignored.

    :param item: scraped item carrying 'scan_content' (JSON string).
    :param spider: the spider instance that produced the item.
    :returns: the item, unchanged (pipeline contract).
    """
    self = database.conn(self)  # database instance
    if spider.name == 'f2pool' or spider.name == 'sparkpool' or spider.name == 'uupool':
        content = item['scan_content']
        content_data = json.loads(content)
        comp_name_list = []      # every computer name the third-party site reported
        port_list = []           # port ids already written this run (dedupe)
        now_bar_id = 0           # bar currently cached
        now_bar_comp_list = []   # current bar's [port_id, comp_id-string] pairs
        all_bar_comp_name = []   # every machine name known for the bars seen
        no_find_list = []        # names in the DB but missing from the site
        for info in content_data:
            computer_name = content_data[info]['computer_name']
            bar_id = content_data[info]['bar_id']
            port_id = content_data[info]['port_id']
            comp_name_list.append(computer_name)
            # Refresh the per-bar machine cache when the bar changes.
            if now_bar_id != bar_id:
                now_bar_comp_list = []
                now_bar_id = bar_id
                self.cursor.execute(
                    "select bp.id,bp.comp_id from board_list as b "
                    "INNER join board_port_list as bp on b.id = bp.board_id "
                    "where bp.close_time>0 and b.bar_id = %s and bp.comp_id <> ''",
                    (bar_id,),
                )
                for comp_list in self.cursor.fetchall():
                    foo = [comp_list[0], comp_list[1]]
                    # comp_id is a comma-separated list of machine names.
                    for voo in comp_list[1].split(','):
                        all_bar_comp_name.append(voo)
                    now_bar_comp_list.append(foo)
            # Write each port id at most once, and skip the 0 sentinel.
            if port_id not in port_list:
                if port_id != 0:
                    port_list.append(port_id)
                    # Parameterized INSERT: computer_name originates from a
                    # scraped third-party page and is untrusted.
                    self.cursor.execute(
                        "INSERT INTO close_list(bar_id, computer_name, board_port_id, created_at) "
                        "VALUES (%s, %s, %s, NOW())",
                        (bar_id, computer_name, port_id),
                    )
        print('comp_name_list', comp_name_list)
        # Disabled follow-up pass: collect DB machines the site did NOT
        # report (no_find_list) and close their ports too. Kept for reference.
        # for comp in all_bar_comp_name:
        #     if comp not in comp_name_list:
        #         no_find_list.append(comp)
        # for comp in no_find_list:
        #     for vo in now_bar_comp_list:
        #         if comp in vo[1].split(','):
        #             if vo[0] not in port_list:
        #                 if port_id != 0:
        #                     port_list.append(vo[0])
        #                     self.cursor.execute(
        #                         "INSERT INTO close_list(bar_id, computer_name, board_port_id, created_at) "
        #                         "VALUES (%s, %s, %s, NOW())",
        #                         (bar_id, comp, vo[0]),
        #                     )
        try:
            self.db.commit()
        except Exception as e:
            # Log and roll back so a failed batch leaves no partial writes.
            print(e)
            self.db.rollback()
    elif spider.name == 'vvpool':
        pass  # vvpool items are intentionally not persisted yet
    return item