    def __init__(self):
        PathUtils.initDir()  # Create the output directories
        # Database access
        dbUtils = DbUtils('config_3_series')
        # self.queryItems = dbUtils.select({"id": 511})
        # self.queryItems = dbUtils.select(None)
        self.queryItems = dbUtils.selectByPage(None, 0, 1)
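# --- Illustrative sketch (not part of the original source) ---
# The spiders call DbUtils.select / DbUtils.selectByPage to read previously
# crawled records back out of the database. A minimal sketch of such a wrapper,
# assuming a local MongoDB backend via pymongo; the connection URL, database
# name, and method bodies are assumptions, not the project's actual code.
from pymongo import MongoClient


class DbUtilsSketch:
    def __init__(self, collection_name):
        client = MongoClient('mongodb://localhost:27017/')
        self.collection = client['autohome'][collection_name]

    # Return all documents matching the query (None means no filter)
    def select(self, query):
        return self.collection.find(query or {})

    # Return one page of documents, given a 0-based page index and a page size
    def selectByPage(self, query, page, size):
        return self.collection.find(query or {}).skip(page * size).limit(size)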
                             callback=self.articleCommentNum,
                             meta={"item": article_item})

    # Fetch the number of comments for an article
    def articleCommentNum(self, response):
        article_item = response.meta['item']
        response_json = json.loads(response.text)
        article_item['comment_num'] = response_json['result']['objcounts'][0]['replycountall']
        yield article_item

    # Build the breadcrumb string for the article's location on the site
    def getLocation(self, originList):
        locationList = []
        for item in originList:
            if len(item.strip()) > 0:
                locationList.append(item.strip())
        # Join the non-empty fragments with '->'
        return '->'.join(locationList)


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'news_2_article'])
    dbUtils = DbUtils('news_2_article')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('news', 'news_2_article', list(queryItems))
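# --- Illustrative example (not part of the original source) ---
# getLocation strips empty breadcrumb fragments and joins the rest with '->'.
# A standalone illustration with made-up input strings:
origin = ['\n  ', '首页', ' 新闻 ', ' 正文 ']
breadcrumb = '->'.join(s.strip() for s in origin if s.strip())
print(breadcrumb)  # 首页->新闻->正文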
    def __init__(self):
        # Database access
        dbUtils = DbUtils('news_1_main')
        self.queryItems = dbUtils.selectByPage(None, 0, 100)
import json

import scrapy
from scrapy.cmdline import execute

from autohome.items import BrandItem
from autohome.spiders.utils.DbUtils import DbUtils
from autohome.spiders.utils.ExcelUtils import ExcelUtils


class BrandSpider(scrapy.Spider):
    name = 'config_1_brand'
    start_urls = [
        'https://www.autohome.com.cn/ashx/AjaxIndexCarFind.ashx?type=1'
    ]

    def parse(self, response):
        responseBody = response.body.decode(response.encoding)
        brandItems = json.loads(responseBody)['result']['branditems']
        for item in brandItems:
            brandItem = BrandItem()
            brandItem['_id'] = item['id']
            brandItem['name'] = item['name']
            brandItem['bfirstletter'] = item['bfirstletter']
            brandItem['logo'] = item['logo']
            yield brandItem


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'config_1_brand'])
    dbUtils = DbUtils('config_1_brand')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('config', 'config_1_brand', list(queryItems))
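# --- Illustrative sketch (not part of the original source) ---
# Each __main__ block reads the crawled items back from the database, so the
# project presumably has an item pipeline that persists every yielded item.
# A minimal sketch of such a pipeline, assuming MongoDB and that the spider
# name doubles as the collection name; this is a guess, not the real pipeline.
import pymongo


class MongoPipelineSketch:
    def open_spider(self, spider):
        self.client = pymongo.MongoClient('mongodb://localhost:27017/')
        self.collection = self.client['autohome'][spider.name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Insert the item as a plain dict so MongoDB can store it
        self.collection.insert_one(dict(item))
        return item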
    def from_crawler(cls, crawler):
        # Load the proxy IP pool from the database when the middleware is created
        dbUtils = DbUtils('ip_pool')
        queryItems = dbUtils.select(None)
        return cls(ip_pool=list(queryItems))
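# --- Illustrative sketch (not part of the original source) ---
# from_crawler above hands the queried ip_pool records to the middleware's
# constructor; a downloader middleware would then attach one proxy per request.
# A minimal sketch of that usage, assuming each ip_pool document carries 'ip'
# and 'port' fields (the field names and the class name are assumptions).
import random


class RandomProxyMiddlewareSketch:
    def __init__(self, ip_pool):
        self.ip_pool = ip_pool

    def process_request(self, request, spider):
        # Pick a random proxy from the pool for every outgoing request
        if self.ip_pool:
            proxy = random.choice(self.ip_pool)
            request.meta['proxy'] = 'http://{}:{}'.format(proxy['ip'], proxy['port'])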
            item['content'] = jsonItem['RContent']
            item['userImgUrl'] = "https:" + jsonItem['RUserHeaderImage']
            item['userName'] = jsonItem['RMemberName']
            item['time'] = jsonItem['replydate']
            item['floor'] = jsonItem['RFloor']
            item['id'] = re.search('&id=([0-9]*)', response.url).group(1)
            yield item

        # Request the next page if more comments remain than have been received
        if receiveLen < commentCount:
            try:
                page = response.meta['page'] + 1
            except KeyError:
                # First follow-up request: the initial response carries no page in meta
                page = 2
            url = ('https://reply.autohome.com.cn/api/comments/show.json'
                   '?count=50&page={}&id={}&appid=1&datatype=jsonp&order=0&replyid=0').format(page, item['id'])
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 meta={"page": page, "receiveLen": receiveLen})


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'news_3_comment'])
    dbUtils = DbUtils('news_3_comment')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('news', 'news_3_comment', list(queryItems))
    def __init__(self):
        # Database access
        dbUtils = DbUtils('news_2_article')
    # Save the workbook to disk
    def save(self):
        PathUtils.initDir()
        rootPath = PathUtils.getRootPath()
        self.workBook.save(rootPath + '/output/config/config_6_config.xlsx')

    # Generate the Excel file from the query results
    def generateExcel(self, resultList):
        for i in range(len(resultList)):
            resultItem = {}
            # Parse the document into a flat dict
            self.resolveJson(resultList[i], resultItem)
            # Write the header row
            self.createHeader(resultItem)
            # Write the data rows
            self.createContent(resultItem)
        # Save the workbook
        self.save()


if __name__ == "__main__":
    # Database access
    dbUtils = DbUtils('config_6_config')
    queryItems = dbUtils.selectByPage(None, 0, 3)
    excel = Excel()
    excel.generateExcel(list(queryItems))
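# --- Illustrative sketch (not part of the original source) ---
# resolveJson is referenced above but not shown; it presumably flattens each
# nested document into a flat dict before the header and content rows are
# written. A possible implementation; the 'parent-child' key naming is a guess.
def resolve_json_sketch(source, result_item, prefix=''):
    for key, value in source.items():
        flat_key = key if not prefix else prefix + '-' + key
        if isinstance(value, dict):
            resolve_json_sketch(value, result_item, flat_key)
        else:
            result_item[flat_key] = value


doc = {'_id': 1, 'engine': {'power': '135kW', 'torque': '270Nm'}}
flat = {}
resolve_json_sketch(doc, flat)
print(flat)  # {'_id': 1, 'engine-power': '135kW', 'engine-torque': '270Nm'}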
            columnWidth = self.strLen((list(resultList[0].keys()))[i])
            # Collect the values of this column
            columnList = self.getColumnList(resultList, i)
            # Find the widest value in the column
            for j in range(len(columnList)):
                currentWidth = self.strLen(str(columnList[j]))
                columnWidth = self.getMax(columnWidth, currentWidth)
            # Set the column width (xlwt measures widths in 1/256 of a character)
            if 10 < columnWidth < 30:
                workSheet.col(i).width = 256 * (columnWidth + 1)

    # Collect the values of one column
    def getColumnList(self, resultList, i):
        columnList = []
        for k in range(len(resultList)):
            columnList.append((list(resultList[k].values()))[i])
        return columnList

    # Return the larger of two values
    def getMax(self, a, b):
        if a > b:
            return a
        return b


if __name__ == "__main__":
    dbUtils = DbUtils('3_series')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('config', '3_series', list(queryItems))
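# --- Illustrative sketch (not part of the original source) ---
# strLen is not shown above; since the computed widths feed xlwt's
# 1/256-of-a-character column units, it presumably counts full-width (CJK)
# characters as two units so Chinese headers are not truncated. One way to
# write it; this is an assumption about the real helper, not its actual code.
import unicodedata


def str_len_sketch(text):
    width = 0
    for ch in str(text):
        width += 2 if unicodedata.east_asian_width(ch) in ('F', 'W') else 1
    return width


print(str_len_sketch('3 Series'))  # 8
print(str_len_sketch('宝马3系'))    # 7 (three full-width characters plus one digit)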
    def __init__(self):
        PathUtils.initDir()  # Initialize the output directories
        dbUtils = DbUtils('config_5_spec')  # Open the database collection
        self.queryItems = dbUtils.select(None)  # Query the data
    def __init__(self):
        # Database access
        dbUtils = DbUtils('config_1_brand')
        self.queryItems = dbUtils.select(None)
    def __init__(self):
        # Database access
        dbUtils = DbUtils('config_3_series')
        self.queryItems = dbUtils.select(None)
    def __init__(self):
        # Database access
        dbUtils = DbUtils('config_3_series')
        self.queryItems = dbUtils.select(None)

    def start_requests(self):
        for item in self.queryItems:
            url = 'https://www.autohome.com.cn/ashx/AjaxIndexCarFind.ashx?type=5&value=%s' % item['id']
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        responseBody = response.body.decode(response.encoding)
        yearItems = json.loads(responseBody)['result']['yearitems']
        for yearItem in yearItems:
            for item in yearItem['specitems']:
                resultItem = YearItem()
                resultItem['id'] = item['id']
                resultItem['name'] = item['name']
                resultItem['parentId'] = yearItem['id']
                yield resultItem


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'config_5_spec'])
    dbUtils = DbUtils('config_5_spec')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('config', 'config_5_spec', list(queryItems))
    def __init__(self):
        # Database access
        dbUtils = DbUtils('config_1_brand')
        self.queryItems = dbUtils.select(None)

    def start_requests(self):
        for item in self.queryItems:
            url = 'https://www.autohome.com.cn/ashx/AjaxIndexCarFind.ashx?type=3&value=%s' % item['_id']
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # The parent brand id is carried in the request URL's query string
        parentId = str(response.url).strip().split("value=")[-1]
        responseBody = response.body.decode(response.encoding)
        factoryItems = json.loads(responseBody)['result']['factoryitems']
        for item in factoryItems:
            factoryItem = FactoryItem()
            factoryItem['id'] = item['id']
            factoryItem['name'] = item['name']
            factoryItem['firstLetter'] = item['firstletter']
            factoryItem['parentId'] = parentId
            yield factoryItem


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'config_2_factory'])
    dbUtils = DbUtils('config_2_factory')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('config', 'config_2_factory', list(queryItems))
# coding=utf-8
import json
import os
import re

from selenium import webdriver

from autohome.items import ConfigItem
from autohome.spiders.utils.DbUtils import DbUtils
from autohome.spiders.utils.ExcelUtils import ExcelUtils
from autohome.spiders.utils.PathUtils import PathUtils

if __name__ == "__main__":
    # Database access
    dbUtils = DbUtils('config_6_config')
    # Root directory of the current project
    rootPath = PathUtils.getRootPath()
    # JavaScript to inject: override document.createElement so the CSS rules
    # the page inserts at runtime are captured in the `rules` string
    injectJs = (
        "let rules = '';"
        "document.createElement = function() {"
        "    return {"
        "        sheet: {"
        "            insertRule: function(rule, i) {"
        "                if (rules.length == 0) {"
        "                    rules = '#' + rule;"
        "                } else {"
        "                    rules = rules + '#' + rule;"
        "                }"
        "            }"
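# --- Illustrative sketch (not part of the original source) ---
# injectJs overrides document.createElement so the CSS rules the page inserts
# at runtime end up in the `rules` string. To take effect it must run before
# the page's own scripts; with Chrome this is typically done via the DevTools
# protocol. The spec-page URL below is a made-up example, and this injection
# approach is an assumption about the project, not taken from its source.
driver = webdriver.Chrome()
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {'source': injectJs})
driver.get('https://car.autohome.com.cn/config/spec/1.html')  # example spec page URL
rules = driver.execute_script('return rules;')  # read back the captured CSS rules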
                    './/img/@src').extract()[0]
                article_short_item['title'] = each_short_article.xpath(
                    './/h3/text()').extract()[0]
                article_short_item['publicTime'] = each_short_article.xpath(
                    './/span[@class="fn-left"]/text()').extract()[0]
                article_short_item['readNum'] = each_short_article.xpath(
                    './/span[@class="fn-right"]//em[1]/text()').extract()[0]
                article_short_item['shortContent'] = ''.join(
                    each_short_article.xpath('.//p/text()').extract()).strip()
                yield article_short_item

        # Follow the link to the next page of the article list
        next_url_part = response.xpath(
            '//div[@id="channelPage"]/a[@class="page-item-next"]/@href').extract()[0]
        if next_url_part != '':
            article_next_url = 'http://www.autohome.com.cn{}'.format(next_url_part)
            yield scrapy.http.Request(article_next_url, callback=self.parse)


if __name__ == "__main__":
    execute(['scrapy', 'crawl', 'news_1_main'])
    dbUtils = DbUtils('news_1_main')
    queryItems = dbUtils.select(None)
    excelUtils = ExcelUtils()
    excelUtils.generateExcel('news', 'news_1_main', list(queryItems))