def __init__(self, *args, **kwargs):
    super(CaijingScrapyPipeline, self).__init__(*args, **kwargs)
    self.topic_analyse = article_analyse('topic.wooght')
    self.news_analyse = article_analyse('news.wooght')
    self.add_nums = 0
    # Only extract data from the last three months
    self.min_time = time.time() - 90 * 24 * 3600
    wfunc.e('analyse news success!')
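# The parsers below lean on wfunc.time_num() to turn a formatted time string
# into a Unix timestamp comparable with self.min_time above. The helper itself
# is not shown here; a minimal sketch, assuming it simply wraps
# time.strptime/time.mktime:
import time

def time_num(time_str, fmt):
    # Parse time_str with the given strptime format and return epoch seconds,
    # so results compare against cutoffs like time.time() - 90 * 24 * 3600.
    return time.mktime(time.strptime(time_str, fmt))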
def parse_yicai(self, response):
    items = NewsItem()
    items['title'] = response.xpath('//head/title/text()').extract()[0].strip()
    thetime = response.xpath('//div[@class="m-title f-pr"]/h2//span[2]/text()').extract()[0].strip()
    items['put_time'] = wfunc.time_num(thetime, "%Y-%m-%d %H:%M")
    items['url'] = response.url
    # The numeric article id in the URL doubles as the unique id
    h_num = re.search(r'/(\d+)\.html', items['url'], re.I).group(1)
    items['only_id'] = h_num
    items['body'] = response.xpath('//div[@class="m-text"]').extract()[0].strip()
    wfunc.e('yicai_news:' + items['title'])
    yield items
def parse_vreport(self, response):
    items = TopicItem()
    items['title'] = response.xpath('//h1/text()').extract_first().strip()
    thetime = response.xpath('//div[@class="creab"]/span[4]/text()').extract()[0].strip()
    # The span text carries a label before a colon; keep the date after it
    items['put_time'] = wfunc.time_num(thetime.split(':')[1], '%Y-%m-%d')
    url_re = re.search(r'/(\d+)/index\.phtml$', response.url, re.I)
    items['url'] = response.url
    items['only_id'] = url_re.group(1)
    items['body'] = response.xpath('//div[@class="blk_container"]').extract()[0].strip()
    wfunc.e('sina_topic:' + items['title'])
    yield items
def parse_topic(self, response):
    items = TopicItem()
    thetime = response.xpath('//span[@class="timer"]/text()').extract_first().strip()[:10]
    items['put_time'] = wfunc.time_num(thetime, "%Y-%m-%d")
    items['title'] = response.xpath('//h1[@class="title"]/text()').extract_first().strip()
    # Join all paragraph texts into one body string
    body = response.xpath('//div[@id="qmt_content_div"]//p/text()').extract()
    items['body'] = ''.join(body)
    items['url'] = response.url
    # e.g. http://ggjd.cnstock.com/company/scp_ggjd/tjd_bbdj/201604/3765369.htm
    url_re = re.search(r'.*/company/scp_ggjd/tjd_bbdj/(\d+)/(\d+)\.htm$', items['url'], re.I)
    items['only_id'] = url_re.group(1) + url_re.group(2)
    wfunc.e('cnstock_topics:' + items['title'])
    yield items
def parse_jrj(self, response):
    items = TopicItem()
    items['title'] = response.xpath('//div[@class="titmain"]/h1/text()').extract()[2].strip()
    thetime = response.xpath('//p[@class="inftop"]//span[1]/text()').extract_first().strip()
    # The span holds "YYYY-MM-DD HH:MM"; only the date part is needed
    thedate = thetime.split(" ")[0]
    items['put_time'] = wfunc.time_num(thedate, '%Y-%m-%d')
    url_re = re.search(r'.*/(\d+)/(\d+)/(\d+)\.shtml$', response.url, re.I)
    items['url'] = response.url
    items['only_id'] = url_re.group(3)
    items['body'] = response.xpath('//div[@class="texttit_m1"]').extract()[0].strip()
    wfunc.e('jrj_topic:' + items['title'])
    yield items
def parse_163_money(self, response):
    # e.g. http://money.163.com/17/1114/13/D375MGIB0025814V.html
    items = NewsItem()
    items['title'] = response.xpath('//div[@id="epContentLeft"]/h1[1]/text()').extract()[0].strip()
    # Concatenate every paragraph of the article body
    bodys = response.xpath('//div[@id="endText"]//p').extract()
    items['body'] = ''.join(p.strip() for p in bodys)
    items['url'] = response.url
    url_re = re.search(r'.*\.163\.com/\d+/\d+/\d+/(\w*)\.html$', items['url'], re.I)
    items['only_id'] = url_re.group(1)
    thetime = response.xpath('//div[@class="post_time_source"]/text()').extract_first().strip()
    # The first 16 characters hold "YYYY-MM-DD HH:MM"
    items['put_time'] = wfunc.time_num(thetime[:16], "%Y-%m-%d %H:%M")
    wfunc.e('163_news:' + items['title'])
    yield items
def parse_notices(self, response):
    items = NoticesItem()
    items['datatime'] = response.xpath('//span[@class="timer"]/text()').extract_first().strip()[:10]
    items['title'] = response.xpath('//h1[@class="title"]/text()').extract_first().strip()
    # The first keyword in the meta tag is the stock code, when present
    meta = response.xpath('//meta[@name="keywords"]/@content').extract_first()
    company = meta.split(' ')
    if len(company) > 1:
        items['code_id'] = company[0]
    else:
        items['code_id'] = 0
    # Join all paragraph texts into one body string
    body = response.xpath('//div[@id="qmt_content_div"]//p/text()').extract()
    items['body'] = ''.join(body)
    wfunc.e('notices:' + items['title'])
    yield items
def parse_xueqiu(self, response):
    items = TopicItem()
    items['title'] = response.xpath('//title/text()').extract()[0].strip()
    # Regular posts carry the timestamp on the "time" anchor; edited posts use "edit-time"
    thetime = response.xpath('//a[@class="time"]/@data-created_at').extract()
    if len(thetime) < 1:
        thetime = response.xpath('//a[@class="edit-time"]/@data-created_at').extract()
    # Keep the first 10 characters, "YYYY-MM-DD" (the slice end is exclusive)
    items['put_time'] = thetime[0][0:10].strip()
    url_re = re.search(r'.*/(\d+)/(\d+)$', response.url, re.I)
    items['url'] = response.url
    items['only_id'] = url_re.group(1) + url_re.group(2)
    items['body'] = response.xpath('//div[@class="article__bd__detail"]').extract()[0].strip()
    wfunc.e('xueqiu_topic:' + items['title'])
    yield items
def page_parse(self, response):
    max_num = response.meta['max_num']
    json_obj = json.loads(response.body.decode('utf-8'))
    for one in json_obj['statuses']:
        # Build a fresh item per record; reusing one mutable item across yields
        # would let later iterations overwrite earlier ones
        item = QandaItem()
        item['body'] = one['description']
        item['only_id'] = one['id']
        item['put_time'] = one['created_at']
        item['url'] = response.url
        wfunc.e(str(one['id']) + ',success')
        yield item
    # Follow the remaining pages, if any; Scrapy's dupefilter drops the repeats
    # that every paged response would otherwise re-schedule
    if json_obj['maxPage'] > 1:
        num = json_obj['maxPage']
        while num > 1:
            url = ("https://xueqiu.com/interview/answer/list.json?"
                   "interviewId=%s&page=%s&access_token=%s&_=%s"
                   % (str(max_num), num, self.token, self.url_time))
            request_new = scrapy.Request(url, callback=self.page_parse)
            request_new.meta['max_num'] = str(max_num)
            yield request_new
            num -= 1
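# page_parse() above expects meta['max_num'] (the interview id) to be set by
# whoever schedules page 1. A hypothetical seed helper, assuming self.token and
# self.url_time are already populated on the spider:
def request_interview(self, interview_id):
    url = ("https://xueqiu.com/interview/answer/list.json?"
           "interviewId=%s&page=1&access_token=%s&_=%s"
           % (interview_id, self.token, self.url_time))
    request = scrapy.Request(url, callback=self.page_parse)
    request.meta['max_num'] = str(interview_id)
    return request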
def parse_qq_ywq(self, response):
    # e.g. http://stock.qq.com/a/20171107/017324.htm
    items = NewsItem()
    items['title'] = response.xpath('//title/text()').extract()[0].strip()
    # Concatenate every paragraph of the article body
    bodys = response.xpath('//div[@id="Cnt-Main-Article-QQ"]//p').extract()
    items['body'] = ''.join(p.strip() for p in bodys)
    items['url'] = response.url
    url_re = re.search(r'.*a/(\d+)/(\d+)\.htm', items['url'], re.I)
    items['only_id'] = url_re.group(1) + url_re.group(2)
    # The publish time lives in one of two span classes depending on the template
    thetime = response.xpath('//span[@class="a_time"]/text()')
    if len(thetime) < 1:
        thetime = response.xpath('//span[@class="pubTime article-time"]/text()')
    try:
        items['put_time'] = wfunc.time_num(thetime.extract()[0].strip(), "%Y-%m-%d %H:%M")
    except IndexError:
        print('IndexError: time not found -->', response.url)
        return
    wfunc.e('qq_news:' + items['title'])
    yield items
def open_spider(self, spider):
    wfunc.e('spider ' + spider.name + ' --->opened')
    # Preload the ids already stored for this spider so items can be de-duplicated
    if spider.name in ['ddtj', 'ddtj_history']:
        r = T.conn.execute(T.select([T.ddtj.c.only_id]))
        self.ddtj_onlyid = [row[0] for row in r.fetchall()]
    if spider.name == 'xueqiu_zuhe':
        r = T.conn.execute(T.select([T.xq_zuhe.c.zh_symbol]))
        self.zh_list = [row[0] for row in r.fetchall()]
    if spider.name == 'zuhe_change':
        r = T.conn.execute(T.select([T.zuhe_change.c.change_id]))
        self.change_list = [row[0] for row in r.fetchall()]
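# The T module used above is not part of this listing. Going by T.select, T.conn
# and the column attributes, it looks like a thin SQLAlchemy 1.x wrapper; a
# minimal sketch under that assumption (the DSN and database name are
# placeholders, not from the source):
from sqlalchemy import create_engine, MetaData, Table, select

engine = create_engine('mysql+pymysql://user:password@localhost/caijing')
conn = engine.connect()
meta = MetaData()
# Reflect the tables whose known-id columns open_spider() preloads
ddtj = Table('ddtj', meta, autoload=True, autoload_with=engine)
xq_zuhe = Table('xq_zuhe', meta, autoload=True, autoload_with=engine)
zuhe_change = Table('zuhe_change', meta, autoload=True, autoload_with=engine)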
def parse_sina(self, response):
    # e.g. http://finance.sina.com.cn/stock/s/2017-11-06/doc-ifynmvuq9022743.shtml
    items = NewsItem()
    titles = response.xpath('//title/text()').extract()
    items['title'] = titles[0].strip() if len(titles) > 0 else ' '
    # Concatenate every paragraph of the article body
    bodys = response.xpath('//div[@id="artibody"]//p').extract()
    items['body'] = ''.join(p.strip() for p in bodys)
    items['url'] = response.url
    url_re = re.search(r'doc-\D+(\d*)\.shtml', items['url'], re.I)
    items['only_id'] = url_re.group(1)
    # Newer pages carry the time in the top bar; fall back to the old time-source span
    try:
        thetime = response.xpath('//*[@id="top_bar"]/div/div[2]/span[1]').extract_first().strip()
    except AttributeError:
        thetime = response.xpath('//span[@class="time-source"]/text()').extract()[0].strip()
    items['put_time'] = wfunc.sina_get_time(thetime)
    wfunc.e('sina_news:' + items['title'])
    yield items
def start_requests(self):
    # Flag the request for JavaScript rendering; the flag is presumably consumed
    # by a downloader middleware
    r = scrapy.Request(self.start_urls[0], callback=self.wstart_parse)
    r.meta['phantomjs'] = True
    wfunc.e(r)
    yield r
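# Nothing in this listing defines what consumes meta['phantomjs']; presumably a
# downloader middleware renders flagged requests through PhantomJS. A minimal
# sketch, assuming the (now-deprecated) Selenium PhantomJS driver; the class
# name is illustrative, not from the source:
from scrapy.http import HtmlResponse
from selenium import webdriver

class PhantomJSMiddleware(object):
    def __init__(self):
        self.driver = webdriver.PhantomJS()

    def process_request(self, request, spider):
        if not request.meta.get('phantomjs'):
            return None  # let Scrapy download it normally
        # Render the page in the headless browser and hand back the final HTML
        self.driver.get(request.url)
        body = self.driver.page_source.encode('utf-8')
        return HtmlResponse(request.url, body=body, encoding='utf-8', request=request)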
def open_spider(self, spider):
    wfunc.e('spider ' + spider.name + ' --->opened')