Example 1
 def __init__(self, *args, **kwargs):
     super(CaijingScrapyPipeline, self).__init__(*args, **kwargs)
     self.topic_analyse = article_analyse('topic.wooght')
     self.news_analyse = article_analyse('news.wooght')
     self.add_nums = 0
     self.min_time = time.time() - 90 * 24 * 3600  # only extract data from the past three months
     wfunc.e('analyse new success!')
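All of these snippets lean on a project-specific helper module, wfunc, whose source is never shown. A minimal sketch of the two helpers they call, assuming time_num parses a formatted time string into a Unix timestamp and e is a logging shortcut (both bodies are guesses, not the project's actual code):

 import time

 def time_num(timestr, fmt):
     # assumed behavior: parse a formatted time string into a Unix timestamp
     return time.mktime(time.strptime(timestr, fmt))

 def e(msg):
     # assumed behavior: minimal logging shortcut used throughout the spiders
     print(msg)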
Example 2
 def parse_yicai(self, response):
     items = NewsItem()
     items['title'] = response.xpath('//head/title/text()').extract()[0].strip()
     thetime = response.xpath('//div[@class="m-title f-pr"]/h2//span[2]/text()').extract()[0].strip()
     items['put_time'] = wfunc.time_num(thetime, "%Y-%m-%d %H:%M")
     items['url'] = response.url
     h_num = re.search(r'\/(\d+)\.html', items['url'], re.I).group(1)
     items['only_id'] = h_num
     items['body'] = response.xpath('//div[@class="m-text"]').extract()[0].strip()
     wfunc.e('yicai_news:'+items['title'])
     yield items
Example 3
 def parse_vreport(self, response):
     items = TopicItem()
     items['title'] = response.xpath('//h1/text()').extract_first().strip()
     thetime = response.xpath(
         '//div[@class="creab"]/span[4]/text()').extract()[0].strip()
     thetime = time_num(thetime.split(':')[1], '%Y-%m-%d')
     items['put_time'] = thetime
     url_re = re.search(r'\/(\d+)\/index\.phtml$', response.url, re.I)
     items['url'] = response.url
     items['only_id'] = url_re.group(1)
     items['body'] = response.xpath(
         '//div[@class="blk_container"]').extract()[0].strip()
     e('sina_topic:' + items['title'])
     yield items
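Note that parse_vreport, like most of the parsers below, calls .group() directly on the result of re.search, which raises AttributeError whenever the URL does not match the pattern. A hedged guard; the skip-and-log behavior is an assumption, not part of the original code:

 url_re = re.search(r'\/(\d+)\/index\.phtml$', response.url, re.I)
 if url_re is None:
     # hypothetical fallback: log and drop pages whose URL has an unexpected shape
     e('sina_topic: unexpected url --> ' + response.url)
     return
 items['only_id'] = url_re.group(1)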
Example 4
 def parse_topic(self, response):
     items = TopicItem()
     thetime = response.xpath('//span[@class="timer"]/text()').extract_first().strip()[:10]
     items['put_time'] = wfunc.time_num(thetime,"%Y-%m-%d")
     items['title'] = response.xpath('//h1[@class="title"]/text()').extract_first().strip()
     body = response.xpath('//div[@id="qmt_content_div"]//p/text()').extract()
     items['body'] = ''.join(body)  # join the paragraph texts (avoids shadowing the built-in str)
     items['url'] = response.url
     url_re = re.search(r'.*\/company\/scp_ggjd\/tjd_bbdj\/(\d+)\/(\d+)\.htm$', items['url'], re.I)  # e.g. http://ggjd.cnstock.com/company/scp_ggjd/tjd_bbdj/201604/3765369.htm
     items['only_id'] = url_re.group(1) + url_re.group(2)
     wfunc.e('cnstock_topics:'+items['title'])
     yield items
Example 5
 def parse_jrj(self, response):
     items = TopicItem()
     items['title'] = response.xpath(
         '//div[@class="titmain"]/h1/text()').extract()[2].strip()
     thetime = response.xpath(
         '//p[@class="inftop"]//span[1]/text()').extract_first().strip()
     thedata = thetime.split(" ")
     items['put_time'] = wfunc.time_num(thedata[0], '%Y-%m-%d')
     url_re = re.search(r'.*\/(\d+)\/(\d+)\/(\d+)\.shtml$', response.url,
                        re.I)
     items['url'] = response.url
     items['only_id'] = url_re.group(3)
     items['body'] = response.xpath(
         '//div[@class="texttit_m1"]').extract()[0].strip()
     wfunc.e('jrj_topic:' + items['title'])
     yield items
Example 6
 def parse_163_money(self, response):
     # http://money.163.com/17/1114/13/D375MGIB0025814V.html
     items = NewsItem()
     items['title'] = response.xpath('//div[@id="epContentLeft"]/h1[1]/text()').extract()[0].strip()
     bodys = response.xpath('//div[@id="endText"]//p').extract()
     items['body'] = ''.join(p.strip() for p in bodys)
     items['url'] = response.url
     url_re = re.search(r'.*\.163\.com\/\d+\/\d+\/\d+\/(\w*)\.html$', items['url'], re.I)
     items['only_id'] = url_re.group(1)
     thetime = response.xpath('//div[@class="post_time_source"]/text()').extract_first().strip()
     items['put_time'] = wfunc.time_num(thetime[:16], "%Y-%m-%d %H:%M")
     wfunc.e('163_news:' + items['title'])
     yield items
Example 7
 def parse_notices(self, response):
     items = NoticesItem()
     items['datatime'] = response.xpath('//span[@class="timer"]/text()').extract_first().strip()[:10]
     items['title'] = response.xpath('//h1[@class="title"]/text()').extract_first().strip()
     meta = response.xpath('//meta[@name="keywords"]/@content').extract_first()
     company = meta.split(' ')
     if len(company) > 1:
         items['code_id'] = company[0]
     else:
         items['code_id'] = 0
     body = response.xpath('//div[@id="qmt_content_div"]//p/text()').extract()
     items['body'] = ''.join(body)  # join the paragraph texts (avoids shadowing the built-in str)
     wfunc.e('notices:'+items['title'])
     yield items
Example 8
 def parse_xueqiu(self, response):
     items = TopicItem()
     items['title'] = response.xpath('//title/text()').extract()[0].strip()
     thetime = response.xpath(
         '//a[@class="time"]/@data-created_at').extract()
     if len(thetime) < 1:
         thetime = response.xpath(
             '//a[@class="edit-time"]/@data-created_at').extract()
     # thetime = wfunc.search_time(thetime)
     items['put_time'] = thetime[0][0:10].strip()  # keep the first 10 characters (end index exclusive)
     url_re = re.search(r'.*\/(\d+)\/(\d+)$', response.url, re.I)
     items['url'] = response.url
     items['only_id'] = url_re.group(1) + url_re.group(2)
     items['body'] = response.xpath(
         '//div[@class="article__bd__detail"]').extract()[0].strip()
     wfunc.e('xueqiu_topic:' + items['title'])
     yield items
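The data-created_at attribute scraped here appears to be a millisecond Unix timestamp, which is why the code keeps only the first 10 characters (the seconds part). A quick check of that assumption; the literal value below is made up:

 import time

 created_at = '1510632000000'    # hypothetical millisecond timestamp
 seconds = int(created_at[:10])  # first 10 digits = seconds since the epoch
 print(time.strftime('%Y-%m-%d %H:%M', time.localtime(seconds)))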
Example 9
 def page_parse(self, response):
     item = QandaItem()
     max_num = response.meta['max_num']
     json_obj = json.loads(response.body.decode('utf-8'))
     for one in json_obj['statuses']:
         item['body'] = one['description']
         item['only_id'] = one['id']
         item['put_time'] = one['created_at']
         item['url'] = response.url
         wfunc.e(str(one['id'])+',success')
         yield item
     # if there are more pages, schedule a request for each one
     if json_obj['maxPage'] > 1:
         num = json_obj['maxPage']
         while num > 1:
             url = "https://xueqiu.com/interview/answer/list.json?interviewId=%s&page=%s&access_token=%s&_=%s" % (str(max_num), num, self.token, self.url_time)
             request_new = scrapy.Request(url, callback=self.page_parse)
             request_new.meta['max_num'] = str(max_num)
             yield request_new
             num -= 1
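One caveat: every paginated response runs through this same loop, so requests for all pages are re-yielded on each page; Scrapy's default dupefilter drops the duplicates silently. A sketch of scheduling each page exactly once instead, assuming the first response is tagged with page=1 in its meta (the page key is hypothetical):

 # schedule follow-up pages only from the first response
 if response.meta.get('page', 1) == 1:
     for num in range(2, json_obj['maxPage'] + 1):
         url = ("https://xueqiu.com/interview/answer/list.json"
                "?interviewId=%s&page=%s&access_token=%s&_=%s"
                % (str(max_num), num, self.token, self.url_time))
         yield scrapy.Request(url, callback=self.page_parse,
                              meta={'max_num': str(max_num), 'page': num})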
Example 10
 def parse_qq_ywq(self, response):
     # http://stock.qq.com/a/20171107/017324.htm
     items = NewsItem()
     items['title'] = response.xpath('//title/text()').extract()[0].strip()
     bodys = response.xpath('//div[@id="Cnt-Main-Article-QQ"]//p').extract()  # this returns a list
     items['body'] = ''.join(p.strip() for p in bodys)
     items['url'] = response.url
     url_re = re.search(r'.*a\/(\d+)\/(\d+)\.htm', items['url'], re.I)
     items['only_id'] = url_re.group(1) + url_re.group(2)
     thetime = response.xpath('//span[@class="a_time"]/text()')
     if len(thetime) < 1:
         thetime = response.xpath('//span[@class="pubTime article-time"]/text()')
     try:
         items['put_time'] = wfunc.time_num(thetime.extract()[0].strip(), "%Y-%m-%d %H:%M")
     except IndexError:
         print('IndexError: publication time not found -->', response.url)
         return
     wfunc.e('qq_news:' + items['title'])
     yield items
Example 11
 def open_spider(self, spider):
     wfunc.e('spider ' + spider.name + ' --->opened')
     if spider.name in ['ddtj', 'ddtj_history']:
         s = T.select([T.ddtj.c.only_id])
         r = T.conn.execute(s)
         self.ddtj_onlyid = [item[0] for item in r.fetchall()]
     if spider.name == 'xueqiu_zuhe':
         s = T.select([T.xq_zuhe.c.zh_symbol])
         r = T.conn.execute(s)
         self.zh_list = [item[0] for item in r.fetchall()]
     if spider.name == 'zuhe_change':
         s = T.select([T.zuhe_change.c.change_id])
         r = T.conn.execute(s)
         self.change_list = [item[0] for item in r.fetchall()]
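The ID lists loaded here are presumably checked against incoming items for de-duplication; a set makes that membership test O(1) instead of the list's O(n). A sketch under that assumption (the DedupPipeline class and its process_item body are hypothetical, not the project's code):

 from scrapy.exceptions import DropItem

 class DedupPipeline(object):
     # hypothetical pipeline illustrating set-based de-duplication
     def open_spider(self, spider):
         if spider.name in ['ddtj', 'ddtj_history']:
             r = T.conn.execute(T.select([T.ddtj.c.only_id]))
             self.ddtj_onlyid = {row[0] for row in r.fetchall()}  # set: O(1) lookups

     def process_item(self, item, spider):
         if item['only_id'] in self.ddtj_onlyid:
             raise DropItem('duplicate item: %s' % item['only_id'])
         return item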
Example 12
 def parse_sina(self, response):
     # http://finance.sina.com.cn/stock/s/2017-11-06/doc-ifynmvuq9022743.shtml
     items = NewsItem()
     if len(response.xpath('//title/text()').extract()) > 0:
         items['title'] = response.xpath('//title/text()').extract()[0].strip()
     else:
         items['title'] = ' '
     bodys = response.xpath('//div[@id="artibody"]//p').extract()  # this returns a list
     items['body'] = ''.join(p.strip() for p in bodys)
     items['url'] = response.url
     url_re = re.search(r'doc\-\D+(\d*)\.shtml', items['url'], re.I)
     items['only_id'] = url_re.group(1)
     # thetime = response.xpath('//span[@class="time-source"]/text()').extract()[0].strip()
     try:
         thetime = response.xpath('//*[@id="top_bar"]/div/div[2]/span[1]').extract_first().strip()
     except AttributeError:
         thetime = response.xpath('//span[@class="time-source"]/text()').extract()[0].strip()
     items['put_time'] = wfunc.sina_get_time(thetime)
     wfunc.e('sina_news:' + items['title'])
     yield items
Example 13
 def start_requests(self):
     r = scrapy.Request(self.start_urls[0], callback=self.wstart_parse)
     r.meta['phantomjs'] = True
     wfunc.e(r)
     yield r
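Setting r.meta['phantomjs'] = True suggests a custom downloader middleware that routes flagged requests through a headless browser. A minimal sketch of what such a middleware might look like, assuming Selenium with the (era-appropriate) PhantomJS driver; every name here is an assumption, not part of the original project:

 from scrapy.http import HtmlResponse
 from selenium import webdriver

 class PhantomJSMiddleware(object):
     # hypothetical middleware: render requests flagged with meta['phantomjs']
     def __init__(self):
         self.driver = webdriver.PhantomJS()

     def process_request(self, request, spider):
         if request.meta.get('phantomjs'):
             self.driver.get(request.url)
             return HtmlResponse(request.url, body=self.driver.page_source,
                                 encoding='utf-8', request=request)
         return None  # fall through to the default downloader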
Example 14
 def open_spider(self, spider):
     wfunc.e('spider ' + spider.name + ' --->opened')