Example #1
 def parse_zxg_apinews(self, response):
     items = NewsItem()
     api = self.get_json(response)
     items['title'] = api['data']['title']
     items['only_id'] = api['data']['id']
     items['body'] = api['data']['content']['text']
     items['put_time'] = wfunc.time_num(api['data']['id'][:8], "%Y%m%d")  # first 8 chars of the id encode YYYYMMDD
     items['url'] = api['data']['surl']
     yield items
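All of these examples rely on wfunc.time_num, and Example #1 additionally calls a get_json helper; neither is shown in this listing. A minimal sketch of their assumed behaviour:

import json
import time

def time_num(timestr, fmt):
    # Assumed behaviour of wfunc.time_num: parse a date/time string with the
    # given strptime format and return it as an integer Unix timestamp.
    return int(time.mktime(time.strptime(timestr, fmt)))

def get_json(response):
    # Assumed behaviour of the spider's get_json helper: decode an API
    # response body (a Scrapy Response) as JSON.
    return json.loads(response.text)

Under this reading, time_num('20171114', '%Y%m%d') in Example #1 turns the date prefix of the article id into a timestamp.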
Example #2
 def parse_yicai(self, response):
     items = NewsItem()
     items['title'] = response.xpath('//head/title/text()').extract()[0].strip()
     thetime = response.xpath('//div[@class="m-title f-pr"]/h2//span[2]/text()').extract()[0].strip()
     items['put_time'] = wfunc.time_num(thetime, "%Y-%m-%d %H:%M")
     items['url'] = response.url
     h_num = re.search(r'/(\d+)\.html', items['url'], re.I).group(1)
     items['only_id'] = h_num
     items['body'] = response.xpath('//div[@class="m-text"]').extract()[0].strip()
     wfunc.e('yicai_news:' + items['title'])
     yield items
Example #3
 def parse_vreport(self, response):
     items = TopicItem()
     items['title'] = response.xpath('//h1/text()').extract_first().strip()
     thetime = response.xpath(
         '//div[@class="creab"]/span[4]/text()').extract()[0].strip()
     thetime = time_num(thetime.split(':')[1], '%Y-%m-%d')
     items['put_time'] = thetime
     url_re = re.search(r'\/(\d+)\/index\.phtml$', response.url, re.I)
     items['url'] = response.url
     items['only_id'] = url_re.group(1)
     items['body'] = response.xpath(
         '//div[@class="blk_container"]').extract()[0].strip()
     e('sina_topic:' + items['title'])
     yield items
Example #4
 def parse_topic(self, response):
     items = TopicItem()
     thetime = response.xpath('//span[@class="timer"]/text()').extract_first().strip()[:10]
     items['put_time'] = wfunc.time_num(thetime, "%Y-%m-%d")
     items['title'] = response.xpath('//h1[@class="title"]/text()').extract_first().strip()
     body = response.xpath('//div[@id="qmt_content_div"]//p/text()').extract()
     items['body'] = ''.join(body)  # join paragraph texts; avoids shadowing the built-in str
     items['url'] = response.url
     # e.g. http://ggjd.cnstock.com/company/scp_ggjd/tjd_bbdj/201604/3765369.htm
     url_re = re.search(r'.*/company/scp_ggjd/tjd_bbdj/(\d+)/(\d+)\.htm$', items['url'], re.I)
     items['only_id'] = url_re.group(1) + url_re.group(2)
     wfunc.e('cnstock_topics:' + items['title'])
     yield items
Example #5
 def parse_jrj(self, response):
     items = TopicItem()
     items['title'] = response.xpath(
         '//div[@class="titmain"]/h1/text()').extract()[2].strip()
     thetime = response.xpath(
         '//p[@class="inftop"]//span[1]/text()').extract_first().strip()
     thedata = thetime.split(" ")
     items['put_time'] = wfunc.time_num(thedata[0], '%Y-%m-%d')
     url_re = re.search(r'.*\/(\d+)\/(\d+)\/(\d+)\.shtml$', response.url,
                        re.I)
     items['url'] = response.url
     items['only_id'] = url_re.group(3)
     items['body'] = response.xpath(
         '//div[@class="texttit_m1"]').extract()[0].strip()
     wfunc.e('jrj_topic:' + items['title'])
     yield items
Example #6
 def parse_163_money(self, response):
     # http://money.163.com/17/1114/13/D375MGIB0025814V.html
     items = NewsItem()
     items['title'] = response.xpath('//div[@id="epContentLeft"]/h1[1]/text()').extract()[0].strip()
     bodys = response.xpath('//div[@id="endText"]//p').extract()
     items['body'] = ''.join(p.strip() for p in bodys)
     items['url'] = response.url
     url_re = re.search(r'.*\.163\.com/\d+/\d+/\d+/(\w+)\.html$', items['url'], re.I)
     items['only_id'] = url_re.group(1)
     thetime = response.xpath('//div[@class="post_time_source"]/text()').extract_first().strip()
     items['put_time'] = wfunc.time_num(thetime[:16], "%Y-%m-%d %H:%M")
     wfunc.e('163_news:' + items['title'])
     yield items
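Examples #4, #6, and #7 all derive only_id the same way: a regex over the article URL whose captured digit groups are concatenated. A hypothetical helper (not part of the original spiders) that consolidates the pattern:

import re

def extract_only_id(url, pattern):
    # Hypothetical helper: pull the unique article id out of a URL by joining
    # all captured groups; returns None when the URL does not match.
    match = re.search(pattern, url, re.I)
    return ''.join(match.groups()) if match else None

# e.g. extract_only_id('http://money.163.com/17/1114/13/D375MGIB0025814V.html',
#                      r'\.163\.com/\d+/\d+/\d+/(\w+)\.html$')  -> 'D375MGIB0025814V'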
Example #7
 def parse_qq_ywq(self, response):
     # http://stock.qq.com/a/20171107/017324.htm
     items = NewsItem()
     items['title'] = response.xpath('//title/text()').extract()[0].strip()
     bodys = response.xpath('//div[@id="Cnt-Main-Article-QQ"]//p').extract()  # returns a list of <p> fragments
     items['body'] = ''.join(p.strip() for p in bodys)
     items['url'] = response.url
     url_re = re.search(r'.*a\/(\d+)\/(\d+).htm', items['url'], re.I)
     items['only_id'] = url_re.group(1) + url_re.group(2)
     thetime = response.xpath('//span[@class="a_time"]/text()')
     if not thetime:  # fall back to the alternate timestamp markup
         thetime = response.xpath('//span[@class="pubTime article-time"]/text()')
     try:
         items['put_time'] = wfunc.time_num(thetime.extract()[0].strip(), "%Y-%m-%d %H:%M")
     except IndexError:
         print('IndexError: publish time not found -->', response.url)
         return
     wfunc.e('qq_news:' + items['title'])
     yield items
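The a_time / pubTime fallback in Example #7 generalizes naturally to a try-selectors-in-order utility; first_text below is an invented name, sketching that idea:

def first_text(response, *xpaths):
    # Hypothetical utility: try each XPath expression in turn and return the
    # first non-empty text node, stripped; None if nothing matches.
    for xp in xpaths:
        texts = response.xpath(xp).extract()
        if texts and texts[0].strip():
            return texts[0].strip()
    return None

# thetime = first_text(response,
#                      '//span[@class="a_time"]/text()',
#                      '//span[@class="pubTime article-time"]/text()')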
Example #8
from analyse.NLP.semantics import NB
from factory.data_analyse.marshal_cache import data_cache
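# basedata, companies, wfunc and dd_pct come from other project modules
# (their import lines are omitted in this excerpt).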

cache_file = 'result_guide_topic.wooght'

# Top 100 companies by report volume
# Based on all reports from late 2016 to late 2017

f_quotes = basedata()
all_company = companies.all_codenames()
all_codes = {}
for code in all_company:
    tmp_code = dict(code)
    all_codes[tmp_code['name']] = tmp_code['codeid']

start_day = wfunc.time_num('2017-11-06', '%Y-%m-%d')
end_day = wfunc.time_num('2018-01-08', '%Y-%m-%d')

# Shanghai Composite Index trading dates
sz_quotes = f_quotes.select_quotes(1000001, True)
allow_dates = list(sz_quotes['datatime'])

# Filter stocks
ddpct = dd_pct()
ddpct.select_all()
allow_codes = ddpct.have_dd(260)

nb = NB()
# Stop words
nb.pass_words = {
    'x', 'm', 'url', 'nian', 'eng', 'nts', 'ntp', 'y', 'yue', 'nt', 'nr', 'j',