def parse_zxg_apinews(self, response):
    items = NewsItem()
    api = self.get_json(response)
    items['title'] = api['data']['title']
    items['only_id'] = api['data']['id']
    items['body'] = api['data']['content']['text']
    # The first 8 characters of the id encode the publish date (YYYYMMDD).
    items['put_time'] = wfunc.time_num(api['data']['id'][:8], "%Y%m%d")
    items['url'] = api['data']['surl']
    yield items

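# Every parser here relies on wfunc.time_num() to turn a formatted date
# string into a numeric timestamp. The helper is project-internal and not
# shown in this section; the sketch below is an assumption about its
# behavior (strptime + mktime), not the actual implementation.
import time

def _time_num_sketch(datestr, fmt):
    # Parse `datestr` according to `fmt` and return a Unix timestamp.
    return int(time.mktime(time.strptime(datestr, fmt)))

# e.g. _time_num_sketch('2017-11-06', '%Y-%m-%d') -> seconds since the epoch
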
def parse_yicai(self, response):
    items = NewsItem()
    items['title'] = response.xpath('//head/title/text()').extract()[0].strip()
    thetime = response.xpath('//div[@class="m-title f-pr"]/h2//span[2]/text()').extract()[0].strip()
    items['put_time'] = wfunc.time_num(thetime, "%Y-%m-%d %H:%M")
    items['url'] = response.url
    # The numeric segment before ".html" is the article's unique id.
    h_num = re.search(r'/(\d+)\.html', items['url'], re.I).group(1)
    items['only_id'] = h_num
    items['body'] = response.xpath('//div[@class="m-text"]').extract()[0].strip()
    wfunc.e('yicai_news:' + items['title'])
    yield items

def parse_vreport(self, response):
    items = TopicItem()
    items['title'] = response.xpath('//h1/text()').extract_first().strip()
    thetime = response.xpath('//div[@class="creab"]/span[4]/text()').extract()[0].strip()
    # The span text appears to be a "label:YYYY-MM-DD" pair; keep the date part.
    items['put_time'] = wfunc.time_num(thetime.split(':')[1], '%Y-%m-%d')
    items['url'] = response.url
    url_re = re.search(r'/(\d+)/index\.phtml$', response.url, re.I)
    items['only_id'] = url_re.group(1)
    items['body'] = response.xpath('//div[@class="blk_container"]').extract()[0].strip()
    wfunc.e('sina_topic:' + items['title'])
    yield items

def parse_topic(self, response):
    items = TopicItem()
    thetime = response.xpath('//span[@class="timer"]/text()').extract_first().strip()[:10]
    items['put_time'] = wfunc.time_num(thetime, "%Y-%m-%d")
    items['title'] = response.xpath('//h1[@class="title"]/text()').extract_first().strip()
    # Concatenate the text of every <p> in the article body.
    body = response.xpath('//div[@id="qmt_content_div"]//p/text()').extract()
    items['body'] = ''.join(body)
    items['url'] = response.url
    # e.g. http://ggjd.cnstock.com/company/scp_ggjd/tjd_bbdj/201604/3765369.htm
    url_re = re.search(r'.*/company/scp_ggjd/tjd_bbdj/(\d+)/(\d+)\.htm$', items['url'], re.I)
    items['only_id'] = url_re.group(1) + url_re.group(2)
    wfunc.e('cnstock_topics:' + items['title'])
    yield items

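# Quick sanity check for the only_id regex above, using the sample URL from
# the comment; illustrative only, and could live in a test or a REPL session.
import re

m = re.search(r'.*/company/scp_ggjd/tjd_bbdj/(\d+)/(\d+)\.htm$',
              'http://ggjd.cnstock.com/company/scp_ggjd/tjd_bbdj/201604/3765369.htm',
              re.I)
assert m.groups() == ('201604', '3765369')  # only_id becomes '2016043765369'
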
def parse_jrj(self, response):
    items = TopicItem()
    items['title'] = response.xpath('//div[@class="titmain"]/h1/text()').extract()[2].strip()
    thetime = response.xpath('//p[@class="inftop"]//span[1]/text()').extract_first().strip()
    thedata = thetime.split(" ")
    items['put_time'] = wfunc.time_num(thedata[0], '%Y-%m-%d')
    url_re = re.search(r'.*/(\d+)/(\d+)/(\d+)\.shtml$', response.url, re.I)
    items['url'] = response.url
    items['only_id'] = url_re.group(3)
    items['body'] = response.xpath('//div[@class="texttit_m1"]').extract()[0].strip()
    wfunc.e('jrj_topic:' + items['title'])
    yield items

def parse_163_money(self, response):
    # e.g. http://money.163.com/17/1114/13/D375MGIB0025814V.html
    items = NewsItem()
    items['title'] = response.xpath('//div[@id="epContentLeft"]/h1[1]/text()').extract()[0].strip()
    bodys = response.xpath('//div[@id="endText"]//p').extract()
    items['body'] = ''.join(p.strip() for p in bodys)
    items['url'] = response.url
    url_re = re.search(r'.*\.163\.com/\d+/\d+/\d+/(\w*)\.html$', items['url'], re.I)
    items['only_id'] = url_re.group(1)
    thetime = response.xpath('//div[@class="post_time_source"]/text()').extract_first().strip()
    items['put_time'] = wfunc.time_num(thetime[:16], "%Y-%m-%d %H:%M")
    wfunc.e('163_news:' + items['title'])
    yield items

def parse_qq_ywq(self, response):
    # e.g. http://stock.qq.com/a/20171107/017324.htm
    items = NewsItem()
    items['title'] = response.xpath('//title/text()').extract()[0].strip()
    bodys = response.xpath('//div[@id="Cnt-Main-Article-QQ"]//p').extract()  # returns a list
    items['body'] = ''.join(p.strip() for p in bodys)
    items['url'] = response.url
    url_re = re.search(r'.*a/(\d+)/(\d+)\.htm', items['url'], re.I)
    items['only_id'] = url_re.group(1) + url_re.group(2)
    # The publish time lives in one of two spans, depending on the page template.
    thetime = response.xpath('//span[@class="a_time"]/text()')
    if len(thetime) < 1:
        thetime = response.xpath('//span[@class="pubTime article-time"]/text()')
    try:
        items['put_time'] = wfunc.time_num(thetime.extract()[0].strip(), "%Y-%m-%d %H:%M")
    except IndexError:
        print('IndexError: no publish time found -->', response.url)
        return
    wfunc.e('qq_news:' + items['title'])
    yield items

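# The two-selector fallback above is a recurring pattern. A small helper like
# the sketch below (hypothetical, not part of wfunc) would let parsers try
# several XPaths in order and take the first non-empty text match.
def first_text(response, *xpaths):
    # Return the first stripped text hit among the candidate XPaths, else None.
    for xp in xpaths:
        hit = response.xpath(xp).extract_first()
        if hit and hit.strip():
            return hit.strip()
    return None

# Usage sketch:
# thetime = first_text(response,
#                      '//span[@class="a_time"]/text()',
#                      '//span[@class="pubTime article-time"]/text()')
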
from analyse.NLP.semantics import NB
from factory.data_analyse.marshal_cache import data_cache

cache_file = 'result_guide_topic.wooght'

# The 100 most heavily reported companies,
# based on all reports from the end of 2016 to the end of 2017.
f_quotes = basedata()
all_company = companies.all_codenames()
all_codes = {}
for code in all_company:
    tmp_code = dict(code)
    all_codes[tmp_code['name']] = tmp_code['codeid']

start_day = wfunc.time_num('2017-11-06', '%Y-%m-%d')
end_day = wfunc.time_num('2018-01-08', '%Y-%m-%d')

# Trading dates of the Shanghai Composite Index.
sz_quotes = f_quotes.select_quotes(1000001, True)
allow_dates = list(sz_quotes['datatime'])

# Filter stocks.
ddpct = dd_pct()
ddpct.select_all()
allow_codes = ddpct.have_dd(260)

nb = NB()
# Stop words.
nb.pass_words = {
    'x', 'm', 'url', 'nian', 'eng', 'nts', 'ntp', 'y', 'yue', 'nt', 'nr', 'j',