def __init__(self, codeid=None, first100=False, *args, **kwargs):
    super(Quotes_itemSpider, self).__init__(*args, **kwargs)
    self.select_data()
    if codeid == '0000001':  # Shanghai Composite Index
        codes = [['0000001', 'szzs']]
    else:
        s = T.select([T.listed_company.c.codeid, T.listed_company.c.shsz])
        if first100:
            var_dd = dd_pct()
            var_dd.select_all(common.wfunc.before_day(80))
            code_100 = var_dd.have_dd(30)
            print(code_100)
            s = T.select([
                T.listed_company.c.codeid, T.listed_company.c.shsz
            ]).where(T.listed_company.c.codeid.in_(code_100))
        if codeid is not None:
            s = T.select([
                T.listed_company.c.codeid, T.listed_company.c.shsz
            ]).where(T.listed_company.c.codeid == codeid)
        r = T.conn.execute(s)
        codes = r.fetchall()
    for item in codes:
        id = self.builde_code(item[0], item[1])  # adjust the code length / market prefix
        self.start_urls.append(
            self.url_module % (str(id), self.startdata, self.enddata))
    print('Need to fetch quotes for ' + str(len(self.start_urls)) + ' stocks.......')
def select_quotes(self, id, getpd=True):  # query the quotes of one stock
    if not getpd:
        r = T.select([T.quotes_item.c.quotes, T.quotes_item.c.update_at]).where(
            T.quotes_item.c.code_id == id)
        s = T.conn.execute(r)
        return s.fetchall()[0]
    r = T.select([T.quotes_item.c.quotes]).where(T.quotes_item.c.code_id == id)
    s = T.conn.execute(r)
    # parse the JSON-encoded quotes column into a DataFrame
    item = s.fetchall()[0]
    obj = json.loads(item[0])
    quotes = self.pd.DataFrame(obj)
    quotes = self.to_math(df=quotes,
                          numeric=['gao', 'di', 'shou', 'kai', 'before',
                                   'zd_range', 'zd_money', 'liang'])
    return quotes
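# Usage sketch (not part of the original module): a minimal example built on
# select_quotes(), assuming the pinyin columns map to kai=open, gao=high,
# di=low, shou=close, liang=volume. The function name and the code id
# '1000001' are hypothetical.
def demo_close_ma(spider, code_id='1000001'):
    quotes = spider.select_quotes(code_id)            # parsed quotes DataFrame
    quotes['ma5'] = quotes['shou'].rolling(5).mean()  # 5-day moving average of the close
    return quotes[['shou', 'ma5']]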
def select_cp_atd(self, Tb, a_type):  # query an article table's cp_attitude values
    s = T.select([Tb.c.put_time, Tb.c.cp_attitude]).where(
        Tb.c.code_id == self.code_id).where(
            Tb.c.put_time > self.start).where(Tb.c.article_type == a_type)
    pddata = self.select_atd(s, columns=['cp_attitude'])
    return pddata
def select_plates(self):
    s = T.select([T.listed_plate.c.plateid]).where(T.listed_plate.c.father_id != 0)
    r = T.conn.execute(s)
    for pid in r.fetchall():
        plateid = pid[0]
        if len(str(plateid)) < 5:
            plateid = '0' + str(plateid)
        self.plates.append(plateid)
    s = T.select([T.listed_company.c.codeid])
    r = T.conn.execute(s)
    for code in r.fetchall():
        self.companys.append(str(code[0]))
    s = T.select([T.listed_region.c.id, T.listed_region.c.name])
    r = T.conn.execute(s)
    for region in r.fetchall():
        self.regions[region[1]] = region[0]
def select_cp_atd(self, Tb, a_type):  # query an article table's cp_attitude values
    if str(self.code_id) == '1000001':
        where = Tb.c.code_id.isnot(None)  # special id: match any non-NULL code_id
    else:
        where = Tb.c.code_id == self.code_id
    s = T.select([Tb.c.put_time, Tb.c.cp_attitude]).where(where).where(
        Tb.c.put_time > self.start).where(Tb.c.article_type == a_type)
    pddata = self.select_atd(s, columns=['cp_attitude'])
    return pddata
def make_keywords(txt_path):
    # export the word dictionary as plain text, one "word rate nature" entry per line
    s = T.select([T.ch_dict.c.word, T.ch_dict.c.rate, T.ch_dict.c.nature])
    r = T.conn.execute(s)
    with open(txt_path, 'w', encoding='utf-8') as f:
        for i in r.fetchall():
            f.write(i['word'] + ' ' + str(int(i['rate'])) + ' ' + i['nature'] + '\n')
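# Usage sketch (assumption, not part of the original module): each line that
# make_keywords() writes -- "word rate nature" -- matches jieba's user-dictionary
# format ("word freq tag"), so the exported file could be registered as a custom
# dictionary before segmenting article text. The path below is hypothetical.
import jieba

def demo_load_keywords(txt_path='keywords.txt'):
    make_keywords(txt_path)        # export the dictionary from ch_dict
    jieba.load_userdict(txt_path)  # register the exported words with jieba
    return list(jieba.cut('上市公司发布公告'))  # segment a sample sentence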
def __init__(self, *args, **kwargs):
    # call the parent constructor
    super(TopicVreport, self).__init__(*args, **kwargs)
    # collect the URLs that are already stored, so they can be skipped later
    s = T.select([T.topic.c.url])
    r = T.conn.execute(s)
    arr = r.fetchall()
    for one in arr:
        self.old_link.append(one[0])
def thecompany():
    s = T.select([T.listed_company.c.name, T.listed_company.c.id])
    r = T.conn.execute(s)
    companies = update_ab(r)
    for cp in companies:
        u = T.listed_company.update().where(
            T.listed_company.c.id == cp[1]).values(name=cp[0])
        r = T.conn.execute(u)
        if r.rowcount == 1:
            print(cp[0], 'updated successfully...')
def thechdict():
    s = T.select([T.ch_dict.c.word, T.ch_dict.c.id]).where(
        T.ch_dict.c.rate == '100000.00')
    r = T.conn.execute(s)
    new_dicts = update_ab(r)
    for cp in new_dicts:
        u = T.ch_dict.update().where(T.ch_dict.c.id == cp[1]).values(word=cp[0])
        r = T.conn.execute(u)
        if r.rowcount == 1:
            print(cp[0], 'updated successfully...')
def add_dict():
    # words already stored in ch_dict with rate 100000.00
    ch_s = T.select([T.ch_dict.c.word]).where(T.ch_dict.c.rate == '100000.00')
    ch_r = T.conn.execute(ch_s)
    ch_dict = []
    for i in ch_r.fetchall():
        ch_dict.append(i[0])
    print(ch_dict)
    # add every listed-company name that is not in the dictionary yet
    cp_s = T.select([T.listed_company.c.name])
    cp_r = T.conn.execute(cp_s)
    new_dict = []
    for n in cp_r.fetchall():
        if n[0] not in ch_dict:
            tmp = {'word': n[0], 'rate': '100000', 'nature': 'nts'}
            new_dict.append(tmp)
    if new_dict:  # skip the insert when there is nothing new
        i = T.ch_dict.insert()
        r = T.conn.execute(i, new_dict)
        if r.rowcount > 0:
            print(r.rowcount, 'new entries added')
def start_requests(self):
    s = T.select([T.listed_plate.c.plateid]).where(
        T.listed_plate.c.plateid > 4000).where(T.listed_plate.c.father_id > 0)
    r = T.conn.execute(s)
    metas = []
    for i in r.fetchall():
        metas.append(str(i[0]))
    print(metas)
    # request the page and carry the plate ids along in meta
    return [
        Request(self.start_urls[0], meta={'plates': metas}, callback=self.parse)
    ]
def open_spider(self, spider):
    wfunc.e('spider ' + spider.name + ' --->opened')
    # preload the ids that already exist in the DB for spiders that de-duplicate
    if spider.name in ['ddtj', 'ddtj_history']:
        s = T.select([T.ddtj.c.only_id])
        r = T.conn.execute(s)
        ddtj_onlyid = []
        for item in r.fetchall():
            ddtj_onlyid.append(item[0])
        self.ddtj_onlyid = ddtj_onlyid
    if spider.name == 'xueqiu_zuhe':
        s = T.select([T.xq_zuhe.c.zh_symbol])
        r = T.conn.execute(s)
        zh_list = []
        for item in r.fetchall():
            zh_list.append(item[0])
        self.zh_list = zh_list
    if spider.name == 'zuhe_change':
        s = T.select([T.zuhe_change.c.change_id])
        r = T.conn.execute(s)
        change_list = []
        for item in r.fetchall():
            change_list.append(item[0])
        self.change_list = change_list
def __init__(self, *args, **kwargs):
    super(QuotesSpider, self).__init__(*args, **kwargs)
    self.select_data()
    s = T.select([T.listed_company.c.codeid]).where(T.listed_company.c.codeid < 10)
    r = T.conn.execute(s)
    for item in r.fetchall():
        id = str(item[0])
        # pad the code to 6 digits, then add the market prefix '1'
        while len(id) < 6:
            id = '0' + id
        id = '1' + id
        self.start_urls.append(
            self.url_module % (str(id), self.startdata, self.enddata))
    print('Need to fetch quotes for', len(self.start_urls), 'stocks.......')
def select_change(self):
    quotes_data = self.select_quotes(self.code_id)
    s = T.select([T.zuhe_change]).where(T.zuhe_change.c.code_id == self.code_id)
    r = T.conn.execute(s)
    if r.rowcount < 1:
        return ''
    data_arr = []
    for i in r.fetchall():
        # each row comes back as a mapping; convert the millisecond timestamp to a date string
        i = dict(i)
        i['updated_at'] = wfunc.the_day(int(int(i['updated_at']) / 1000))
        data_arr.append(i)
    pandas_change = self.pd.DataFrame(data_arr)
    pandas_change['datatime'] = self.pd.to_datetime(pandas_change['updated_at'],
                                                    format='%Y-%m-%d')
    # daily sum and count of change_status
    pd_mean = pandas_change.groupby(
        'datatime', as_index=False)['change_status'].agg({'change_status': 'sum'})
    pd_count = pandas_change.groupby(
        'datatime', as_index=False)['change_status'].agg({'change_count': 'count'})
    quotes_data['datatime'] = self.pd.to_datetime(quotes_data['datatime'],
                                                  format='%Y-%m-%d')
    del pandas_change['updated_at']
    pd_mean = self.pd.merge(quotes_data, pd_mean, on=['datatime'], how='left').fillna(0)
    pd_mean = self.pd.merge(pd_mean, pd_count, on=['datatime'], how='left').fillna(0)
    pd_mean = pd_mean.sort_values(by='datatime', ascending=True)
    # pd_mean['change_status'].fillna(method='pad')  # forward-fill NaN with the previous value
    pd_mean['cumsum'] = pd_mean['change_status'].cumsum()  # running total of change_status
    result = self.web_data(pd_mean, 'datatime',
                           columns=['cumsum', 'shou', 'change_count'])
    return result
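# Self-contained sketch (illustrative only) of the aggregation pattern used in
# select_change(): per-day sums of change_status are merged onto the quotes
# calendar, missing days are filled with 0, and a running total is taken with
# cumsum(). The toy data below is made up.
import pandas as pd

def demo_cumulative_changes():
    quotes = pd.DataFrame({
        'datatime': pd.to_datetime(['2020-01-01', '2020-01-02', '2020-01-03']),
        'shou': [10.0, 10.5, 10.2],
    })
    changes = pd.DataFrame({
        'datatime': pd.to_datetime(['2020-01-01', '2020-01-01', '2020-01-03']),
        'change_status': [1, -1, 1],
    })
    daily = changes.groupby('datatime', as_index=False)['change_status'].sum()
    merged = pd.merge(quotes, daily, on='datatime', how='left').fillna(0)
    merged = merged.sort_values('datatime')
    merged['cumsum'] = merged['change_status'].cumsum()  # running total of daily changes
    return merged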
def process_item(self, item, spider):
    # time-based filtering (disabled)
    # if 'put_time' in dict(item):
    #     if float(item['put_time']) < self.min_time:
    #         return None
    self.add_nums += 1
    # news article
    if isinstance(item, NewsItem):
        # strip HTML tags from the body
        item['body'] = wfunc.delete_html(item['body'])
        s = T.select([T.news]).where(T.news.c.only_id == item['only_id'])
        r = T.conn.execute(s)
        if r.rowcount > 0:
            return None
        i = T.news.insert()
        r = T.conn.execute(i, dict(item))
        # semantic analysis
        att_item = item
        att_item['article_id'] = r.inserted_primary_key  # list of primary-key values
        att_item['article_type'] = 2
        result = self.news_analyse.run(att_item)
        self.add_attitude_relation(result)
    # topic analysis article
    elif isinstance(item, TopicItem):
        # strip HTML tags from the body
        item['body'] = wfunc.delete_html(item['body'])
        s = T.select([T.topic]).where(T.topic.c.only_id == item['only_id'])
        r = T.conn.execute(s)
        if r.rowcount > 0:
            return None
        i = T.topic.insert()
        r = T.conn.execute(i, dict(item))
        # semantic analysis
        att_item = item
        att_item['article_id'] = r.inserted_primary_key  # list of primary-key values
        att_item['article_type'] = 1
        result = self.topic_analyse.run(att_item)
        self.add_attitude_relation(result)
    # stock codes
    elif isinstance(item, CodesItem):
        if spider.name in ['codes', 'newcodes']:
            s = T.select([T.listed_company]).where(
                T.listed_company.c.codeid == item['codeid'])
            r = T.conn.execute(s)
            if r.rowcount > 0:
                return None
            i = T.listed_company.insert()
            r = T.conn.execute(i, dict(item))
        elif spider.name == 'upplates':
            u = T.listed_company.update().where(
                T.listed_company.c.codeid == item['codeid']).values(
                    plate_id=item['plate_id'])
            r = T.conn.execute(u)
    # stock quotes
    elif isinstance(item, QuotesItem):
        i = T.quotes.insert()
        r = T.conn.execute(i, dict(item))
    # stock plates
    elif isinstance(item, PlatesItem):
        s = T.select([T.listed_plate]).where(
            T.listed_plate.c.plateid == item['plateid'])
        r = T.conn.execute(s)
        if r.rowcount > 0:
            return None
        i = T.listed_plate.insert()
        r = T.conn.execute(i, dict(item))
    # company notices
    elif isinstance(item, NoticesItem):
        s = T.select([T.company_notice]).where(
            T.company_notice.c.title == item['title']).where(
                T.company_notice.c.code_id == item['code_id'])
        r = T.conn.execute(s)
        if r.rowcount > 0:
            return None
        i = T.company_notice.insert()
        r = T.conn.execute(i, dict(item))
    # Q&A
    elif isinstance(item, QandaItem):
        s = T.select([T.qanda.c.id]).where(T.qanda.c.only_id == item['only_id'])
        r = T.conn.execute(s)
        if r.rowcount > 0:
            return None
        i = T.qanda.insert()
        r = T.conn.execute(i, dict(item))
    return None