def get_stock_smt_cfg_en_name():
    """Return the name of the first enabled crawl source in stockSmtCfg.xml.

    Sources are checked in priority order: yidian, tonghuashun, dazhihui,
    xinlang.  The config file is created with defaults when it does not
    exist yet.  Returns None when no source is enabled.
    """
    # BUGFIX: the original tested the hard-coded path
    # "c:/scrapy/stockSmtCfg.xml" but parsed the file from the tool work
    # directory; both operations now use the same path.
    cfg_path = globalPars.getToolBaseWorkDirectory() + "stockSmtCfg.xml"
    if os.path.isfile(cfg_path):
        print(cfg_path, 'already exists')
    else:
        create_stock_smt_cfg()

    tree = ET.parse(cfg_path)  # open the XML config document
    root = tree.getroot()      # get the root node

    # Return the first source whose <enable> element reads 'Enable'.
    for source in ('yidian', 'tonghuashun', 'dazhihui', 'xinlang'):
        node = root.find(source)
        if node is not None and node.find('enable').text == 'Enable':
            return source
    return None
def generateThsUrls():
    """Write one 10jqka holder-page URL per stock code to ths_urls.txt.

    Stock codes come from generate_stock_dic(); URLs are written UTF-8
    encoded, one per line, into the tool work directory.
    """
    # Renamed from `dict` to avoid shadowing the builtin.
    stock_dict = generate_stock_dic()
    urls = ['http://stockpage.10jqka.com.cn/' + key + '/holder/'
            for key in stock_dict]

    with open(globalPars.getToolBaseWorkDirectory() + 'ths_urls.txt',
              'wb') as f:
        for url in urls:
            f.write(url.encode('utf-8'))
            f.write(b'\n')
def generateXlUrls():
    """Write one Sina Finance circulating-holder URL per stock code to xl_urls.txt.

    Stock codes come from generate_stock_dic(); URLs are written UTF-8
    encoded, one per line, into the tool work directory.
    """
    # Renamed from `dict` to avoid shadowing the builtin.
    stock_dict = generate_stock_dic()
    # e.g. http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CirculateStockHolder/stockid/603009.phtml#2010-09-30
    urls = [
        'http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CirculateStockHolder/stockid/'
        + key + '.phtml#2010-09-30'
        for key in stock_dict
    ]

    with open(globalPars.getToolBaseWorkDirectory() + 'xl_urls.txt',
              'wb') as f:
        for url in urls:
            f.write(url.encode('utf-8'))
            f.write(b'\n')
def generateDzhUrls():
    """Write one gw.com.cn F10 page URL per stock code to dzh_urls.txt.

    Codes below 600000 are Shenzhen (SZ) listings, the rest Shanghai (SH);
    non-numeric keys are skipped.  URLs are written UTF-8 encoded, one per
    line, into the tool work directory.
    """
    print('begin generateDzhUrls')
    # Renamed from `dict` to avoid shadowing the builtin.
    stock_dict = generate_stock_dic()
    urls = []
    for key in stock_dict:
        if not key.isdigit():
            continue
        market = 'SZ' if int(key) < 600000 else 'SH'
        urls.append('http://webf10.gw.com.cn/' + market + '/B10/' + market +
                    key + '_B10.html')

    with open(globalPars.getToolBaseWorkDirectory() + 'dzh_urls.txt',
              'wb') as f:
        for url in urls:
            f.write(url.encode('utf-8'))
            f.write(b'\n')
def create_stock_smt_cfg():
    """Create stockSmtCfg.xml with a default section for every crawl source."""
    print('create_stock_smt_cfg')
    doc = Dom.Document()
    root_node = doc.createElement("stock_crawl_cfg")
    root_node.setAttribute("developer", "zhonghua")
    doc.appendChild(root_node)

    # One child node per crawl source, each built by its own helper.
    root_node.appendChild(create_yidian_cfg(doc))
    root_node.appendChild(create_ths_cfg(doc))
    root_node.appendChild(create_dzh_cfg(doc))
    root_node.appendChild(create_xinlang_cfg(doc))

    # toprettyxml(..., encoding=...) returns bytes, so write in binary mode
    # directly instead of decoding first (the original also shadowed the
    # builtin `str` while doing so).
    with open(globalPars.getToolBaseWorkDirectory() + "stockSmtCfg.xml",
              "wb") as f:
        f.write(doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8"))
def generateYiDianGDUrls(stock_number_list):
    """Build yidiancangwei holder URLs for every (date, stock) pair.

    Dates are read one per line from date_cfg.cfg; the resulting URLs are
    written UTF-8 encoded to yidian_urls.txt and also returned as a list.

    :param stock_number_list: iterable of stock-code strings
    :return: list of generated URL strings
    """
    with open('date_cfg.cfg', 'r') as f:
        dates = f.readlines()
        print(dates)

    # One URL per date/stock combination (dates outer, stocks inner, as
    # before).  strip() removes the trailing newline readlines() leaves on
    # each date; the original's redundant str() wrapper has been dropped.
    urls = [
        'http://www.yidiancangwei.com/gudong/sdlt_' + number + '_' +
        date.strip() + '.html'
        for date in dates
        for number in stock_number_list
    ]

    with open(globalPars.getToolBaseWorkDirectory() + 'yidian_urls.txt',
              'wb') as f:
        for url in urls:
            f.write(url.encode('utf-8'))
            f.write(b'\n')
    return urls
class DzhgudongSpider(scrapy.Spider):
	"""Spider for webf10.gw.com.cn (Dazhihui) F10 pages: scrapes the
	quarterly top-10 circulating-shareholder tables and stores them via
	store_lt_gudong_db().
	"""
	name = "dzh"
	allowed_domains = ['webf10.gw.com.cn']
	start_urls = [
		# 'http://webf10.gw.com.cn/SH/B10/SH600767_B10.html#'
	]

	# Start URLs are pre-generated into dzh_urls.txt (see generateDzhUrls);
	# this block runs once, at class-definition time.
	with open(globalPars.getToolBaseWorkDirectory() + 'dzh_urls.txt', 'r') as f:
		for line in f.readlines():
			start_urls.append(line.strip())
	
	# Maps stock number -> Chinese company name.
	self_dict = generate_stock_dic()
	#print(start_urls)

	# Circulating (tradable) shareholder data
	def checkLTGD(self, response):
		"""Parse every quarterly top-10 circulating-shareholder table on the
		page and persist one record per holder via store_lt_gudong_db().
		"""
		sel = scrapy.selector.Selector(response)
		# <section class="sdgdBox dzh_index" id="十大流通股东">
		LTGD_body = sel.xpath('//section[@id="十大流通股东"]')
		body = sel.xpath('//nav[@class="sdgd_nav clearfix"]')
		dateList = body.xpath('.//a/text()').extract()
		# print('dateList = ', dateList)
		h2 = LTGD_body.xpath('.//h2/text()').extract()
		# print('h2 = ', h2)

		all_info = []
		#date_length = len(date_time)
		
		cur_seq = 0
		
		# One div per quarter; each holds one shareholder table.
		one_jidu_info_divs = LTGD_body.xpath('.//div[@class="sdgdCon  sdltgdTb"]')
		jd_parts = len(one_jidu_info_divs)
		# print("gudong info parts %d" % len(one_jidu_info_divs))
		for one_jidu_info in one_jidu_info_divs:
			#<table class="f10tabel sdgd_table">
			one_gudong_info_divs = one_jidu_info.xpath('.//table[@class="f10tabel sdgd_table"]')
			#print("gudong info member %d" % len(one_gudong_info_divs))
			#for one_gudong_info in one_gudong_info_divs:
			name = one_gudong_info_divs.xpath('.//td/a/text()').extract()
			info = one_gudong_info_divs.xpath('.//td/text()').extract()
			# print('---------------------------------------------------------')
			name_len = len(name) + 1

			if (name and info):
				# `info` holds 5 cells per holder: seq, hold_num, change,
				# percent, stock_type (see the sample values below).
				for i in range(1,name_len):
					# print('cur = ', i)
					"""
					'1.', '5350.83', '未变', '15.70', '流通A股',
					"""
					gdname = name[i-1]
					hold_num = info[5*(i-1) + 1]
					change = info[5*(i-1) + 2]
					percent = info[5*(i-1) + 3]
					stock_type = info[5*(i-1) + 4]
					# Align the trailing jd_parts entries of the nav date
					# list with the per-quarter tables.
					date = dateList[-jd_parts + cur_seq]
					stock_number = self.stock_num
					Cname = self.self_dict[stock_number]
					#gdinfo = (stock_number, stock_name,  gdname,  hold_num,  percent, date,  stock_type)
					gdinfo = (stock_number, Cname,  gdname,  hold_num,  percent, change, date, stock_type)
					#print('gdinfo = ', gdinfo)
					all_info.append(gdinfo)
			cur_seq += 1
		
		# print('all_info = ', all_info)
		store_lt_gudong_db(all_info)
	
	
	def parse(self, response):
		"""Entry point: extract the 6-digit A-share code from the response
		URL and hand off to checkLTGD.
		"""
		# NOTE(review): '[0,2]' also matches a literal comma; '[02]' was
		# probably intended.  Harmless here since URLs contain no commas.
		pattern = re.compile(r'60[0-3]\d{3}|00[0,2]\d{3}|300\d{3}') 
		match = pattern.search(str(response)) 
		if match:
			self.stock_num = match.group()
			print("stock_num =", str(self.stock_num))
			self.checkLTGD(response)
class XlgudongSpider(scrapy.Spider):
	"""Spider for Sina Finance (vip.stock.finance.sina.com.cn): scrapes the
	historical circulating-shareholder tables, computes per-quarter holding
	changes, and stores them via stockSql.store_old_lt_gudong_db().
	"""
	name = "xl"
	allowed_domains = ['finance.sina.com.cn']
	start_urls = [
		# 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CirculateStockHolder/stockid/600079.phtml#2010-09-30'
		# 'http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CirculateStockHolder/stockid/300521.phtml#2010-09-30'
	]
	
	# Maps stock number -> Chinese company name.
	self_dict = update_stock.generate_stock_dic()
	print("dic len = ", len(self_dict))
	
	# Start URLs are pre-generated into xl_urls.txt (see generateXlUrls);
	# this block runs once, at class-definition time.
	with open(globalPars.getToolBaseWorkDirectory() + 'xl_urls.txt', 'r') as f:
		for line in f.readlines():
			start_urls.append(line.strip())
			
	def checkLTNumber(self, response):
		# Placeholder: shareholder-count parsing is not implemented for Sina.
		pass
	
	def reCalcGDData(self, times, all_info):
		"""Fill in the 'change' column of every holder record in-place.

		all_info is a list of quarters (newest first); each quarter is a
		list of mutable holder records where index 3 is the holding and
		index -3 is the change column.  Each quarter is diffed against the
		next (older) one; holdings are converted to units of 10,000 shares.
		The oldest quarter is left untouched (no older data to diff with).
		"""
		for j in range(times):
			jidu_new_div = all_info[j]
			if (j+1) == times:
				return
			jidu_old_div = all_info[j+1]
			# Compare every holder of this quarter with the previous quarter.
			for i in range(len(jidu_new_div)):
				find = 0
				new_gd_info = jidu_new_div[i]
				# Convert the share count to units of 10,000 shares.
				new_val = int(new_gd_info[3]) / 10000.0
				for k in range(len(jidu_old_div)):
					old_gd_info = jidu_old_div[k]
					# If the holder also existed last quarter, compute the delta.
					find = 0
					old_val = int(old_gd_info[3]) / 10000.0
					if new_gd_info[2] == old_gd_info[2] :
						find = 1
						if new_gd_info[3] == old_gd_info[3]:
							new_gd_info[-3] = u'不变'
						else:
							val = '%.2f' % (new_val - old_val)
							new_gd_info[-3] = str(val)
							# new_gd_info[3]	= '%.2f' % new_val
							# Stop scanning once a match is found.
						break
				
				if find == 0:
					new_gd_info[-3] = u'新进'
					
				new_gd_info[3]	= '%.2f' % new_val
		
	def checkLTGD(self, response):
		"""Extract the per-quarter top-10 circulating-shareholder tables
		from a Sina holder page, compute the holding changes via
		reCalcGDData, and persist every quarter to the database.
		"""
		sel = scrapy.selector.Selector(response)

		# <td colspan="5" align="left" class="tdr">2011-09-30</td>
		date_list_sel = sel.xpath('//td[@class="tdr"]')
		date_list_txt = date_list_sel.xpath('.//text()').extract()
		# print(date_list_txt)
		# print('date_list_txt len = ', len(date_list_txt))
				
		date_cur = 0
		
		#for data_list_s in date_list_sel:
		# <div align="center">上海机场(集团)有限公司</div>
		tbody = sel.xpath('//div[@align="center"]')
		tbody_td_list = tbody.xpath('.//text()').extract()
		# print(tbody_td_list)
		length = len(tbody_td_list)
		all_info = []
		jidu_info = []
		ret_info_list = []
		name = self.self_dict[self.stock_num]
		# print('length = ', length)
		# Walk the flat text list: a holder row starts at a rank cell
		# (digit < 13) whose third-following cell contains '.'; a non-digit
		# five cells ahead marks the end of a quarter.
		# NOTE(review): the `i += 3` / `i += 1` below are C-style skips with
		# no effect on a Python range() loop variable; the guard condition
		# appears to prevent duplicate rows anyway — verify.
		for i in range(length):
			# print('i = %d date_cur=%d' % (i, date_cur))
			if tbody_td_list[i].isdigit() and int(tbody_td_list[i]) < 13 and date_cur < len(date_list_txt) and i+3 < length and '.' in tbody_td_list[i+3] :
				# (stock_number varchar(20), cname varchar(20),gudong_name varchar(100), hold_num varchar(20), percent varchar(20), ichange varchar(20), date varchar(20), stock_type varchar(20))
				gd_info = [self.stock_num, name, tbody_td_list[i+1], tbody_td_list[i+2], tbody_td_list[i+3], '-', date_list_txt[date_cur], '流通A股']
				jidu_info.append(gd_info)
				# print(gd_info)
				if (i+5 < length) and tbody_td_list[i+5].isdigit() == False :
					all_info.append(jidu_info)
					jidu_info = [] 
					date_cur += 1
				i += 3
			i += 1
		
		# (removed: commented-out debug dump of all_info to temp.txt)
					
		# print('all_info len = ', len(all_info))
		# jidu_ptr = 0
		cacl_times = len(all_info) - 1
		
		# Fill in the change column by diffing consecutive quarters.
		self.reCalcGDData(cacl_times, all_info)
		# (removed: commented-out inline copy of reCalcGDData and a second
		#  debug dump of all_info to over.txt)
		
		# Persist every quarter's holder list.
		for info in all_info:
			stockSql.store_old_lt_gudong_db(info)
				

	def parse(self, response):
		"""Entry point: extract the 6-digit A-share code from the response
		URL and hand off to checkLTGD.
		"""
		# NOTE(review): '[0,2]' also matches a literal comma; '[02]' was
		# probably intended.  Harmless here since URLs contain no commas.
		pattern = re.compile(r'60[0-3]\d{3}|00[0,2]\d{3}|300\d{3}') 
		match = pattern.search(str(response)) 
		if match:
			self.stock_num = match.group()
			self.checkLTGD(response)
# Example #9
# 0
class ThsgudongSpider(scrapy.Spider):
    """Spider for 10jqka.com.cn (Tonghuashun) holder pages: scrapes the
    shareholder-count history and the quarterly top-10 circulating
    shareholders, storing both via stockSql helpers.
    """
    name = "ths"
    allowed_domains = ['10jqka.com.cn']
    start_urls = [
        # 'http://stockpage.10jqka.com.cn/300575/holder/'
        #'http://stockpage.10jqka.com.cn/300371/holder/'
    ]

    # Start URLs are pre-generated into ths_urls.txt (see generateThsUrls);
    # this block runs once, at class-definition time.
    with open(globalPars.getToolBaseWorkDirectory() + 'ths_urls.txt',
              'r') as f:
        for line in f.readlines():
            start_urls.append(line.strip())

    #print(start_urls)

    # Shareholder-count disclosure

    def checkLTNumber(self, response):
        """Parse the shareholder-count history table and store one
        (stock, name, count, change%, date) record per period via
        stockSql.store_lt_gudong_num_db().
        """
        all_info = []
        sel = scrapy.selector.Selector(response)
        # <div class="td_w">4.73</div>
        data_list = sel.xpath('//div[@class="td_w"]/text()').extract()
        # print('len = ', len(data_list))
        length = len(data_list)
        # First half of the extracted cells are dates, second half counts.
        half = (length // 2) - 1
        date_list = data_list[0:half]
        num_list = data_list[half + 1:length - 1]

        body = sel.xpath('//table[@class="tbody"]')

        info_list_text = body.xpath('.//td').extract()

        for i in range(half):
            # ret1 = info_list_text[i].replace('</td>', '').replace('</div>', '').split('>')[-1]
            number = num_list[i]
            # Strip the HTML wrapper to get the bare change-percent text.
            ichange_percent = info_list_text[i + half + 1].replace(
                '</td>', '').replace('</span>', '').split('>')[-1]
            date = date_list[i]
            gd_num_info = (self.stock_num, self.cName, number, ichange_percent,
                           date)
            # print(gd_num_info)
            all_info.append(gd_num_info)
        # print(all_info)
        stockSql.store_lt_gudong_num_db(all_info)

    # Holding-count change compared with the previous period
    def checkIchange(self, str):
        """Extract the holding-change value from one raw <td> HTML string.

        Returns the cleaned cell text, prefixed with '+' when the cell
        carries the 'upcolor' (increase) class.
        NOTE(review): the parameter name shadows the builtin `str`.
        """
        # Strip markup and whitespace, then split on the closing span tag.
        ret = str.replace('\r\n', '').replace('\t', '').replace(
            '</s>', '').replace('<s>', '').replace('<td>', '').replace(
                '</td>', '').strip().replace(' ',
                                             '').replace('spanclass=',
                                                         '').split('</span>')
        if (len(ret) == 1):
            str = ret[0]
        else:
            str_temp1 = ret[0].split('>')
            if '.' in str_temp1[-1]:
                #            print(str_temp1[-1])
                str_tmp2 = str_temp1[-2].replace('<', '').replace('"', '')
                #            print(str_tmp2)
                # 'upcolor' marks an increase -> prefix with '+'.
                if (str_tmp2 == 'upcolor'):
                    str = '+' + str_temp1[-1]
                else:
                    str = str_temp1[-1]
            else:
                str = str_temp1[-1]
        return str

    # Circulating shareholder data
    def checkLTGD(self, response):
        """Parse the quarterly top-10 circulating-shareholder tables and
        store one record per holder; skips the page entirely when the most
        recent quarter is already present in the database.
        """
        sel = scrapy.selector.Selector(response)
        LTGD_body = sel.xpath('//div[@id="flowholder"]')
        date_time = LTGD_body.xpath(
            './/li/a[@class="fdates"]/text()').extract()
        # print(date_time)
        all_info = []
        date_length = len(date_time)
        if date_time:
            # Already stored?  Then skip re-parsing this page.
            ret = stockSql.query_lt_gudong_info_exist(self.stock_num,
                                                      date_time[0])
            # print('query stock(%s) date(%s) ret %s' % (self.stock_num, date_time[0], ret))
            if ret:
                return 0

        cur_seq = 0
        # One table per quarter, one row per holder.
        one_jidu_info_divs = LTGD_body.xpath(
            './/table[@class="m_table m_hl ggintro"]')
        # print("gudong info parts %d" % len(one_jidu_info_divs))
        for one_jidu_info in one_jidu_info_divs:
            one_gudong_info_divs = one_jidu_info.xpath('.//tbody/tr')
            # print("gudong info member %d" % len(one_gudong_info_divs))
            # print(one_gudong_info_divs)
            for one_gudong_info in one_gudong_info_divs:
                name = one_gudong_info.xpath(
                    './/th[@class="tl holder_th"]/a/text()').extract()
                info = one_gudong_info.xpath('.//td/text()').extract()
                # print(info)
                info_list = one_gudong_info.xpath('.//td')
                # print('info_list = ', info_list)
                info_list_text = one_gudong_info.xpath('.//td').extract()
                # print('name = ', name)
                # print('info_list_text = ', info_list_text)
                if name and info_list_text and info_list:
                    stock_number = self.stock_num
                    Cname = self.cName
                    gdname = name[0]
                    hold_num = info_list[0].xpath('.//text()').extract()[0]
                    # percent = info_list[2].xpath('.//text()').extract()
                    percent = info[2]
                    if percent.strip() == '':
                        percent = '-'
                    ichange = self.checkIchange(info_list_text[1])
                    date = date_time[cur_seq]
                    stock_type = info_list[4].xpath('.//text()').extract()[0]
                    gdinfo = (stock_number, Cname, gdname, hold_num, percent,
                              ichange, date, stock_type)
                    all_info.append(gdinfo)
                    # print(gdinfo)
            cur_seq += 1

        stockSql.store_lt_gudong_db(all_info)

    def checkCName(self, response):
        """Cache the company's Chinese display name from the page header."""
        sel = scrapy.selector.Selector(response)
        self.cName = sel.xpath('//strong/text()').extract()[0]
        # print('cName = ', self.cName)

    def parse(self, response):
        """Entry point: extract the 6-digit A-share code from the response
        URL, then scrape company name, shareholder counts and holder tables.
        """
        # NOTE(review): '[0,2]' also matches a literal comma; '[02]' was
        # probably intended.  Harmless here since URLs contain no commas.
        pattern = re.compile(r'60[0-3]\d{3}|00[0,2]\d{3}|300\d{3}')
        match = pattern.search(str(response))
        if match:
            self.stock_num = match.group()
            # print("stock_num =", str(self.stock_num))
            self.checkCName(response)
            self.checkLTNumber(response)
            self.checkLTGD(response)