def start(self):
    """Store every six-cell row of the ``gmbdtable`` table for this code.

    Loads ``self.url`` through the web driver, walks the ``<tr>`` rows of
    the table body and, for each row that has exactly six ``<td>`` cells,
    stores ``[self.code, <cell texts...>]`` keyed by code and the text of
    the first cell (used as the date).  Rows with any other cell count are
    ignored.  The number of stored rows is logged at the end.
    """
    self.initialization("info_sizechange", "jj_info_sizechange")
    page = self._webdriver.get(self.url)
    rows = page.find('div', id="gmbdtable").find("tbody").find_all('tr')

    stored = 0
    for row in rows:
        cells = row.find_all('td')
        if len(cells) != 6:
            # Not a data row (wrong number of columns): nothing to store.
            continue
        stored += 1
        record = [self.code] + [cell.text for cell in cells]
        lookup = ("code", self.code, "date", cells[0].text)
        self.storage(record, lookup)

    printf("jj_info_sizechange storage ,CODE:%s,AMOUNT:%s" % (self.code, stored))
def start(self):
    """Store every five-cell row of the ``cyrjgtable`` table for this code.

    Loads ``self.url`` through the web driver, walks the ``<tr>`` rows of
    the table body and, for each row that has exactly five ``<td>`` cells,
    stores ``[self.code, <cell texts...>]`` keyed by code and the text of
    the first cell (used as ``gonggao_date``).  Other rows are ignored.
    The number of stored rows is logged at the end.
    """
    self.initialization("info_holder_struct", "jj_info_holder_struct", PROJECT)
    page = self._webdriver.get(self.url)
    rows = page.find('div', id="cyrjgtable").find("tbody").find_all('tr')

    stored = 0
    for row in rows:
        cells = row.find_all('td')
        if len(cells) != 5:
            # Not a data row (wrong number of columns): nothing to store.
            continue
        stored += 1
        record = [self.code] + [cell.text for cell in cells]
        lookup = ("code", self.code, "gonggao_date", cells[0].text)
        self.storage(record, lookup)

    printf("jj_info_holder_struct storage ,CODE:%s,AMOUNT:%s" % (self.code, stored))
def start(self):
    """Store every row of the ``w782 comm tzxq`` table for this code.

    Loads ``self.url`` through the web driver and stores one record per
    ``<tr>`` of the table body: ``[self.code, <cell texts...>]`` keyed by
    code and the text of the first cell (used as ``report_date``).  No
    cell-count check is made, so a row without any ``<td>`` raises
    ``IndexError``.  The number of processed rows is logged at the end.
    """
    self.initialization("info_property_pz", "jj_info_property_pz")
    # Example page used while debugging:
    #   http://fund.eastmoney.com/f10/zcpz_002423.html
    page = self._webdriver.get(self.url)
    table = page.find("table", class_="w782 comm tzxq")
    rows = table.find("tbody").find_all('tr')

    stored = 0
    for row in rows:
        stored += 1
        cells = row.find_all('td')
        lookup = ("code", self.code, "report_date", cells[0].text)
        record = [self.code] + [cell.text for cell in cells]
        self.storage(record, lookup)

    printf("jj_info_property_pz storage ,CODE:%s,AMOUNT:%s" % (self.code, stored))
def start(self):
    """Store every row of the ``w782 comm cfxq`` table for this code.

    Each ``<tr>`` of the table body yields one record made of the code
    followed by the first five cell texts, with the fourth cell passed
    through ``get_digit``.  Records are keyed by code and the text of the
    second cell (``quanyidengji_date``).  The number of stored rows is
    logged at the end.
    """
    self.initialization("info_fh", "jj_info_fh_peisong", PROJECT)
    page = self._webdriver.get(self.url)
    rows = page.find('table', class_='w782 comm cfxq').find('tbody').find_all('tr')

    stored = 0
    for row in rows:
        cells = row.find_all('td')
        day = cells[1].text
        record = [
            self.code,
            cells[0].text,
            day,
            cells[2].text,
            get_digit(cells[3].text),
            cells[4].text,
        ]
        self.storage(record, ('code', self.code, 'quanyidengji_date', day))
        stored += 1

    printf("jj_info_fh_peisong storage ,CODE:%s,AMOUNT:%s" % (self.code, stored))
def start(self):
    """Scrape every page of the ``w782 comm lsjz`` table and store its rows.

    The page at ``self.url`` is opened in the browser and the pager inside
    ``div#pagebar`` is used to walk the listing: the second-to-last label
    holds the total page count, ``label.cur`` the current page and the last
    label is the "next" button.

    For every table row, ``[self.code, <first seven cell texts>]`` is stored
    through ``self._mysqlclient.storage`` keyed by code and the first cell
    (``jz_date``).  The number of stored rows is logged at the end.

    Unlike a plain ``while current < total`` loop, the current page is
    always scraped *before* deciding whether to advance, so listings that
    consist of a single page are stored too, and "next" is never clicked
    once the last page has been processed.
    """
    self.initialization('info_history', 'jj_history_income')
    self._webdriver.get(self.url)

    find = self._webdriver.find_element_by_xpath
    next_botton = "//div[@id='pagebar']//label[last()]"
    current_xpath = "//div[@id='pagebar']//label[@class='cur']"
    pages_num = int(find("//div[@id='pagebar']//label[last()-1]").text)

    amount = 0
    while True:
        current_page = int(find(current_xpath).text)
        # Give the page time to finish rendering before reading the table.
        time.sleep(3)
        soup = BeautifulSoup(self._webdriver._brower.page_source, 'lxml')
        rows = soup.find(
            'table', class_="w782 comm lsjz").find('tbody').find_all('tr')
        for row in rows:
            cells = row.find_all('td')
            if len(cells) < 7:
                # Placeholder / malformed row: there is nothing to store.
                continue
            values = [self.code] + [cells[i].text for i in range(7)]
            self._mysqlclient.storage(
                values, ('code', self.code, 'jz_date', cells[0].text))
            amount += 1

        if current_page >= pages_num:
            # Last page has been scraped; do not click "next" again.
            break

        botton = find(next_botton)
        time.sleep(3)
        botton.click()
        time.sleep(3)

    printf('jj_history_income storage CODE:%s,AMOUNT:%s' % (self.code, amount))
def start(self):
    """Store the star counts of every five-cell row of ``fundgradetable``.

    For each ``<tr>`` of the table body with exactly five cells, the first
    cell text is taken as the day and the number of ``★`` characters in
    each of the remaining four cells is counted.  The record
    ``[self.code, day, <four counts>]`` is stored keyed by code and
    ``pj_date``.  Rows with another cell count are reported through
    ``printf`` and skipped.  The number of stored rows is logged at the end.
    """
    self.initialization("info_level", "jj_info_level")
    page = self._webdriver.get(self.url)
    rows = page.find('table', id='fundgradetable').find('tbody').find_all('tr')

    stored = 0
    for row in rows:
        cells = row.find_all('td')
        if len(cells) != 5:
            printf("jj_info_level", "Struction Error!")
            continue
        day = cells[0].text
        star_counts = [cell.text.count("★") for cell in cells[1:]]
        record = [self.code, day] + star_counts
        self.storage(record, ('code', self.code, 'pj_date', day))
        stored += 1

    printf("jj_info_level storage,CODE:%s,AMOUNT:%s" % (self.code, stored))
def jj_info_company(self, source):
    """Extract the company record from ``source`` and store it.

    ``source`` must expose BeautifulSoup-style ``find``; the rows of the
    table inside the ``first-block`` div are read cell by cell (selected by
    their CSS class) into one flat record that starts with ``self.code``
    and the company name.  The record is stored through
    ``self._mysqlclient[0]`` keyed by code and name.

    Returns:
        The company name (text of the first row's ``category-value`` cell).
    """
    rows = source.find('div', "first-block").find('tbody').find_all('tr')

    def cell_text(row_index, css_class):
        # Text of the <td> with the given class inside the given row.
        return rows[row_index].find('td', class_=css_class).text

    plain = "category-value"
    fixed = "category-value fixed-width"
    attached = "category-value attached-value"

    company = cell_text(0, plain)
    record = [self.code, company]

    # Main value of rows 1..10, in row order.
    for row_index, css_class in (
        (1, plain),
        (2, plain),
        (3, plain),
        (4, "category-value attached-value fixed-width"),
        (5, plain),
        (6, plain),
        (7, plain),
        (8, plain),
        (9, fixed),
        (10, plain),
    ):
        record.append(cell_text(row_index, css_class))

    # Rows 11 and 12 hold free text: strip spaces and line breaks.
    for row_index, css_class in ((11, plain), (12, fixed)):
        record.append(
            cell_text(row_index, css_class).replace(' ', '').replace('\n', ''))

    # Secondary ("attached") value of selected rows.
    for row_index in (3, 4, 8, 9, 12):
        record.append(cell_text(row_index, attached))

    self.storage(self._mysqlclient[0], record, ('code', self.code, 'name', company))
    printf("jj_info_company storage CODE:%s" % self.code)
    return company
def init(self):
    """Create the driver and storage clients and initialise all Redis keys.

    Sets ``self._webdriver``, ``self._redisclient``,
    ``self._redisclient_code`` and ``self._mysqlclient``, then calls
    ``init()`` on the two basic-info Redis clients and on a fresh
    ``RedisClient`` for every spider name listed below, both with and
    without the ``_code`` suffix.
    """
    printf("Main Spider Initization!")
    self._webdriver = Webdriver()

    basic_key = "info_basic_info"
    self._redisclient = RedisClient(basic_key)
    self._redisclient_code = RedisClient(basic_key + "_code")
    self._mysqlclient = Mysql_Client("jj_basic_list")

    # Redis server initialisation for the main spider itself.
    self._redisclient.init()
    self._redisclient_code.init()

    spider_keys = (
        'info_manager',
        'info_company',
        'info_level',
        'info_history',
        'info_fh',
        'info_holds',
        'info_bets_holds',
        'info_holds_trend',
        'info_trade_pz',
        'info_trade_compare',
        'info_property_pz',
        'info_changes',
        'info_sizechange',
        'info_holder_struct',
        'info_all_gonggao',
        'info_finance_target',
        'info_property_bets',
        'info_profit',
        'info_income',
        'info_cost',
        'info_purchase_info',
        'info_swich_info',
    )
    for key in spider_keys:
        for suffix in ("", "_code"):
            RedisClient(key + suffix).init()
def start(self):
    """Page through the ``pagebar`` pager, counting the pages visited.

    No table data is extracted or stored here; the loop only clicks the
    "next" label until the page number read at the start of an iteration
    reaches the total shown in the second-to-last pager label, then logs
    how many iterations ran.

    NOTE(review): the log line is labelled ``jj_info_bets_holds`` although
    this spider is initialised for ``jj_info_holds_trend`` -- looks like a
    copy-paste leftover; confirm before changing.
    """
    self.initialization("info_holds_trend", "jj_info_holds_trend")
    self._webdriver.get(self.url)

    locate = self._webdriver.find_element_by_xpath
    next_xpath = "//div[@id='pagebar']//label[last()]"
    cur_xpath = "//div[@id='pagebar']//label[@class='cur']"

    last_page = locate("//div[@id='pagebar']//label[last()-1]").text
    seen_page = locate(cur_xpath).text

    visited = 0
    # ``seen_page`` is refreshed *before* each click, so the comparison uses
    # the page number of the previous iteration.
    while int(seen_page) < int(last_page):
        visited += 1
        seen_page = locate(cur_xpath).text
        time.sleep(3)
        next_label = locate(next_xpath)
        time.sleep(3)
        next_label.click()
        time.sleep(3)

    printf("jj_info_bets_holds storage,CODE:%s,AMOUNT:%s" % (self.code, visited))
def jj_info_company_admin(self, company):
    """Scrape every page of the ``gcglBody`` table and store its entries.

    The pager inside ``div#gcglPager`` drives the walk: the link in the
    second-to-last ``<li>`` holds the total page count, the ``active``
    ``<li>`` the current page.  After the header row, table rows come in
    pairs: the first row of a pair carries four cells (name plus three
    further fields), the second row carries free text that is stored with
    spaces and line breaks removed.  Each pair is stored through
    ``self._mysqlclient[2]`` keyed by company and name.

    The current page is always scraped before deciding whether to advance,
    so a listing that fits on a single page is stored as well, and "next"
    is not clicked after the last page.

    Args:
        company: Company name used as the first field and as part of the
            storage key.

    NOTE(review): the log line is labelled ``jj_info_company_honor`` --
    possibly a copy-paste leftover; left unchanged, confirm intent.
    """
    find = self._webdriver.find_element_by_xpath
    next_botton = "//div[@id='gcglPager']//ul//li//a[@class='next ttjj-iconfont']"
    current_xpath = "//div[@id='gcglPager']//li[@class=' active']"
    pages_num = int(find("//div[@id='gcglPager']//li[last()-1]//a").text)

    amount = 0
    while True:
        current_page = int(find(current_xpath).text)
        # Give the page time to finish rendering before reading the table.
        time.sleep(3)
        soup = BeautifulSoup(self._webdriver._brower.page_source, 'lxml')
        tags = soup.find('div', id="gcglBody").find('tbody').find_all('tr')

        # tags[0] is the header; the rest are (fields row, text row) pairs.
        for index in range(len(tags[1:]) // 2):
            info = tags[2 * index + 1].find_all('td')
            name = info[0].text
            values = [
                company,
                name,
                info[1].text,
                info[2].text,
                info[3].text,
                tags[2 * index + 2].text.replace(' ', '').replace('\n', ''),
            ]
            sel = ('company', company, 'name', name)
            self.storage(self._mysqlclient[2], values, sel)
            amount += 1

        if current_page >= pages_num:
            # Last page has been scraped; do not click "next" again.
            break

        botton = find(next_botton)
        time.sleep(3)
        botton.click()
        time.sleep(3)

    printf("jj_info_company_honor,CODE:%s,AMOUNT:%s" % (self.code, amount))
def jj_info_manager_changes(self, source):
    """Store the manager-change rows and collect the manager-history rows.

    Two tables are read from ``source`` (BeautifulSoup-style object):

    * the first table inside ``div.box``: every row after the header is
      stored through ``self._mysqlclient[0]`` as
      ``[self.code, <first five cell texts>]`` keyed by code, first cell
      (``start_date``) and third cell (``manager``);
    * the last ``w782 comm jloff`` table: the first nine cell texts of each
      body row are collected but *not* stored here.

    Returns:
        A pair ``(value3s, sel3s)``: the collected history rows and, for
        each of them, the partial key ``['code', <first cell text>,
        'manager']``.
    """
    change_rows = source.find('div', class_='box').find('table').find_all('tr')
    stored = 0
    for row in change_rows[1:]:
        cells = row.find_all('td')
        record = [self.code] + [cells[i].text for i in range(5)]
        lookup = ('code', self.code,
                  'start_date', cells[0].text,
                  'manager', cells[2].text)
        self.storage(self._mysqlclient[0], record, lookup)
        stored += 1
    printf("jj_info_manager_changes storage,CODE:%s,AMOUNT:%s" % (self.code, stored))

    # jj_info_manager_history: collected for the caller, not stored here.
    history_table = source.find_all('table', class_='w782 comm jloff')[-1]
    history_rows = history_table.find('tbody').find_all('tr')
    value3s = []
    sel3s = []
    for row in history_rows:
        cells = row.find_all('td')
        fund_code = cells[0].text
        value3s.append([fund_code] + [cells[i].text for i in range(1, 9)])
        sel3s.append(['code', fund_code, 'manager'])
    return value3s, sel3s
def start(self):
    """Scrape every announcement of every pager page, including its body.

    For each page number from 1 to the total shown in the second-to-last
    ``pagebar`` label, the matching page label is clicked (falling back to
    the "next" label if that fails, after logging the error).  Every
    three-cell row of the ``ggtable`` table is then turned into a record
    ``[self.code, title, type, date, body]`` where ``body`` is the text of
    ``pre#jjggzwcontentbody`` on the linked detail page, passed through
    ``special_repace``.  The detail page is opened in a second browser
    window which is always closed again, even if reading it fails.

    Records are stored keyed by code, title, date and type; the number of
    stored records is logged at the end (each record counted once).

    NOTE(review): waiting for the detail body has no timeout, as in the
    original code -- a page that never renders the ``<pre>`` blocks forever.
    """
    self.initialization("info_all_gonggao", "jj_info_all_gonggao")
    self._webdriver.get(self.url)

    browser = self._webdriver._brower
    find = self._webdriver.find_element_by_xpath
    js = "window.open('%s');"
    next_botton = "//div[@id='pagebar']//label[last()]"
    current_xpath = "//div[@id='pagebar']//label[@value='%s']"
    pages_num = int(find("//div[@id='pagebar']//label[last()-1]").text)

    def strip_ws(text):
        # Titles, types and dates are stored without spaces or line breaks.
        return text.replace(' ', '').replace('\n', '')

    def find_body():
        # Re-parse the detail page and look for the announcement body.
        return BeautifulSoup(browser.page_source, "lxml").find(
            'pre', id='jjggzwcontentbody')

    amount = 0
    for current_page in range(1, pages_num + 1):
        try:
            find(current_xpath % current_page).click()
        except Exception as e:
            # Best effort: the numbered label may be missing from the pager,
            # in which case the "next" label is used instead.
            logging_except(e)
            find(next_botton).click()
        time.sleep(3)

        rows = BeautifulSoup(browser.page_source, 'lxml').find(
            'div', id='ggtable').find('tbody').find_all('tr')
        for tr in rows:
            tds = tr.find_all('td')
            if len(tds) != 3:
                continue
            title = strip_ws(tds[0].text)
            report_type = strip_ws(tds[1].text)
            date = strip_ws(tds[-1].text)
            href = tds[0].find('a')['href']
            values = [self.code, title, report_type, date]

            # Open the announcement in a second window and read its body.
            browser.execute_script(js % href)
            browser.switch_to_window(browser.window_handles[1])
            try:
                time.sleep(3)
                body = find_body()
                while body is None:
                    time.sleep(3)
                    body = find_body()
                values.append(special_repace(body.text))
                time.sleep(3)
            finally:
                # Always return to the listing window so that a failure on
                # one detail page does not leave a stray window behind.
                browser.close()
                browser.switch_to_window(browser.window_handles[0])

            sel = ("code", self.code, "title", title,
                   'date', date, 'type', report_type)
            self.storage(values, sel)
            amount += 1

    printf("jj_info_all_gonggao storage ,CODE:%s,AMOUNT:%s" % (self.code, amount))
def db_initization(spider_name):
    """(Re)create the MySQL database described by ``db_struc/<name>.xlsx``.

    WARNING: every existing table of the database is dropped first.

    Steps:

    1. Create the database ``spider_name`` if it does not exist yet.
    2. Drop all tables it currently contains.
    3. For each sheet of the workbook create one table (all columns are
       ``VARCHAR(255)`` plus an auto-increment ``id``):

       * sheet ``target_urls`` (master table): the first row holds the
         column names and every further row is inserted as data;
       * any other sheet (slave table): column 0 holds the column comments
         and column 1 the column names, both below a header row.
    4. Run the extra SQL registered for ``spider_name`` in ``ext_SQL``,
       if any.

    Row values are passed to the driver as query parameters (stringified
    exactly as ``'%s' % value`` would), so cells containing quotes or
    backslashes no longer break the INSERT statement, and all rows of the
    master table are inserted through a single connection.

    Args:
        spider_name: Database name and base name of the workbook file.

    NOTE(review): like the rest of this file, this relies on
    ``with pymysql.connect(...) as cursor`` yielding a cursor -- confirm
    against the installed PyMySQL version.
    """
    # Check whether the database exists; create it when missing.
    with pymysql.connect(**MYSQL_TMP) as cursor:
        cursor.execute("SHOW DATABASES;")
        DBs = [i[0] for i in cursor.fetchall()]
        if spider_name not in DBs:
            cursor.execute("CREATE DATABASE %s CHARSET='utf8';" % spider_name)
            printf("Create database:%s" % spider_name)

    MYSQL = MYSQL_conf(spider_name)

    # Start from an empty database.
    with pymysql.connect(**MYSQL) as cursor:
        cursor.execute("SHOW TABLES;")
        Tables = cursor.fetchall()
        for table in Tables:
            # ``table`` is a one-element row, so ``%`` formats its name.
            cursor.execute("DROP TABLE %s;" % table)
            printf("DROP TABLE %s;" % table)
        printf("ALL TABLES CLEAR!")

    db_config = xlrd.open_workbook("db_struc/%s.xlsx" % spider_name)
    for sheet_name in db_config.sheet_names():
        sheet = db_config.sheet_by_name(sheet_name)
        nrows = sheet.nrows
        create_sql = (
            "CREATE TABLE %s (`id` INT PRIMARY KEY AUTO_INCREMENT" % sheet_name)

        if sheet_name == 'target_urls':
            # Master table: first row = column names, other rows = data.
            headers = [cell.value for cell in sheet.row(0)]
            for head in headers:
                create_sql = create_sql + ",`" + head + "` VARCHAR(255) "
            create_sql += ");"
            with pymysql.connect(**MYSQL) as cursor:
                cursor.execute(create_sql)
                printf("CREATE TABLE %s" % sheet_name)

            # Insert the data rows.  Identifiers cannot be bound as
            # parameters, so only the values use placeholders; a literal
            # ``%`` in the identifier part is doubled because the driver
            # applies ``%``-formatting to the whole statement.
            columns = ','.join("`%s`" % head for head in headers)
            insert_prefix = "INSERT INTO %s (%s) VALUES(" % (sheet_name, columns)
            insert_sql = (insert_prefix.replace('%', '%%')
                          + ','.join(['%s'] * len(headers)) + ");")
            with pymysql.connect(**MYSQL) as cursor:
                for row_index in range(1, nrows):
                    row_values = [str(cell.value) for cell in sheet.row(row_index)]
                    cursor.execute(insert_sql, row_values)
        else:
            # Slave table: column 0 = comments, column 1 = column names.
            comments = [cell.value for cell in sheet.col(0)[1:]]
            headers = [cell.value for cell in sheet.col(1)[1:]]
            for i in range(nrows - 1):
                create_sql = (create_sql + ",`" + headers[i]
                              + "` VARCHAR(255) COMMENT '" + comments[i] + "'")
            create_sql += ");"
            with pymysql.connect(**MYSQL) as cursor:
                cursor.execute(create_sql)
                printf("CREATE TABLE %s" % sheet_name)

    # Optional spider-specific SQL, run once all tables exist.
    ext = ext_SQL.get(spider_name)
    if ext:
        with pymysql.connect(**MYSQL) as cursor:
            cursor.execute(ext)
    printf('Database intiazation completed!')