def setupOgre(self, pluginCfgPath="./Plugins.cfg", ogreCfgPath="./ogre.cfg", logPath="./ogre.log"):
    """Initialise the Ogre3D root, scene manager and embedded main window."""
    # Pick the platform-specific plugin configuration file.
    if platform.system() == "Windows":
        pluginCfgPath = "./Plugins-windows.cfg"
    else:
        pluginCfgPath = "./Plugins-linux.cfg"

    og_root = og.Root(pluginCfgPath, ogreCfgPath, logPath)
    self.ogreRoot = og_root

    # Reuse the stored render settings; otherwise ask the user via the dialog.
    if not self.ogreRoot.restoreConfig() and not self.ogreRoot.showConfigDialog():
        sys.exit('Quit from Config Dialog')
    og_root.initialise(False)

    # Scene manager for the main editing window, with a render-queue listener
    # used to draw pivots.
    self.pivotRenderQueueListener = PivotRenderQueueListener()
    self.OgreMainWinSceneMgr = self.ogreRoot.createSceneManager(og.ST_GENERIC, "OgreMainWinSceneMgr")
    self.OgreMainWinSceneMgr.ambientLight = og.ColourValue(4, 4, 4)
    self.OgreMainWinSceneMgr.addRenderQueueListener(self.pivotRenderQueueListener)

    self.moduleName = ""
    self.myTerrainManager = MyTerrainManager(self.OgreMainWinSceneMgr)
    self.moduleManager = ModuleManager(self.ogreRoot, self.OgreMainWinSceneMgr)
    self.moduleManager.myTerrainManager = self.myTerrainManager
    self.gocManager = self.moduleManager.gocManager

    # Build the Qt widget hierarchy around the Ogre render window.
    self.ogreMainWindow = OgreMainWindow.OgreMainWindow(self.moduleManager, og_root, self.OgreMainWinSceneMgr, self)
    self.gridlayout.addWidget(self.ogreMainWindow, 0, 0, 1, 1)
    self.hboxlayout.addLayout(self.gridlayout)
    self.setCentralWidget(self.centralwidget)
    self.myTerrainManager.ogreMainWindow = self.ogreMainWindow

    # Forward Ogre's default log output into the in-application console.
    oglog = og.LogManager.getSingleton().getDefaultLog()
    oglog.addListener(self.consoleWindow.lockenLog)
def __init__( self ):
    """Load the bot configuration, open the config database, connect and
    enable every available module."""
    self.last_msg = -1
    self.msg_flood_limit = 0.25

    # Server definitions live in a JSON file next to this module.
    with open(os.path.join(os.path.dirname(__file__), 'ircbot.conf')) as conf_file:
        data = json.load(conf_file)
    self.servers = data['servers']
    self.select_server(0)

    # Open the sqlite database and make sure the config table exists.
    self.db = sqlite3.connect(os.path.join(os.path.dirname(__file__), 'ircbot.sqlite3'), check_same_thread=False)
    cursor = self.db.cursor()
    try:
        cursor.execute('select * from config limit 1')
    except sqlite3.OperationalError:
        # table no exist
        cursor.execute('create table config ( `group` varchar(100), `key` varchar(100), `value` varchar(100) NULL )')
    cursor.close()

    self.modules = ModuleManager(self)
    self.channel_ops = {}

    srv = self.current_server
    server = srv['host']
    port = srv['port'] if 'port' in srv else 6667
    password = srv['password'] if 'password' in srv else ''
    nickname = srv['nickname']

    # Only pass the password along when one is configured.
    if len(password):
        SingleServerIRCBot.__init__(self, [(server, port, password)], nickname, nickname, ipv6=True)
    else:
        SingleServerIRCBot.__init__(self, [(server, port)], nickname, nickname, ipv6=True)

    # Switch every available module on.
    for module_name in self.modules.get_available_modules():
        self.modules.enable_module(module_name)
def initConfig(self):
    """Initialise the module configuration.

    Module configuration usually comes in two flavours: keyword based
    and url based.
    """
    if not self.config_dict:
        raise Exception("配置列表为空,请检查!")
    # Manager that tracks the module set of the current crawl mode.
    self.module_manager = ModuleManager()
    self.holder.logging.info("加载模块配置信息")
    for mode, init_functions in self.config_dict.items():
        self.module_manager.switchToMode(mode)
        for init_function in init_functions:
            # Initialisers declared with two arguments receive the manager.
            arg_names = inspect.getargspec(init_function).args
            if len(arg_names) == 2:
                init_function(self.module_manager)
            else:
                init_function()
class Main(object):
    """Application entry point: owns the module and thread managers and
    runs the scheduling loop until deactivated."""

    def __init__(self):
        self.module_manager = ModuleManager(self)
        self.thread_manager = ThreadManager()
        # The main loop keeps spinning while this flag stays set.
        self.active = True

    def start(self):
        """Start all modules and pump the scheduler until shut down."""
        self.module_manager.start_modules()
        while True:
            if not self.active:
                break
            self.thread_manager.check_scheduled()
        if not self.active:
            self.__exit()

    def __exit(self):
        """Stop all worker threads and terminate the process."""
        self.thread_manager.stop_all()
        # Hacky way to get a clean exit when exiting through the display exit button:
        os._exit(0)
def __init__( self ):
    """Read the bot configuration, prepare the config database, connect to
    the selected server and enable every non-blacklisted module."""
    logging.info('Bot __init__')
    self.last_msg = -1
    self.msg_flood_limit = 0.25

    # Server definitions come from a JSON file beside this module.
    with open(os.path.join(os.path.dirname(__file__), 'ircbot.conf')) as conf_file:
        data = json.load(conf_file)
    self.servers = data['servers']
    self.select_server(0)

    # Open the sqlite database and create the config table on first run.
    self.db = sqlite3.connect(os.path.join(os.path.dirname(__file__), 'ircbot.sqlite3'), check_same_thread=False)
    cursor = self.db.cursor()
    try:
        cursor.execute('select * from config limit 1')
    except sqlite3.OperationalError:
        # table no exist
        cursor.execute('create table config ( `group` varchar(100), `key` varchar(100), `value` varchar(100) NULL )')
    cursor.close()

    modules_blacklist = data.get('blacklist', None)
    self.modules = ModuleManager(self, modules_blacklist)
    self.channel_ops = {}

    srv = self.current_server
    server = srv['host']
    port = srv.get('port', 6667)
    ssl_enabled = srv.get('ssl', False)
    ipv6_enabled = srv.get('ipv6', False)
    password = srv.get('password', '')
    nickname = srv['nickname']

    # Wrap the socket in SSL only when the server config asks for it.
    factory = irc.connection.Factory(wrapper=ssl.wrap_socket if ssl_enabled else lambda x: x, ipv6=ipv6_enabled)
    super(Bot, self).__init__([irc.bot.ServerSpec(server, port, password)], nickname, nickname, connect_factory=factory)
    self.connection.set_rate_limit(30)

    for module_name in self.modules.get_available_modules():
        self.modules.enable_module(module_name)
class Bot(irc.bot.SingleServerIRCBot):
    """The main brain of the IRC bot."""

    def __init__( self ):
        """Load configuration, set up the config database, connect to the
        selected server and enable all non-blacklisted modules."""
        logging.info('Bot __init__')
        self.last_msg = -1
        self.msg_flood_limit = 0.25
        # Server definitions come from a JSON file beside this module.
        with open(os.path.join(os.path.dirname(__file__), 'ircbot.conf')) as f:
            data = json.load(f)
        self.servers = data['servers']
        self.select_server(0)
        # Persistent per-bot settings live in a sqlite config table.
        self.db = sqlite3.connect( os.path.join( os.path.dirname( __file__ ), 'ircbot.sqlite3' ), check_same_thread = False )
        cursor = self.db.cursor()
        try:
            cursor.execute( 'select * from config limit 1' )
        except sqlite3.OperationalError:
            # table no exist
            cursor.execute( 'create table config ( `group` varchar(100), `key` varchar(100), `value` varchar(100) NULL )' )
        cursor.close()
        modules_blacklist = data.get('blacklist', None)
        self.modules = ModuleManager(self, modules_blacklist)
        self.channel_ops = {}
        server = self.current_server['host']
        port = self.current_server['port'] if 'port' in self.current_server else 6667
        ssl_enabled = self.current_server['ssl'] if 'ssl' in self.current_server else False
        ipv6_enabled = self.current_server['ipv6'] if 'ipv6' in self.current_server else False
        password = self.current_server['password'] if 'password' in self.current_server else ''
        nickname = self.current_server['nickname']
        # Wrap the socket in SSL only when the server config asks for it.
        factory = irc.connection.Factory(wrapper=ssl.wrap_socket if ssl_enabled else lambda x: x, ipv6=ipv6_enabled)
        super(Bot, self).__init__([irc.bot.ServerSpec(server, port, password)], nickname, nickname, connect_factory=factory)
        self.connection.set_rate_limit(30)
        for module_name in self.modules.get_available_modules():
            self.modules.enable_module( module_name )

    def select_server(self, index):
        """Make servers[index] the current server and cache its admin lists."""
        self.current_server = self.servers[index]
        self.admin = self.current_server['global_admins']
        self.admin_channels = self.current_server['admin_channels']

    def start( self ):
        """Start the IRC event loop."""
        logging.debug( 'start()' )
        super(Bot, self).start()

    def die( self ):
        """Unload all modules and disconnect from the server."""
        logging.debug( 'die()' )
        self.modules.unload()
        self.connection.disconnect( 'Bye, cruel world!' )
        #super(Bot, self).die()

    def __process_message(self, message):
        """Split *message* into IRC-safe chunks: strips CR/LF and cuts the
        text into at most MAX_MESSAGE_COUNT lines of MAX_LINE_LEN chars,
        appending a truncation marker when the text is longer."""
        for char in '\r\n':
            message = message.replace(char, '')
        MAX_MESSAGE_COUNT = 5
        MAX_LINE_LEN = 256
        m = []
        for i in range(0, len(message), MAX_LINE_LEN):
            if len(m) >= MAX_MESSAGE_COUNT:
                m.append('(message truncated) ...')
                break
            m.append(message[i:i + MAX_LINE_LEN])
        return m

    def notice( self, target, message ):
        """Send *message* to *target* as NOTICEs, chunked for length."""
        for m in self.__process_message(message):
            self.connection.notice(target, m)

    def privmsg( self, target, message ):
        """Send *message* to *target* as PRIVMSGs, chunked for length."""
        for m in self.__process_message(message):
            self.connection.privmsg(target, m)

    def action( self, target, message ):
        """Send *message* to *target* as CTCP ACTIONs, chunked for length."""
        for m in self.__process_message(message):
            self.connection.action(target, m)

    def __module_handle(self, handler, **kwargs):
        """Passed the "on_*" handlers through to the modules that support them"""
        handler = 'on_' + handler
        for (_ , module) in self.modules.get_loaded_modules():
            if hasattr(module, handler):
                try:
                    getattr(module, handler)(**kwargs)
                except Exception as e:
                    logging.debug('Module handler %s.%s failed: %s', _, handler, e)

    def __process_command( self, c, e ):
        """Process a message coming from the server."""
        message = e.arguments[0]
        # commands have to start with !
        # (startswith also guards against an empty message, which would have
        # raised IndexError on message[0])
        if not message.startswith('!'):
            return
        # strip the ! off, and split the message
        args = message[1:].split()
        # a bare '!' carries no command at all
        if not args:
            return
        # cmd is the first item
        cmd = args.pop(0).strip()
        # test for admin
        admin = e.source.userhost in self.admin
        if not admin:
            if e.target in self.admin_channels and e.target in self.channel_ops and e.source.nick in self.channel_ops[ e.target ]:
                admin = True
        # nick is the sender of the message, target is either a channel or the sender.
        source = e.source.nick
        target = e.target if is_channel(e.target) else source
        # see if there is a module that is willing to handle this, and make it so.
        logging.debug( '__process_command (src: %s; tgt: %s; cmd: %s; args: %s; admin: %s)', source, target, cmd, args, admin )
        # handle die outside of module (in case module is dead :( )
        if admin:
            if cmd == 'die':
                self.notice( source, 'Goodbye cruel world!' )
                raise BotExitException
            elif cmd == 'jump':
                self.jump_server()
            elif cmd == 'restart_class':
                raise BotReloadException
            # config commands
            elif cmd == 'get_config' and len( args ) <= 2:
                if len( args ) == 2:
                    try:
                        value = self.get_config( args[0], args[1] )
                        self.notice( source, 'config[{0}][{1}] = {2}'.format( args[0], args[1], value ) )
                    except Exception:  # narrowed from a bare except
                        self.notice( source, 'config[{0}][{1}] not set'.format( *args ) )
                elif len( args ) == 1:
                    try:
                        values = self.get_config( args[0] )
                        if len( values ) > 0:
                            self.notice( source, 'config[{}]: '.format( args[0] ) + ', '.join( [ '{}: "{}"'.format( k,v ) for ( k, v ) in values.items() ] ) )
                        else:
                            self.notice( source, 'config[{}] is empty'.format( args[0] ) )
                    except Exception:  # narrowed from a bare except
                        self.notice( source, 'config[{}] not set'.format( args[0] ) )
                else:
                    try:
                        self.notice( source, 'config groups: ' + ', '.join( self.get_config_groups() ) )
                    except Exception as e:
                        self.notice( source, 'No config groups: {}'.format( e ) )
            elif cmd == 'set_config' and len( args ) >= 2:
                # everything after group and key is the (space-joined) value;
                # with no value the setting is cleared instead
                if len( args ) >= 3:
                    config_val = ' '.join( args[2:] )
                else:
                    config_val = None
                try:
                    self.set_config( args[0], args[1], config_val )
                    self.notice( source, 'Set config setting' if config_val else 'Cleared config setting' )
                except Exception as e:
                    self.notice( source, 'Failed setting/clearing config setting: {0}'.format( e ) )
            # other base admin commands
            elif cmd == 'raw':
                self.connection.send_raw( ' '.join( args ) )
                return
            elif cmd == 'admins':
                self.notice( source, 'Current operators:' )
                self.notice( source, ' - global: {0}'.format( ' '.join( self.admin ) ) )
                for chan in [ chan for chan in self.admin_channels if chan in self.channel_ops ]:
                    self.notice( source, ' - {0}: {1}'.format( chan, ' '.join( self.channel_ops[ chan ] ) ) )
                return
        if cmd == 'help':
            if len( args ) > 0:
                if args[0] == 'module':
                    if len( args ) < 2:
                        pass
                    elif self.modules.module_is_loaded( args[1] ):
                        module = self.modules.get_module( args[1] )
                        self.notice( target, module.__doc__ )
                else:
                    # !help <command>: show the docstring of whichever module
                    # implements the command
                    for ( module_name, module ) in self.modules.get_loaded_modules():
                        if module.has_cmd( args[0] ):
                            self.notice( target, module.get_cmd( args[0] ).__doc__ )
            else:
                self.notice( target, '!help: this help text (send !help <command> for command help, send !help module <module> for module help)' )
                for ( module_name, module ) in [ lst for lst in self.modules.get_loaded_modules() if lst[1].has_commands and not lst[1].admin_only ]:
                    cmds = module.get_cmd_list()
                    self.notice( target, ' * {0}: {1}'.format( module_name, ', '.join( cmds ) if len( cmds ) > 0 else 'No commands' ) )
        elif admin and cmd == 'admin_help':
            if len( args ) > 0:
                for ( module_name, module ) in self.modules.get_loaded_modules():
                    if module.has_admin_cmd( args[0] ):
                        self.notice( source, module.get_admin_cmd( args[0] ).__doc__ )
            else:
                self.notice( source, '!admin_help: this help text (send !admin_help <command> for command help' )
                self.notice( source, '!die: kill the bot' )
                self.notice( source, '!raw: send raw irc command' )
                self.notice( source, '!admins: see who are admin' )
                self.notice( source, '!restart_class: restart the main Bot class' )
                for ( module_name, module ) in self.modules.get_loaded_modules():
                    cmds = module.get_admin_cmd_list()
                    if len( cmds ) > 0:
                        self.notice( source, ' * {0}: {1}'.format( module_name, ', '.join( cmds ) ) )
        else:
            # dispatch the command to the first module that implements it
            for ( module_name, module ) in self.modules.get_loaded_modules():
                try:
                    if module.has_cmd( cmd ):
                        lines = module.get_cmd( cmd )(args=args, source=source, target=target, admin=admin)
                        if lines:
                            for line in lines:
                                self.notice( target, line )
                    elif admin and module.has_admin_cmd( cmd ):
                        lines = module.get_admin_cmd(cmd)(args=args, source=source, target=target, admin=admin)
                        if lines:
                            for line in lines:
                                self.notice( source, line )
                except Exception as e:
                    logging.exception( "Module '{0}' handle error: {1}".format( module_name, e ) )

    def on_privmsg(self, c, e):
        """Hand a private message to the modules, then run command processing."""
        logging.debug("on_privmsg")
        source = e.source.nick
        target = e.target if is_channel( e.target ) else source
        message = e.arguments[0]
        self.__module_handle('privmsg', source=source, target=target, message=message)
        try:
            self.__process_command( c, e )
        except BotExitException as e:
            raise e
        except BotReloadException as e:
            # drop the connection and unload modules so the caller can rebuild us
            self.connection.disconnect( "Reloading bot..." )
            self.modules.unload()
            raise e
        except Exception as e:
            logging.exception( 'Error in __process_command: %s', e )

    def on_pubmsg(self, c, e):
        """Channel messages get the same treatment as private ones."""
        logging.debug("on_pubmsg")
        self.on_privmsg(c, e)

    def on_pubnotice(self, c, e):
        self.on_notice( c, e )

    def on_privnotice(self, c, e):
        self.on_notice(c, e)

    def on_notice(self, c, e):
        """Forward notices to the modules; the bot itself ignores them."""
        source = e.source
        target = e.target
        message = e.arguments[0]
        logging.debug('notice! source: {}, target: {}, message: {}'.format(source, target, message))
        self.__module_handle('notice', source=source, target=target, message=message)

    # The NAMES refreshes below keep self.channel_ops up to date via
    # on_endofnames whenever channel membership or modes change.
    def on_join(self, connection, event):
        self.connection.names([event.target])
        self.__module_handle('join', connection=connection, event=event)

    def on_part(self, c, e):
        self.connection.names([e.target])

    def on_kick(self, c, e):
        self.connection.names([e.target])

    def on_mode( self, c, e ):
        self.connection.names( [e.target] )

    def on_endofnames(self, c, e):
        """Refresh the cached operator list for the channel just NAMES'd."""
        channel, text = e.arguments
        if channel not in self.channels:
            return
        self.channel_ops[channel] = list(self.channels[channel].opers())

    # def on_nick(self, c, e):
    #     self.connection.names(self.channels.keys())

    def on_nicknameinuse( self, c, e ):
        """Gets called if the server complains about the name being in use. Tries to set the nick to nick + '_'"""
        logging.debug( "on_nicknameinuse" )
        c.nick( c.get_nickname() + "_" )

    def on_welcome(self, connection, event):
        """Join the configured channels once the server welcomes us."""
        for chan in self.current_server['channels']:
            connection.join( chan )
        self.__module_handle('welcome', connection=connection, event=event)

    def get_config_groups( self ):
        """Return the list of distinct config groups present in the database."""
        resultset = self.db.execute( 'select distinct `group` from config' )
        return [ g for ( g, ) in resultset.fetchall() ]

    def get_config( self, group, key = None, default = None ):
        """gets a config value

        With *key* == None, returns the whole *group* as a dict. Otherwise
        returns the single value, falling back to *default* when given and
        raising when the value is missing and no default was supplied.
        """
        logging.info( 'get config %s.%s', group, key )
        if key == None:
            resultset = self.db.execute( 'select `key`, `value` from config where `group` = :group', { 'group': group } )
            values = {}
            for ( key, value ) in resultset.fetchall():
                values[ key ] = value
            return values
        else:
            resultset = self.db.execute( 'select `value` from config where `group` = :group and `key` = :key', { 'group': group, 'key': key } )
            value = resultset.fetchone()
            if value == None:
                if default != None:
                    return default
                raise Exception('Value not found')
            return value[0]

    def set_config( self, group, key, value ):
        """sets a config value

        A value of None deletes the setting; otherwise the row is updated
        in place when it exists and inserted when it does not.
        """
        logging.info( 'set config %s.%s to "%s"', group, key, value )
        cursor = self.db.cursor()
        data = { 'group': group, 'key': key, 'value': value }
        if value == None:
            cursor.execute( 'delete from config where `group` = :group and `key` = :key', data )
        else:
            try:
                self.get_config( group, key )
                cursor.execute( 'update config set `value` = :value where `group` = :group and `key` = :key', data )
            except Exception:  # narrowed from a bare except: value missing -> insert
                cursor.execute( 'insert into config ( `group`, `key`, `value` ) values( :group, :key, :value )', data )
        cursor.close()
        self.db.commit()
class CrawlerBase: """ 爬虫基类,提供通用模块方法的实现,供子类复用 """ # region 初始化方法 # __metaclass__ = LogMetaclass def __init__(self, pinyin, config_dict, check_dict, callback): """ 初始化对象参数 :param pinyin: 省份拼音 :param config_dict: 模块配置字典 :param callback: 外部回调方法 """ # 持有核心业务无关的通用功能对象 self.holder = HolderUtil(pinyin) self.pinyin = pinyin # 爬虫调度委托 self.crawl_delegate = CrawlerControl(self) # 模块配置字典 self.config_dict = config_dict # web内容下载器 self.downloader = DownLoader(pinyin, self.holder.logging) # html内容检查 self.html_check_dict = check_dict['html_check_dict'] if check_dict and 'html_check_dict' in check_dict else None # json内容检查 self.json_check_dict = check_dict['json_check_dict'] if check_dict and 'json_check_dict' in check_dict else None # 初始化搜索列表页是否搜索到公司的判断配置 self.non_company_set = {"无查询结果","未查询到相关记录"} # 外部回调方法,每爬取完一个公司的信息会被调用一次 self.callback = callback # 解析器开关 self.parse_on = False # 除jbxx、gdxx之外的开关,False表示不做解析 self.parse_jbxx_on = True # jbxx开关 self.parse_gdxx_on = True # gdxx开关 # 抓取情况统计 self.statistic = CrawlerStatic(self.holder.logging) # 年报是否需要抓取判断 self.nb_judge = NbxxApiControler() pass def setNonCompanyConfig(self, non_company_set): ''' 设置无此公司判断 :param non_company_set: :return: ''' self.non_company_set = non_company_set def initConfig(self): ''' 初始化模块配置, 模块配置通常包括关键字和url两种方式 ''' if not self.config_dict: raise Exception("配置列表为空,请检查!") # 模块管理类 self.module_manager = ModuleManager() self.holder.logging.info("加载模块配置信息") for mode in self.config_dict: self.module_manager.switchToMode(mode) for init_function in self.config_dict[mode]: varnames = inspect.getargspec(init_function).args if len(varnames) == 2: init_function(self.module_manager) else: init_function() pass def init(self): ''' 爬行初始化,每次爬行会被初始化一次 :return: ''' self.holder.init() # value dictionary: used for passing value between functions self.value_dict = {'ua':self.holder.ua} # all company results self.result_list = list() # company dictionary: used for storing one company value between functions self.result_dict = dict() # all 
company pages self.page_list = list() # one company html page self.page_dict = dict() # snapshot the middle values for sub modules self.value_dict_snap = dict() # 每次爬行开始初始化downloader self.downloader.firstInit() # 每次爬行的种子report self.report = SeedAccessReport(0, 0, SeedAccessType.ERROR) # 每次爬行前需要恢复初始模块状态 self.initConfig() # endregion #region rowkey 计算 def setRowKey(self, map_dict=None): if not map_dict: map_dict = {'名称':'company_name','注册号':'company_zch','信用代码':'company_zch'} return self.defaultRowKey(map_dict) def defaultRowKey(self, map_dict=None): if 'company' not in self.result_dict: return False if not map_dict: return False rowkey_dict = dict() for v_list in self.result_dict['company']: if not v_list: continue for k,v in v_list.items(): for km in map_dict: if km in k: rowkey_dict[map_dict[km]] = v if rowkey_dict: break self.page_dict['rowkey_dict'] = rowkey_dict if not rowkey_dict: return False return True #endregion # region 爬行入口方法 crawl crawl_url def crawl(self, company_key): self.init() # 切换为通过关键字抓取模式 self.module_manager.switchToMode(CrawlerRunMode.COMPANY_KEY) self.holder.logging.info(u"通过关键词(%s)开始抓取信息" % company_key) self.value_dict['company_key'] = company_key.strip() self.value_dict['search_company'] = company_key.strip() return self._delegateCrawl(CrawlerRunMode.COMPANY_KEY) def crawlUrl(self, company_url, company_name): self.init() # 切换为通过Url抓取模式 self.module_manager.switchToMode(CrawlerRunMode.COMPANY_URL) self.holder.logging.info(u"通过公司名(%s)和url(%s)开始抓取信息" % (company_name, company_url)) self.value_dict['company_url'] = company_url self.value_dict['search_company'] = company_name return self._delegateCrawl(CrawlerRunMode.COMPANY_URL) def _delegateCrawl(self, model): v_dict = copy.deepcopy(self.value_dict) retry_times = 1 while retry_times <= 5: self.crawl_delegate.crawl() # 统计搜索列表中各公司爬取状态,生成seed抓取报告 self.seedReport() self.holder.logging.info(u"本次抓取结果类型:%s" % SeedAccessType.description(self.report.access_type)) if self.report.access_type == 
SeedAccessType.ERROR: self.holder.logging.info(u"抓取失败,开始第 %s 次重试!" % retry_times) else: break retry_times += 1 self.init() # 切换抓取模式 self.module_manager.switchToMode(model) self.value_dict = copy.deepcopy(v_dict) self.statistic.statistic(self.report, retry_times-1) self.statistic.description() return self.report # endregion # region 模块方法 def visitHomePage(self, module): """ 访问首页 :param module: :return: """ web = self.fetchWebContent(module, u"访问首页,期望输出参数 %s") if not web.body: self.holder.logging.warn(u"获取 (%s) 信息失败" % module.name) return self.parseOutput(module.outputs, web.body) def visitValidateCode(self, module): """ 访问验证码 :param module: :return: """ web = self.fetchWebContent(module, u"访问验证码,期望输出参数 %s", is_pic=True) yzm_type = None if self.holder.debug == 0: yzm_type = self.holder.pinyin if not web.body: self.holder.logging.error(u"获取的验证码图片为空!") return url = module.getInputByType(InputType.URL, self.value_dict, self.holder.logging) try: (yzm, code_id, is_report_error, recChar, img_path) = yzm_util.parse_yzm(url, web.body, 5000, yzm_max_len=6, type=yzm_type, holder=self.holder) except Exception as e: self.downloader.changeProxy() raise Exception(e) self.value_dict['yzm'] = yzm self.value_dict['img_path'] = img_path def visitSearchList(self, module): """ 访问搜索列表 :param module: :return: """ web = self.fetchWebContent(module, u"访问公司列表,期望输出参数 %s") # print("搜索结果列表页:%s" % company_list_html) if not web.body: self.holder.logging.warn(u"获取公司列表信息失败") return # 无此公司判断 for keyword in self.non_company_set: if keyword in web.body: self.holder.logging.warn(u"无此公司!") self.report.access_type = SeedAccessType.NON_COMPANY return if web.access_type == WebAccessType.TOO_OFTEN: self.holder.logging.warning(u"访问过于频繁,可能已被网站禁止访问!!!") self.downloader.insertBlack() return elif web.access_type == WebAccessType.ACCESS_VIOLATION: self.holder.logging.warning(u"非法访问!!!") return self.parseOutput(module.outputs, web.body) pass def visitTopInfo(self, module): """ 访问页面top信息 :param module: :return: """ 
web = self.fetchSpecificCompany(module, u"访问 (%s) 的Top信息,期望输出参数 %s") self.appendWebContent(u'top_html', web) if not web: return self.value_dict['html'] = web.body # 此处未做解析 self.parseOutput(module.outputs, web.body) pass def visitJbxx(self, module): """ 访问基本信息页面 :param module: :return: """ web = self.fetchSpecificCompany(module, u"访问 (%s) 的基本信息,期望输出参数 %s") self.appendWebContent(u'jbxx_html', web) if not web: return if self.parse_jbxx_on: self.parseHtmlTable(u"解析(%s)的基本信息") if 'company' in self.result_dict: self.value_dict['company'] = self.result_dict['company'] self.parseOutput(module.outputs, web.body) pass def visitGdxx(self, module): """ 访问股东信息 :param module: :return: """ web = self.fetchSpecificCompany(module, u"访问 (%s) 的股东信息,期望输出参数 %s") self.appendWebContent(u'gdxx_html', web) if not web: return self.value_dict['html'] = web.body gdxx_list = None if self.parse_gdxx_on: gdxx_list = self.parseHtmlTable(u"解析(%s)的股东信息") if not gdxx_list or len(gdxx_list) == 0: return all_gdxx_list = [] if 'gdxx_list' in self.value_dict: all_gdxx_list = self.value_dict['gdxx_list'] all_gdxx_list.extend(gdxx_list) self.value_dict['gdxx_list'] = all_gdxx_list def visitGdxq(self, module): """ 访问股东详情 :param module: :return: """ web = self.fetchSpecificCompany(module, u"访问 (%s) 的股东详细信息,期望输出参数 %s") self.appendWebContent(u'gdxq_html', web) if not web: return if self.parse_on: self.parseGdxq() def visitBgxx(self, module): """ 访问变更信息 :param module: :return: """ web = self.fetchSpecificCompany(module, u"访问 (%s) 的变更信息,期望输出参数 %s") self.appendWebContent(u'bgxx_html', web) if not web: return if self.parse_on: self.parseHtmlTable(u"解析(%s)的变更信息") def visitBaxx(self, module): """ 访问备案信息 :param module: :return: """ web = self.fetchSpecificCompany(module, u"访问 (%s) 的备案信息,期望输出参数 %s") self.appendWebContent(u'baxx_html', web) if not web: return if self.parse_on: self.parseHtmlTable(u"解析(%s)的备案信息") def visitFzjg(self, module): """ 访问分支机构 :param module: :return: """ web = self.fetchSpecificCompany(module, 
u"访问 (%s) 的分支机构信息,期望输出参数 %s") self.appendWebContent(u"fzjg_html", web) if not web: return if self.parse_on: self.parseHtmlTable(u"解析(%s)的分支机构信息") def visitXzcf(self, module): """ 访问行政处罚信息 :param module: :return: """ web = self.fetchSpecificCompany(module, u"访问 (%s) 的行政处罚信息,期望输出参数 %s") self.appendWebContent(u"xzcf_html", web) if not web: return if self.parse_on: self.parseHtmlTable(u"解析(%s)的行政处罚信息") def visitQynbList(self, module): """ 访问行政处罚信息 :param module: :return: """ web = self.fetchSpecificCompany(module, u"访问 (%s) 的企业年报列表,期望输出参数 %s") if not web: return self.parseOutput(module.outputs, web.body) def visitQynb(self, module): """ 访问行政处罚信息 :param module: :return: """ web = self.fetchSpecificCompany(module, u"访问 (%s) 的企业年报信息,期望输出参数 %s") key = u"qynb_%s_html" % self.value_dict['nb_name'] self.appendWebContent(key, web) if not web: return self.parseOutput(module.outputs, web.body) if self.parse_on: self.parseHtmlTable(u"解析(%s)的企业年报信息") def resultCollect(self, module): """ 抓取结果收集,调用ParserMapper实现映射 :param module: :return: """ if 'company' in self.result_dict and self.parse_on: result_list = self.result_dict['company'] company_mapped = ParserMapper.doMap(mapper.transform, result_list) self.result_dict['company_mapped'] = company_mapped self.resultDelivery(module) pass def resultDelivery(self, module): """ 1.清理中间结果集 2.标识页面内容抓取状态类型 3.调用callback交付结果 :param module: :return: """ if 'company_mapped' in self.result_dict: company_mapped = self.result_dict['company_mapped'] else: company_mapped = None self.cleanWebContents() html_dict_wrapped = self.wrapReturnObject() self.page_list.append(self.page_dict) self.page_dict = dict() self.result_list.append(self.result_dict) self.result_dict = dict() self.callback(html_dict_wrapped, company_mapped) pass def visitTopInfoJson(self, module): """ 访问页面顶部json结果 :param module: :return: """ web = self.fetchJson(module, u"访问 (%s) 的Top信息,期望输出参数 %s") self.appendWebContent(u'top_json', web) def visitJbxxJson(self, module): """ 访问基本信息json结果 
:param module: :return: """ web = self.fetchJson(module, u"访问 (%s) 的基本信息,期望输出参数 %s") self.appendWebContent(u'jbxx_json', web) if self.parse_jbxx_on: self.parseJson(module) def visitGdxxJson(self, module): """ 访问股东信息json结果 :param module: :return: """ if module.web_content: if module.web_content in self.value_dict: body = self.value_dict[module.web_content] else: body = module.web_content web = WebContent(status_code=200, body=body, content_type=WebContentType.JSON) else: web = self.fetchJson(module, u"访问 (%s) 的股东信息,期望输出参数 %s") self.appendWebContent(u'gdxx_json', web) # 当json内容是由上级模块解析生成,但body为None,说明未解析出需要的输出但不代表是异常状态,例如:长白山森工集团安图林业有限公司安林物流中心分公司 if module.web_content and body is None: return if self.parse_gdxx_on: gdxx_list = self.parseJson(module, web.body) self.value_dict['gdxx_list'] = gdxx_list def visitGdxqJson(self, module): """ 访问股东详情信息json结果 :param module: :return: """ web = self.fetchJson(module, u"访问 (%s) 的股东详情信息,期望输出参数 %s") self.appendWebContent(u'gdxq_json', web) def visitBgxxJson(self, module): """ 访问变更信息json结果 :param module: :return: """ if module.web_content: if module.web_content in self.value_dict: body = self.value_dict[module.web_content] else: body = module.web_content web = WebContent(status_code=200, body=body, content_type=WebContentType.JSON) else: web = self.fetchJson(module, u"访问 (%s) 的变更信息,期望输出参数 %s") self.appendWebContent(u'bgxx_json', web) if self.parse_on: self.parseJson(module, web.body) def visitBaxxJson(self, module): """ 访问备案信息json结果 :param module: :return: """ web = self.fetchJson(module, u"访问 (%s) 的备案信息,期望输出参数 %s") self.appendWebContent(u'baxx_json', web) if self.parse_on: self.parseJson(module) def visitFzjgJson(self, module): """ 访问分支机构json结果 :param module: :return: """ web = self.fetchJson(module, u"访问 (%s) 的分支机构信息,期望输出参数 %s") self.appendWebContent(u'fzjg_json', web) if self.parse_on: self.parseJson(module) def visitXzcfJson(self, module): """ 访问行政处罚json结果 :param module: :return: """ web = self.fetchJson(module, u"访问 (%s) 
的行政处罚信息,期望输出参数 %s")
        self.appendWebContent(u'xzcf_json', web)
        if self.parse_on:
            self.parseJson(module)

    def visitQynbJson(self, module):
        """Fetch the annual-report (qynb) JSON for the current company.

        :param module: crawl module describing the request and expected outputs
        :return: None
        """
        web = self.fetchJson(module, u"访问 (%s) 的企业年报信息,期望输出参数 %s")
        # One stored page per report year; key by the year name from value_dict.
        key = u"qynb_%s_json" % self.value_dict['nb_name']
        self.appendWebContent(key, web)
        if self.parse_on:
            self.parseJson(module)

    def getWebHtml(self, module):
        """Generic helper: fetch an HTML page for *module* and parse its outputs.

        Side effects: stores the response body/object in ``value_dict['html']``
        and ``value_dict['web']``; on success parses ``module.outputs`` into
        ``value_dict`` via :meth:`parseOutput`.

        :param module: crawl module describing the request and expected outputs
        :return: None (also returns None early when the module yields no URL)
        """
        self.value_dict['html'] = None
        self.value_dict['web'] = None
        url, headers, method, post_data = module.getHttpInput(self.value_dict, self.holder.logging)
        if not url:
            self.holder.logging.warn(u"缺少url参数")
            return None
        encoding = module.getInputByType(InputType.ENCODING, self.value_dict, self.holder.logging)
        accept_code = module.getInputByType(InputType.STATUS_CODE, self.value_dict, self.holder.logging)
        self.holder.logging.info(u"访问%s,获取输出参数 %s" % (url, module.outputsDescription()))
        self.setCookie(module)
        web = crawler_util.request(downloader=self.downloader, url=url, method=method,
                                   headers=headers, data=post_data, encoding=encoding,
                                   ua=self.holder.ua, use_proxy=module.use_proxy,
                                   holder=self.holder, accept_code=accept_code)
        # Per-module sleep (throttling between requests).
        crawler_util.moduleSleep(module, self.holder)
        self.htmlContentCheck(web)
        module.detectWebContent(web=web, log=self.holder.logging)
        self.value_dict['html'] = web.body if web else None
        self.value_dict['web'] = web
        if web and web.body:
            self.parseOutput(module.outputs, web.body)
        # NOTE(review): htmlContentCheck already ran above; this second call looks
        # redundant — confirm before removing.
        self.htmlContentCheck(web)

    def getJson(self, module):
        """Generic helper: fetch a JSON page for *module*.

        Side effects: stores the decoded object in ``value_dict['json']`` and the
        response in ``value_dict['web']``.

        :param module: crawl module describing the request and expected outputs
        :return: the WebContent response, or None when the module yields no URL
        """
        self.value_dict['json'] = None
        self.value_dict['web'] = None
        search_company = self.value_dict.get('search_company', '')
        self.holder.logging.info(u"访问json信息[company_key=%s],获取输出参数 %s" % (search_company, module.outputsDescription()))
        url, headers, method, post_data = module.getHttpInput(self.value_dict, self.holder.logging)
        if not url:
            self.holder.logging.warn(u"缺少url参数")
            return None
        encoding = module.getInputByType(InputType.ENCODING, self.value_dict, self.holder.logging)
        accept_code = module.getInputByType(InputType.STATUS_CODE, self.value_dict, self.holder.logging)
        self.setCookie(module)
        web = crawler_util.request(downloader=self.downloader, url=url, method=method,
                                   headers=headers, data=post_data, encoding=encoding,
                                   ua=self.holder.ua, use_proxy=module.use_proxy,
                                   holder=self.holder, accept_code=accept_code)
        # Per-module sleep (throttling between requests).
        crawler_util.moduleSleep(module, self.holder)
        web.content_type = WebContentType.JSON
        self.jsonContentCheck(web)
        module.detectWebContent(web=web, log=self.holder.logging)
        body = web.body if web.body else ''
        self.holder.logging.info(u"本次json抓取结果:\n" + body)
        if body:
            json_data = json.loads(web.body)
            self.value_dict['json'] = json_data
        self.value_dict['web'] = web
        return web

    # endregion

    # region fetching page content
    def fetchWebContent(self, module, prompt_info, is_pic=False):
        """Fetch a page that precedes the search-result list (or an image).

        :param module: crawl module describing the request and expected outputs
        :param prompt_info: log-message template (one ``%s`` for the outputs)
        :param is_pic: True when the target is an image (e.g. a captcha)
        :return: the WebContent response, or None when no URL is available
        """
        self.value_dict["html"] = None
        self.value_dict['web'] = None
        self.holder.logging.info(prompt_info % module.outputsDescription())
        url, headers, method, post_data = module.getHttpInput(self.value_dict, self.holder.logging)
        if not url:
            self.holder.logging.warn(u"缺少url参数")
            return None
        elif url == OutputType.NONE_TYPE:
            # A sentinel "no URL in this situation" value — not an error.
            return None
        encoding = module.getInputByType(InputType.ENCODING, self.value_dict, self.holder.logging)
        accept_code = module.getInputByType(InputType.STATUS_CODE, self.value_dict, self.holder.logging)
        self.setCookie(module)
        web = crawler_util.request(downloader=self.downloader, url=url, method=method,
                                   headers=headers, data=post_data, encoding=encoding,
                                   ua=self.holder.ua, is_pic=is_pic, use_proxy=module.use_proxy,
                                   holder=self.holder, accept_code=accept_code)
        # Per-module sleep (throttling between requests).
        crawler_util.moduleSleep(module, self.holder)
        self.htmlContentCheck(web)
        # Pass the first module's id so content detection can request a full redo.
        redo_module = self.module_manager.getFirstModule()
        module.detectWebContent(web=web, redo_module=redo_module.module_id, log=self.holder.logging)
        self.value_dict['html'] = web.body if web else None
        self.value_dict['web'] = web
        return web

    def fetchSpecificCompany(self, module, prompt_info):
        """Fetch the detail page of a specific company.

        :param module: crawl module describing the request and expected outputs
        :param prompt_info: log-message template (``%s`` for company and outputs)
        :return: the WebContent response, or None when no URL is available
        """
        self.value_dict["html"] = None
        self.value_dict['web'] = None
        search_company = self.value_dict.get('search_company', '')
        self.holder.logging.info(prompt_info % (search_company, module.outputsDescription()))
        url, headers, method, post_data = module.getHttpInput(self.value_dict, self.holder.logging)
        if not url:
            self.holder.logging.warn(u"缺少url参数")
            return None
        # The input URL may legitimately be empty in some cases
        # (e.g. Guangdong-Shenzhen credit site).
        elif url == OutputType.NONE_TYPE:
            return None
        encoding = module.getInputByType(InputType.ENCODING, self.value_dict, self.holder.logging)
        accept_code = module.getInputByType(InputType.STATUS_CODE, self.value_dict, self.holder.logging)
        self.setCookie(module)
        web = crawler_util.request(downloader=self.downloader, url=url, method=method,
                                   headers=headers, data=post_data, encoding=encoding,
                                   ua=self.holder.ua, use_proxy=module.use_proxy,
                                   holder=self.holder, accept_code=accept_code)
        # Per-module sleep (throttling between requests).
        crawler_util.moduleSleep(module, self.holder)
        self.htmlContentCheck(web)
        module.detectWebContent(web=web, log=self.holder.logging)
        self.value_dict['html'] = web.body if web else None
        self.value_dict['web'] = web
        return web

    def fetchJson(self, module, prompt_info):
        """Fetch a JSON page, logging *prompt_info* first.

        :param module: crawl module describing the request and expected outputs
        :param prompt_info: log-message template (``%s`` for company and outputs)
        :return: the WebContent response from :meth:`getJson`
        """
        self.value_dict['json'] = None
        search_company = self.value_dict.get('search_company', '')
        self.holder.logging.info(prompt_info % (search_company, module.outputsDescription()))
        web = self.getJson(module)
        return web

    def setCookie(self, module):
        """Push the module's cookie input (if any) into the downloader."""
        cookie = module.getInputByType(InputType.COOKIE, self.value_dict, self.holder.logging)
        if cookie:
            self.downloader.cookieUpdate(cookie)
    # endregion

    # region parsing module outputs, html pages, json pages and shareholder details
    def parseOutput(self, outputs, html):
        """Extract the module's declared outputs from *html* into ``value_dict``.

        Each output is resolved by XPath when available, else by regex; LIST-typed
        outputs keep all matches, others are joined into one string.

        :param outputs: outputs required by the module
        :param html: page content
        :return: None
        """
        if not html or not outputs:
            return
        tree = etree.HTML(html)
        for output in outputs:
            if tree and output.xpath:
                if output.type == OutputType.LIST:
                    result = tree.xpath(output.xpath)
                else:
                    result = "".join(tree.xpath(output.xpath))
            elif output.regex:
                if output.type == OutputType.LIST:
                    result = re.findall(output.regex, html)
                else:
                    result = "".join(re.findall(output.regex, html))
            else:
                continue
            # Automatically merge same-named list intermediate results.
            if output.name in self.value_dict and isinstance(self.value_dict[output.name], list) and isinstance(result, list):
                self.value_dict[output.name].extend(result)
            else:
                self.value_dict[output.name] = result

    def parseHtmlTable(self, prompt_info, should_collect_result=True):
        """Parse HTML-table data from ``value_dict['html']`` into key/value records.

        :param prompt_info: log-message template (one ``%s`` for the company)
        :param should_collect_result: whether to append this parse's records to
            ``result_dict['company']``
        :return: list of parsed records
        :raises Exception: when no HTML page has been fetched yet
        """
        search_company = self.value_dict.get('search_company', '')
        self.holder.logging.info(prompt_info % search_company)
        if 'company' not in self.result_dict:
            self.result_dict['company'] = list()
        if 'html' not in self.value_dict or not self.value_dict['html']:
            raise Exception(u"未获取到html页面")
        html = self.value_dict['html']
        parser = TableParseUtil(html)
        info_list = parser.parse()
        self.holder.logging.info(u"本次模块解析结果:\n %s", json.dumps(info_list))
        # When fetching shareholder details the records must not be appended
        # here — the caller updates an existing record instead.
        if should_collect_result:
            self.result_dict['company'].extend(info_list)
        return info_list

    def parseJson(self, module, json_obj=None):
        """Parse JSON page content through the module's mapper config.

        :param module: crawl module providing ``mapper_config``
        :param json_obj: explicit JSON object/string; ignored when
            ``value_dict['json']`` is present
        :return: list of parsed records, or None when there is nothing to parse
        """
        if 'json' in self.value_dict:
            json_obj = self.value_dict['json']
        # This check must not be simplified: an empty list must be
        # distinguished from None.
        elif json_obj is None:
            # raise Exception("未获取到json页面")
            # Some sites genuinely lack certain sections (e.g. a branch company
            # with no filing/branch info displayed at all), so just log.
            self.holder.logging.error(u"未获取到json页面!!!")
            return None
        if isinstance(json_obj, basestring):
            json_obj = json.loads(json_obj)
        if not json_obj:
            if isinstance(json_obj, list):
                self.holder.logging.warn(u"成功得到了json页面内容,但json体为空!")
            else:
                self.holder.logging.error(u"未获取到json页面!!!")
            return None
        parser = JsonParseUtil()
        info_list = parser.parse(json_obj, module.mapper_config)
        if not info_list:
            return None
        if 'company' not in self.result_dict:
            self.result_dict['company'] = list()
        self.result_dict['company'].extend(info_list)
        self.holder.logging.info(u"本次模块解析结果:\n %s", json.dumps(info_list))
        return info_list

    def parseGdxq(self):
        """Parse shareholder-detail (gdxq) content and merge it into the
        matching ``gdxx_rcd`` record in ``value_dict``.

        :return: None
        """
        gdxq_list = self.parseHtmlTable(u"解析(%s)的股东详情信息", False)
        if not gdxq_list or len(gdxq_list) == 0:
            self.holder.logging.info(u"未获取到股东详情信息")
            return
        if 'gdxx_rcd' not in self.value_dict:
            return
        gdxx_rcd = self.value_dict['gdxx_rcd']
        if not gdxx_rcd or not isinstance(gdxx_rcd, dict):
            return
        # First pass: if some value already holds a dict literal, replace it.
        # NOTE(review): eval() on scraped page data is dangerous — consider
        # ast.literal_eval instead.
        for key in gdxx_rcd:
            try:
                if isinstance(eval(gdxx_rcd[key]), dict):
                    gdxx_rcd[key] = gdxq_list[0]
                    return
            except Exception as e:
                self.holder.logging.warn(e.message)
        # Second pass: derive a "<prefix>.详情" key from the first dotted key.
        key = ''
        for rcd_key in gdxx_rcd.keys():
            if '.' not in rcd_key:
                continue
            keys = rcd_key.split('.')
            key = ''
            if len(keys) >= 2:
                for i in range(0, len(keys)-1):
                    key += keys[i]+'.'
            if key:
                break
        key += u'详情'
        gdxx_rcd[key] = gdxq_list[0]
    # endregion

    # region saving and restoring intermediate-result state
    def snapshot(self, snap_id):
        """Store a deep copy of the current intermediate state.

        :param snap_id: id under which to save the state
        :return: None
        """
        self.value_dict_snap[snap_id] = copy.deepcopy(self.value_dict)

    def recoverFromSnapshot(self, snap_id):
        """Restore a previously saved intermediate state.

        NOTE(review): the stored dict is assigned by reference (no copy), so
        mutating ``value_dict`` afterwards also mutates the snapshot — a second
        restore would then see the mutated state; confirm this is intended.

        :param snap_id: id of the state to restore
        :return: None
        """
        if not snap_id or snap_id not in self.value_dict_snap:
            self.holder.logging.warning("snap id %s not exist!!" % snap_id)
            return
        self.value_dict = self.value_dict_snap[snap_id]
    # endregion

    # region web page content checks
    def htmlContentCheck(self, web):
        """Classify an HTML response via the HTML keyword table.

        :param web: WebContent response
        :return: None
        """
        self.WebKeywordCheck(web, self.html_check_dict)

    def jsonContentCheck(self, web):
        """Classify a JSON response via the JSON keyword table.

        :param web: WebContent response
        :return: None
        """
        self.WebKeywordCheck(web, self.json_check_dict)

    def WebKeywordCheck(self, web, check_dict):
        """Set ``web.access_type`` from keyword matches in the body.

        :param web: WebContent response
        :param check_dict: keyword -> WebAccessType mapping
        :return: the web object on a keyword match, else None (implicit)
        """
        if web.access_type != WebAccessType.OK:
            return
        if not web.body:
            web.access_type = WebAccessType.NO_CONTENT
            return
        # NOTE(review): after the guards above access_type == OK, so this branch
        # appears unreachable unless WebAccessType.OK is falsy — confirm.
        if not web.access_type:
            web.access_type = WebAccessType.OK
        if not check_dict:
            return
        for key in check_dict:
            if key in web.body:
                # A later matching config entry would override an earlier one;
                # the first match returns immediately.
                web.access_type = check_dict[key]
                self.holder.logging.info(u"页面因包含 '%s' 被识别为类型 %s" % (key,WebAccessType.description(check_dict[key])))
                return web
    # endregion

    # region WebContent handling
    def appendWebContent(self, name, web):
        """Append a page to the page result set under *name*.

        :param name: result-set key
        :param web: WebContent response
        :return: None
        """
        if name not in self.page_dict:
            self.page_dict[name] = list()
        self.page_dict[name].append(web)

    def cleanWebContents(self):
        """1. Remove None entries and duplicate/failed retries from the page results.
        2. Derive the per-company crawl status for this company.

        :return: None
        """
        if not self.page_dict:
            self.page_dict['status'] = CompanyAccessType.ERROR
            return
        success_num = 0
        failed_num = 0
        for key in self.page_dict:
            values = self.page_dict[key]
            if not isinstance(values, list):
                continue
            if not values:
                failed_num += 1
                continue
            req_md5_set = set()
            # Walk backwards so deletions do not shift unvisited indices.
            i = len(values)-1
            while i >= 0:
                val = values[i]
                if not val:
                    del values[i]
                elif val.req_md5 in req_md5_set:
                    # Duplicate request (same req_md5) — keep only one.
                    del values[i]
                else:
                    req_md5_set.add(val.req_md5)
                    # Shareholder-detail pages are exempt from the status check
                    # for now.
                    if val.status_code >= 400 and key != u'gdxq_html':
                        failed_num += 1
                    else:
                        success_num += 1
                i -= 1
        # Filter out empty values.
        self.page_dict = dict(filter(lambda item: item[1], self.page_dict.items()))
        if success_num > 0 and failed_num == 0:
            self.page_dict['status'] = CompanyAccessType.OK
        elif success_num > 0:
            self.page_dict['status'] = CompanyAccessType.INCOMPLETE
        else:
            self.page_dict['status'] = CompanyAccessType.ERROR

    def wrapReturnObject(self):
        """Wrap the web-content result set for the external callback.

        :return: deep-copied page dict with WebContent objects dict-ified
        """
        # Once every subclass calls existQynbList, this rowkey fallback can be
        # removed.
        if 'rowkey_dict' not in self.page_dict:
            success = self.setRowKey()
            if not success:
                self.holder.logging.error(u"提取rowkey参数出错!")
        html_dict_copy = copy.deepcopy(self.page_dict)
        for hk, hv in html_dict_copy.items():
            if isinstance(hv, list):
                v_list = filter(lambda x: isinstance(x, WebContent), hv)
                v_dict_list = map(lambda x: x.toDictionary(), v_list)
                html_dict_copy[hk] = v_dict_list
        return html_dict_copy
    # endregion

    def bypassQynb(self):
        """Decide whether annual reports need crawling and which years are done.

        :return: True when annual-report pages can be skipped entirely
        """
        # Annual-report crawling depends on the rowkey, so set it here.
        success = self.setRowKey()
        if not success:
            self.holder.logging.error(u"提取rowkey参数出错!")
            should_visit, has_years = True, set()
        else:
            should_visit, has_years = self.nb_judge.visitJudgement(company_name=self.page_dict['rowkey_dict']['company_name'],
                                                                   company_zch=self.page_dict['rowkey_dict']['company_zch'])
        self.value_dict['qynb_should_visit'] = should_visit
        self.value_dict['qynb_has_years'] = has_years
        return not should_visit

    def filterQynbList(self, nb_list):
        """Remove years that do not need crawling from the annual-report list.

        :param nb_list: annual-report list; each item must be a year-bearing
            element (otherwise the subclass must supply its own filtering)
        :return: None (mutates *nb_list* in place)
        """
        should_visit = self.value_dict.get('qynb_should_visit','')
        has_years = self.value_dict.get('qynb_has_years',set())
        if not should_visit:
            del nb_list[:]
            return
        # Iterate a copy so removals from nb_list are safe.
        temp_list = list(nb_list)
        for nb in temp_list:
            # NOTE(review): pattern should be a raw string (r'\d{4}').
            arr = re.findall('\d{4}', ''.join(nb.xpath('text()')))
            if not arr:
                nb_list.remove(nb)
                continue
            if arr[0] in has_years:
                nb_list.remove(nb)

    def yzmSave(self, yzm, img_path):
        """Persist a solved captcha; subclasses call this as needed.

        :param yzm: captcha solution
        :param img_path: path of the captcha image
        :return: None
        """
        record_success(self.pinyin, yzm, img_path, self.holder)
        pass

    def getMonitorMiddleValues(self, module):
        """Collect the intermediate values this module monitors.

        :param module: crawl module with a ``monitor_values`` key list
        :return: dict of monitored name -> current value, or None
        """
        if not module.monitor_values:
            return None
        mm_dict = dict()
        for key in module.monitor_values:
            mm_dict[key] = self.value_dict.get(key, None)
        return mm_dict

    def seedReport(self):
        """Build the crawl-status report for the current seed.

        :return: None (mutates ``self.report``)
        """
        try:
            if self.report.access_type == SeedAccessType.NON_COMPANY or self.report.access_type == SeedAccessType.NO_VALID_COMPANY:
                return
            for page_dict in self.page_list:
                if not page_dict or 'status' not in page_dict:
                    self.report.failed_num += 1
                elif page_dict['status'] == CompanyAccessType.OK:
                    self.report.success_num += 1
                else:
                    self.report.failed_num += 1
            if self.report.success_num > 0 and self.report.failed_num == 0:
                self.report.access_type = SeedAccessType.OK
            elif self.report.success_num > 0:
                self.report.access_type = SeedAccessType.INCOMPLETE
            elif self.report.access_type == SeedAccessType.NO_TARGET_SOURCE:
                # Keep the pre-set "no target source" status untouched.
                return
            else:
                self.report.access_type = SeedAccessType.ERROR
        except Exception as e:
            self.holder.logging.error(e.message)
class Lockenwickler(QtGui.QMainWindow):
    """Main window of the Lockenwickler module editor: builds the Qt UI,
    boots Ogre, and wires the module/terrain/property tooling together."""

    def __init__(self, parent=None):
        QtGui.QWidget.__init__(self, parent)
        # pixmap = QPixmap("media/icons/lockenwickler_provisorium.png")
        # splash = QSplashScreen(pixmap, Qt.WindowStaysOnTopHint)
        # splash.setMask(pixmap.mask())
        # splash.showMessage("Starting...")
        # splash.show()
        self.setupUi()
        self.consoleWindow = ConsoleWindow(False, self)
        # Must run after setupUi (uses gridlayout) and before the windows below
        # (they use ogreRoot / the scene manager it creates).
        self.setupOgre()
        self.prefDialog = PreferencesDialog(self)
        self.objectPropertyWin = ObjectPropertyWin(self.OgreMainWinSceneMgr, self.gocManager, self)
        self.moduleExplorerWin = ModuleExplorer(self)
        self.modelSelectionDialog = ModelSelectionDialog(self.ogreRoot, self)
        self.materialSelectionDialog = MaterialSelectionDialog(self.ogreRoot, self)
        self.moduleManager.modelSelectionDialog = self.modelSelectionDialog
        self.moduleManager.materialSelectionDialog = self.materialSelectionDialog
        self.moduleDirectoryViewWin = ModuleDirectoryView(self)
        # NOTE(review): local is never used or stored — dead unless the
        # constructor has side effects.
        triggerManager = TriggerManager()
        self.gameObjectClassView = GameObjectClassView(self.moduleManager.gocManager)
        self.createDockWindows()
        # Drive Ogre rendering from a 5 ms Qt timer.
        self.mainTimer = QtCore.QTimer(self)
        self.mainTimer.connect(self.mainTimer, QtCore.SIGNAL("timeout()"), self.update)
        self.mainTimer.start(5)
        settings = QtCore.QSettings()
        self.restoreGeometry(settings.value("MainWindow/Geometry").toByteArray())
        # NOTE(review): closeEvent saves under "MainWIndow/DockWindows" (typo),
        # so this restore never finds saved state.
        self.restoreState(settings.value("MainWindow/DockWindows").toByteArray())
        if not self.prefDialog.setCfgPath(settings.value("Preferences/moduleCfgPath").toString()):
            # No valid stored path: ask the user.
            self.prefDialog.show()
            self.moduleManager.moduleCfgPath = self.prefDialog.moduleCfgPath
        else:
            self.moduleManager.moduleCfgPath = self.prefDialog.moduleCfgPath
        self.prefDialog.setExternalEditorPath(str(settings.value("Preferences/externalEditorPath").toString()))
        if self.prefDialog.moduleCfgPath is not None:
            self.moduleDirectoryViewWin.modulesPath = self.prefDialog.moduleCfgPath.replace("modules.cfg", "")
        self.moduleManager.setModuleExplorer(self.moduleExplorerWin)
        self.moduleManager.setModuleDirView(self.moduleDirectoryViewWin)
        self.moduleManager.setPropertyWindow(self.objectPropertyWin)
        self.moduleManager.setContextMenuCallback(self.onContextMenuCallback)
        self.setWindowIcon(QIcon("media/icons/lockenwickler_provisorium_small.png"))
        self.setWindowTitle("Rastullahs Lockenwickler")
        # Deferred Ogre resource init happens in finishEditorSetup().
        self.editorSetupFinished = False
        # splash.finish(self)

    def createAction(self, text, slot=None, shortcut=None, icon=None, tip=None, checkable=False, signal="triggered()"):
        """Factory for QActions: wires icon, shortcut, tooltip/statustip, slot
        connection and checkability in one call; returns the action."""
        action = QtGui.QAction(text, self)
        if icon is not None:
            action.setIcon(QtGui.QIcon("media/icons/%s" % icon))
        if shortcut is not None:
            action.setShortcut(shortcut)
        if tip is not None:
            action.setToolTip(tip)
            action.setStatusTip(tip)
        if slot is not None:
            self.connect(action, QtCore.SIGNAL(signal), slot)
        action.setCheckable(checkable)
        return action

    def addActions(self, target, actions):
        """Add each action to *target*; a None entry inserts a separator."""
        for act in actions:
            if act is None:
                target.addSeparator()
            else:
                target.addAction(act)

    def setupUi(self):
        """Build central widget, layouts, menus, status bar and all actions."""
        self.setObjectName("MainWindow")
        self.centralwidget = QtGui.QWidget(self)
        self.centralwidget.setObjectName("centralwidget")
        self.hboxlayout = QtGui.QHBoxLayout(self.centralwidget)
        self.hboxlayout.setContentsMargins(0, 0, 0, 0)
        self.hboxlayout.setObjectName("hboxlayout")
        self.gridlayout = QtGui.QGridLayout()
        self.gridlayout.setObjectName("gridlayout")
        self.gridlayout.setContentsMargins(0, 0, 0, 0)
        self.menubar = QtGui.QMenuBar(self)
        self.menubar.setObjectName("menubar")
        self.menuFile = QtGui.QMenu(self.menubar)
        self.menuFile.setObjectName("menuFile")
        self.menuEdit = QtGui.QMenu(self.menubar)
        self.menuEdit.setObjectName("menuEdit")
        self.menuView = QtGui.QMenu(self.menubar)
        self.menuView.setObjectName("menuView")
        self.setMenuBar(self.menubar)
        self.statusbar = QtGui.QStatusBar(self)
        self.statusbar.setObjectName("statusbar")
        self.setStatusBar(self.statusbar)
        ##################################### file actions
        self.actionNeu = self.createAction("&New Module", self.actionNewSlot, QKeySequence.New, "filenew.png", "New Module")
        self.actionNeu.setObjectName("actionNeu")
        self.actionOpen = self.createAction("&Open Module", self.actionOpenSlot, QKeySequence.Open, "fileopen.png", "Open Module")
        self.actionOpen.setObjectName("actionOpen")
        self.actionSave = self.createAction("&Save", self.actionSaveSlot, QKeySequence.Save, "filesave.png", "Save Module")
        self.actionSave.setObjectName("actionSave")
        self.actionRunModule = self.createAction("&Save and Run", self.actionRunModuleSlot, "Alt+R", "fileexport.png", "Save And Run Module")
        self.actionRunModule.setObjectName("actionRunModule")
        self.actionClose = self.createAction("Quit", self.actionQuitSlot, "Alt+Q", "exit.png", "Quit")
        self.actionClose.setObjectName("actionQuit")
        #####################################
        ##################################### edit actions
        self.actionDelete = self.createAction("Delete", self.actionDeleteSlot, QKeySequence.Delete, "editdelete.png", "Delete")
        self.actionDelete.setObjectName("actionDelete")
        self.actionCopy = self.createAction("Copy", self.actionCopySlot, QKeySequence.Copy, "editcopy.png", "Copy")
        self.actionCopy.setObjectName("actionCopy")
        self.actionCut = self.createAction("Cut", self.actionCutSlot, QKeySequence.Cut, "editcut.png", "Cut")
        self.actionCut.setObjectName("actionCut")
        self.actionPaste = self.createAction("Paste", self.actionPasteSlot, QKeySequence.Paste, "editpaste.png", "Paste")
        self.actionPaste.setObjectName("actionPaste")
        self.actionSelect = self.createAction("&Select", self.actionSelectSlot, "Space", "cursor.png", "Move selected object")
        self.actionSelect.setObjectName("actionSelect")
        self.actionMove = self.createAction("&Move", self.actionMoveSlot, "g", "move.png", "Move selected object")
        self.actionMove.setObjectName("actionMove")
        self.actionRotate = self.createAction("&Rotate", self.actionRotateSlot, "r", "rotate.png", "Rotate selected object")
        self.actionRotate.setObjectName("actionRotate")
        self.actionScale = self.createAction("&Scale", self.actionScaleSlot, "x", "resizecol.png", "Scale selected object")
        self.actionScale.setObjectName("actionScale")
        self.actionOneClickEntityPlacement = self.createAction("&OneClickEntityPlacement", self.actionOneClickEntityPlacementSlot, "", "resizecol.png", "Add an Entity just by a click")
        self.actionOneClickEntityPlacement.setObjectName("actionOneClickEntityPlacement")
        self.actionOneClickEntityPlacement.setCheckable(True)
        #####################################
        ##################################### view actions
        self.actionSceneExplorer = self.createAction("&Scene Exlporer", self.toggleModuleExplorer, "Alt+E", "view_tree.png", "Module Explorer", False)
        self.actionSceneExplorer.setObjectName("actionSceneExplorer")
        self.actionModuleDirView = self.createAction("&Directory Explorer", self.toggleModuleDirView, "Alt+D", "view_tree.png", "Module Directory Explorer", False)
        self.actionModuleDirView.setObjectName("actionDirectoryExplorer")
        self.actionPreferences = self.createAction("&Preferences", self.togglePreferencesWindow, None, "configure.png", "Lockenwickler Preferences", False)
        self.actionPreferences.setObjectName("actionPreferences")
        self.actionProperty_Window = self.createAction("Pr&operty Window", self.togglePropertyWindow, "Alt+P", "unsortedlist1.png", "Property Window")
        self.actionProperty_Window.setObjectName("actionProperty_Window")
        self.actionObject_Selection = self.createAction("&Model Preview Window", self.toggleModelPreviewWindow, "Alt+O", "tux.png", "Model Preview")
        self.actionObject_Selection.setObjectName("actionObject_Selection")
        self.actionMaterial_Selection = self.createAction("Material &Preview Window", self.toggleMaterialPreviewWindow, "Alt+M", "colors.png", "Material Preview")
        self.actionMaterial_Selection.setObjectName("actionMaterial_Selection")
        self.actionGameObjectClass_Selection = self.createAction("&Game Object Class Preview Window", self.toggleGameObjectViewWindow, "Ctrl+G", "multirow.png", "GameObjectClass Preview")
        # NOTE(review): duplicate objectName — already used by
        # actionObject_Selection above; probably meant
        # "actionGameObjectClass_Selection".
        self.actionGameObjectClass_Selection.setObjectName("actionObject_Selection")
        self.actionConsole_Window = self.createAction("&Console Window", self.toggleConsoleWindow, "Alt+C", "console.png", "Console Window")
        self.actionConsole_Window.setObjectName("actionConsole_Window")
        # NOTE(review): tooltip says "Console Window" — copy/paste slip?
        self.actionTerrainTools_Window = self.createAction("&Terrain Tools", self.toggleTerrainToolsWindow, "Alt+T", "terrain_small.png", "Console Window")
        self.actionTerrainTools_Window.setObjectName("actionTerrainTools_Window")
        self.actionToggleViewportGrid = self.createAction("&Toggle Grid", self.toggleViewportGrid, "Alt+G", "console.png", "Toggle Viewport Grid")
        self.actionToggleViewportGrid.setObjectName("actionToggleViewportGrid")
        #####################################
        ##################################### menu assembly
        self.menuFile.addAction(self.actionNeu)
        self.menuFile.addAction(self.actionOpen)
        self.menuFile.addAction(self.actionSave)
        self.menuFile.addAction(self.actionRunModule)
        self.menuFile.addAction(self.actionClose)
        self.menuEdit.addAction(self.actionSelect)
        self.menuEdit.addAction(self.actionMove)
        self.menuEdit.addAction(self.actionRotate)
        self.menuEdit.addAction(self.actionScale)
        self.menuEdit.addSeparator()
        self.menuEdit.addAction(self.actionDelete)
        self.menuEdit.addAction(self.actionCopy)
        self.menuEdit.addAction(self.actionCut)
        self.menuEdit.addAction(self.actionPaste)
        self.menuEdit.addSeparator()
        self.menuEdit.addAction(self.actionOneClickEntityPlacement)
        self.menuView.addAction(self.actionSceneExplorer)
        self.menuView.addAction(self.actionModuleDirView)
        self.menuView.addAction(self.actionPreferences)
        self.menuView.addAction(self.actionProperty_Window)
        self.menuView.addAction(self.actionObject_Selection)
        self.menuView.addAction(self.actionMaterial_Selection)
        self.menuView.addAction(self.actionGameObjectClass_Selection)
        self.menuView.addAction(self.actionConsole_Window)
        self.menuView.addAction(self.actionTerrainTools_Window)
        self.menuView.addAction(self.actionToggleViewportGrid)
        self.menubar.addAction(self.menuFile.menuAction())
        self.menubar.addAction(self.menuEdit.menuAction())
        self.menubar.addAction(self.menuView.menuAction())
        self.retranslateUi()
        QtCore.QMetaObject.connectSlotsByName(self)

    def retranslateUi(self):
        """Apply translated user-visible strings (Qt Designer convention)."""
        self.setWindowTitle(QtGui.QApplication.translate("MainWindow", "MainWindow", None, QtGui.QApplication.UnicodeUTF8))
        self.menuFile.setTitle(QtGui.QApplication.translate("MainWindow", "File", None, QtGui.QApplication.UnicodeUTF8))
        self.menuEdit.setTitle(QtGui.QApplication.translate("MainWindow", "Edit", None, QtGui.QApplication.UnicodeUTF8))
        self.menuView.setTitle(QtGui.QApplication.translate("MainWindow", "View", None, QtGui.QApplication.UnicodeUTF8))
        self.actionNeu.setText(QtGui.QApplication.translate("MainWindow", "New Module", None, QtGui.QApplication.UnicodeUTF8))
        self.actionMove.setText(QtGui.QApplication.translate("MainWindow", "Move", None, QtGui.QApplication.UnicodeUTF8))
        self.actionRotate.setText(QtGui.QApplication.translate("MainWindow", "Rotate", None, QtGui.QApplication.UnicodeUTF8))
        self.actionSceneExplorer.setText(QtGui.QApplication.translate("MainWindow", "Module Explorer", None, QtGui.QApplication.UnicodeUTF8))
        self.actionPreferences.setText(QtGui.QApplication.translate("MainWindow", "Preferences", None, QtGui.QApplication.UnicodeUTF8))
        self.actionProperty_Window.setText(QtGui.QApplication.translate("MainWindow", "Property Window", None, QtGui.QApplication.UnicodeUTF8))
        self.actionObject_Selection.setText(QtGui.QApplication.translate("MainWindow", "Object Selection", None, QtGui.QApplication.UnicodeUTF8))
        self.actionClose.setText(QtGui.QApplication.translate("MainWindow", "Quit", None, QtGui.QApplication.UnicodeUTF8))
        self.actionConsole_Window.setText(QtGui.QApplication.translate("MainWindow", "Console Window", None, QtGui.QApplication.UnicodeUTF8))

    def setupOgre(self, pluginCfgPath="./Plugins.cfg", ogreCfgPath="./ogre.cfg", logPath="./ogre.log"):
        """Boot the Ogre root, create the main scene manager and the Ogre
        render widget, and hook Ogre logging into the console window.

        :param pluginCfgPath: overridden below per platform
        :param ogreCfgPath: Ogre render-system config file
        :param logPath: Ogre log file
        """
        # The default pluginCfgPath parameter is always replaced here.
        if platform.system() == "Windows":
            pluginCfgPath = "./Plugins-windows.cfg"
        else:
            pluginCfgPath = "./Plugins-linux.cfg"
        root = og.Root(pluginCfgPath, ogreCfgPath, logPath)
        self.ogreRoot = root
        if not self.ogreRoot.restoreConfig() and not self.ogreRoot.showConfigDialog():
            sys.exit('Quit from Config Dialog')
        root.initialise(False)
        self.pivotRenderQueueListener = PivotRenderQueueListener()
        self.OgreMainWinSceneMgr = self.ogreRoot.createSceneManager(og.ST_GENERIC, "OgreMainWinSceneMgr")
        # NOTE(review): ColourValue components above 1.0 — presumably deliberate
        # over-bright ambient for the editor; confirm.
        self.OgreMainWinSceneMgr.ambientLight = og.ColourValue(4, 4, 4)
        self.OgreMainWinSceneMgr.addRenderQueueListener(self.pivotRenderQueueListener)
        self.moduleName = ""
        self.myTerrainManager = MyTerrainManager(self.OgreMainWinSceneMgr)
        self.moduleManager = ModuleManager(self.ogreRoot, self.OgreMainWinSceneMgr)
        self.moduleManager.myTerrainManager = self.myTerrainManager
        self.gocManager = self.moduleManager.gocManager
        self.ogreMainWindow = OgreMainWindow.OgreMainWindow(self.moduleManager, root, self.OgreMainWinSceneMgr, self)
        self.gridlayout.addWidget(self.ogreMainWindow,0,0,1,1)
        self.hboxlayout.addLayout(self.gridlayout)
        self.setCentralWidget(self.centralwidget)
        self.myTerrainManager.ogreMainWindow = self.ogreMainWindow
        # Forward Ogre's default log into the editor console.
        oglog = og.LogManager.getSingleton().getDefaultLog()
        oglog.addListener(self.consoleWindow.lockenLog)

    def finishEditorSetup(self):
        """One-time deferred setup: load Ogre resources and create the pivot."""
        if not self.editorSetupFinished:
            og.ResourceGroupManager.getSingleton().addResourceLocation("./media", "FileSystem", "General", False)
            og.ResourceGroupManager.getSingleton().initialiseAllResourceGroups()
            self.moduleManager.pivot = Pivot(self.OgreMainWinSceneMgr)
            self.moduleManager.pivot.hide()
            self.editorSetupFinished = True

    def update(self):
        """Timer slot: render one Ogre frame (plus Linux widget refreshes)."""
        self.ogreRoot.renderOneFrame()
        if platform.system() == "Linux":
            self.ogreMainWindow.updateRenderWindow()
            self.modelSelectionDialog.updateRenderWindow()
            self.materialSelectionDialog.updateRenderWindow()

    def actionOpenSlot(self):
        """File > Open: finish deferred setup, then show the load dialog."""
        self.finishEditorSetup()
        self.moduleManager.openLoadModuleDialog()

    def actionNewSlot(self):
        """File > New: run the new-module wizard (modal)."""
        newModuleWiz = NewModuleWizard(self.moduleManager, self)
        newModuleWiz.exec_()
        return

    def actionSaveSlot(self):
        """File > Save: persist the current module."""
        self.moduleManager.save()

    def actionRunModuleSlot(self):
        """File > Save and Run: save, then launch Rastullah.exe (Windows only)."""
        self.moduleManager.save()
        if platform.system() == "Windows":
            workingDir = self.prefDialog.moduleCfgPath.replace("/modules/modules.cfg", "")
            executable = os.path.join(workingDir, "Rastullah.exe")
            executable = executable.replace("/", "\\")
            if os.path.isfile(executable):
                subprocess.Popen([executable, "--module", self.moduleManager.mainModule.name], 0, None, None, None, None, None, False, False, workingDir)

    def actionQuitSlot(self):
        """File > Quit."""
        self.close()

    def actionDeleteSlot(self):
        """Edit > Delete selection."""
        self.moduleManager.deleteObjects()

    def actionCopySlot(self):
        """Edit > Copy selection."""
        self.moduleManager.copyObjects()

    def actionCutSlot(self):
        """Edit > Cut selection."""
        self.moduleManager.cutObjects()

    def actionPasteSlot(self):
        """Edit > Paste at the viewport ray under the cursor."""
        self.moduleManager.pasteObjects(self.ogreMainWindow.getCameraToViewportRay())

    def actionSelectSlot(self):
        """Switch to select mode (hide the transform pivot)."""
        self.moduleManager.pivot.hide()

    def actionMoveSlot(self):
        """Switch the pivot to move mode."""
        self.moduleManager.pivot.setMoveMode()

    def actionRotateSlot(self):
        """Switch the pivot to rotate mode."""
        self.moduleManager.pivot.setRotateMode()

    def actionScaleSlot(self):
        """Switch the pivot to scale mode."""
        self.moduleManager.pivot.setScaleMode()

    def actionOneClickEntityPlacementSlot(self):
        """Toggle one-click entity placement from the checkable action."""
        self.moduleManager.setOneClickEntityPlacement(self.actionOneClickEntityPlacement.isChecked())

    def togglePreferencesWindow(self):
        """Show/hide the preferences dialog."""
        if self.prefDialog.isHidden():
            self.prefDialog.show()
        else:
            self.prefDialog.hide()

    def toggleModelPreviewWindow(self):
        """Show/hide the model preview dock."""
        if self.modelSelectionDock.isHidden():
            self.modelSelectionDock.show()
        else:
            self.modelSelectionDock.hide()

    def toggleMaterialPreviewWindow(self):
        """Show/hide the material preview dock."""
        if self.materialSelectionDock.isHidden():
            self.materialSelectionDock.show()
        else:
            self.materialSelectionDock.hide()

    def toggleGameObjectViewWindow(self):
        """Show/hide the game-object-class dock."""
        if self.gameObjectClassViewDock.isHidden():
            self.gameObjectClassViewDock.show()
        else:
            self.gameObjectClassViewDock.hide()

    def toggleModuleExplorer(self):
        """Show/hide the module explorer dock."""
        if self.moduleExplorerDock.isHidden():
            self.moduleExplorerDock.show()
        else:
            self.moduleExplorerDock.hide()

    def toggleModuleDirView(self):
        """Show/hide the module directory dock."""
        if self.moduleDirectoryViewDock.isHidden():
            self.moduleDirectoryViewDock.show()
        else:
            self.moduleDirectoryViewDock.hide()

    def togglePropertyWindow(self):
        """Show/hide the property dock."""
        if self.propertyDock.isHidden():
            self.propertyDock.show()
        else:
            self.propertyDock.hide()

    def toggleConsoleWindow(self):
        """Show/hide the console dock."""
        if self.consoleDock.isHidden():
            self.consoleDock.show()
        else:
            self.consoleDock.hide()

    def toggleTerrainToolsWindow(self):
        """Show/hide the terrain-tools dock."""
        if self.myTerrainManagerDock.isHidden():
            self.myTerrainManagerDock.show()
        else:
            self.myTerrainManagerDock.hide()

    def toggleViewportGrid(self):
        """Toggle the grid overlay in the Ogre viewport."""
        self.ogreMainWindow.toggleViewportGrid()

    def createDockWindows(self):
        """Create all dock widgets and toolbars and attach them to the window."""
        self.modelSelectionDock = QtGui.QDockWidget(self.tr("Models"), self)
        self.modelSelectionDock.setObjectName("ModelSelectionDockWindow")
        self.modelSelectionDock.setAllowedAreas(QtCore.Qt.LeftDockWidgetArea | QtCore.Qt.RightDockWidgetArea)
        self.modelSelectionDock.setWidget(self.modelSelectionDialog)
        self.addDockWidget(QtCore.Qt.RightDockWidgetArea, self.modelSelectionDock)
        self.materialSelectionDock = QtGui.QDockWidget(self.tr("Materials"), self)
        self.materialSelectionDock.setObjectName("MaterialSelectionDockWindow")
        self.materialSelectionDock.setAllowedAreas(QtCore.Qt.LeftDockWidgetArea | QtCore.Qt.RightDockWidgetArea)
        self.materialSelectionDock.setWidget(self.materialSelectionDialog)
        self.addDockWidget(QtCore.Qt.RightDockWidgetArea, self.materialSelectionDock)
        self.tabifyDockWidget(self.modelSelectionDock, self.materialSelectionDock)
        self.gameObjectClassViewDock = QtGui.QDockWidget(self.tr("GameObjectClasses"), self)
        self.gameObjectClassViewDock.setObjectName("GameObjectClassView")
        self.gameObjectClassViewDock.setAllowedAreas(QtCore.Qt.LeftDockWidgetArea | QtCore.Qt.RightDockWidgetArea)
        self.gameObjectClassViewDock.setWidget(self.gameObjectClassView)
        self.addDockWidget(QtCore.Qt.RightDockWidgetArea, self.gameObjectClassViewDock)
        self.tabifyDockWidget(self.modelSelectionDock, self.gameObjectClassViewDock)
        self.propertyDock = QtGui.QDockWidget(self.tr("Properties"), self)
        self.propertyDock.setObjectName("PropertyDockWindow")
        self.propertyDock.setAllowedAreas(QtCore.Qt.LeftDockWidgetArea | QtCore.Qt.RightDockWidgetArea)
        self.propertyDock.setWidget(self.objectPropertyWin)
        self.addDockWidget(QtCore.Qt.LeftDockWidgetArea, self.propertyDock)
        self.moduleExplorerDock = QtGui.QDockWidget(self.tr("Module Explorer"), self)
        self.moduleExplorerDock.setObjectName("ModuleExplorerDockWindow")
        self.moduleExplorerDock.setAllowedAreas(QtCore.Qt.LeftDockWidgetArea | QtCore.Qt.RightDockWidgetArea)
        self.moduleExplorerDock.setWidget(self.moduleExplorerWin)
        self.addDockWidget(QtCore.Qt.LeftDockWidgetArea, self.moduleExplorerDock)
        self.tabifyDockWidget(self.moduleExplorerDock, self.propertyDock)
        self.moduleDirectoryViewDock = QtGui.QDockWidget(self.tr("Module Directory View"), self)
        self.moduleDirectoryViewDock.setObjectName("ModuleDirectoryViewDockWindow")
        self.moduleDirectoryViewDock.setAllowedAreas(QtCore.Qt.LeftDockWidgetArea | QtCore.Qt.RightDockWidgetArea | QtCore.Qt.TopDockWidgetArea | QtCore.Qt.BottomDockWidgetArea)
        self.moduleDirectoryViewDock.setWidget(self.moduleDirectoryViewWin)
        self.addDockWidget(QtCore.Qt.LeftDockWidgetArea, self.moduleDirectoryViewDock)
        # Terrain manager supplies its own dock widget.
        self.myTerrainManagerDock = self.myTerrainManager.getDockWidget(self)
        self.consoleDock = QtGui.QDockWidget(self.tr("Console"), self)
        self.consoleDock.setObjectName("ConsoleDockWindow")
        self.consoleDock.setAllowedAreas(QtCore.Qt.BottomDockWidgetArea | QtCore.Qt.TopDockWidgetArea)
        self.consoleDock.setWidget(self.consoleWindow)
        self.addDockWidget(QtCore.Qt.BottomDockWidgetArea, self.consoleDock)
        self.fileToolBar = self.addToolBar("File Toolbar")
        self.fileToolBar.setObjectName("FileToolBar")
        self.fileToolBar.setAllowedAreas(QtCore.Qt.TopToolBarArea | QtCore.Qt.BottomToolBarArea)
        self.fileToolBar.addAction(self.actionNeu)
        self.fileToolBar.addAction(self.actionOpen)
        self.fileToolBar.addAction(self.actionSave)
        self.fileToolBar.addAction(self.actionRunModule)
        self.fileToolBar.addAction(self.actionClose)
        self.addToolBar(QtCore.Qt.TopToolBarArea, self.fileToolBar)
        self.moveToolBar = self.addToolBar("Transformation Bar")
        self.moveToolBar.setObjectName("TransformationBar")
        self.moveToolBar.setAllowedAreas(QtCore.Qt.TopToolBarArea | QtCore.Qt.BottomToolBarArea)
        self.moveToolBar.addAction(self.actionSelect)
        self.moveToolBar.addAction(self.actionMove)
        self.moveToolBar.addAction(self.actionRotate)
        self.moveToolBar.addAction(self.actionScale)
        self.addToolBar(QtCore.Qt.TopToolBarArea, self.moveToolBar)

    def keyPressEvent(self, event):
        """Forward non-auto-repeat key presses to the Ogre render widget."""
        if not event.isAutoRepeat():
            self.ogreMainWindow.keyPressEvent(event)

    def keyReleaseEvent(self, event):
        """Forward non-auto-repeat key releases to the Ogre render widget."""
        if not event.isAutoRepeat():
            self.ogreMainWindow.keyReleaseEvent(event)
        pass

    def onContextMenuCallback(self, actions, menus):
        """Build and show the viewport context menu with the standard edit
        actions plus caller-supplied menus/actions."""
        menu = QMenu("My Menu!!")
        menu.addAction(self.actionDelete)
        menu.addAction(self.actionCopy)
        menu.addAction(self.actionCut)
        menu.addAction(self.actionPaste)
        menu.addSeparator()
        for m in menus:
            menu.addMenu(m)
        for a in actions:
            menu.addAction(a)
        menu.exec_(QCursor.pos())

    def connectActionButtons(self):
        """Placeholder — no extra button wiring yet."""
        pass

    def saveOnClose(self):
        """Hook for save-on-exit confirmation; currently always allows close."""
        # reply = QtGui.QMessageBox.question(self, "Rastullahs Lockenwickler - Unsaved Chages", "Save unsaved changes?", QtGui.QMessageBox.Yes|QtGui.QMessageBox.No|QtGui.QMessageBox.Cancel)
        # if reply == QtGui.QMessageBox.Cancel:
        #     return False
        # if reply == QtGui.QMessageBox.Yes:
        #     print""
        #     #TODO: implement save here
        return True

    def closeEvent(self, event):
        """Persist settings on close; veto the close when saveOnClose fails."""
        if self.saveOnClose():
            settings = QtCore.QSettings()
            settings.setValue("Preferences/moduleCfgPath", QtCore.QVariant(self.prefDialog.lineEdit.text()))
            settings.setValue("MainWindow/Geometry", QtCore.QVariant(self.saveGeometry()))
            # NOTE(review): key "MainWIndow/DockWindows" (capital I) does not
            # match the "MainWindow/DockWindows" key read in __init__, so the
            # dock layout is never restored — likely a typo.
            settings.setValue("MainWIndow/DockWindows", QtCore.QVariant(self.saveState()))
            settings.setValue("Preferences/externalEditorPath", QtCore.QVariant(self.prefDialog.externalTextAppLineEdit.text()))
            #self.ogreRoot.shutdown()
        else:
            event.ignore()
def main():
    """Estimate multivariate Value-at-Risk by sampling points on the
    hyperellipsoid defined by each day's estimated covariance matrix, then
    persist the per-day VaR estimates."""
    preprocesser = PreProcessor()
    mm = ModuleManager()

    # NOTE(review): np.array(...) as a default argument is a shared mutable
    # default — harmless here since it is never mutated, but fragile.
    def generate_random_points_on_hyperellipsoid(vol_data, cor_data,
                                                 alpha_vec=np.array([0.9, 0.95, 0.975, 0.99]),
                                                 n_sample=int(1e4), dim=30):
        """For each row (day) of vol_data/cor_data, sample n_sample points on
        the covariance hyperellipsoid and record, per confidence level alpha,
        the largest absolute equally-weighted average — the VaR estimate.

        :param vol_data: per-day volatilities (one column per asset)
        :param cor_data: per-day correlation-vector estimates
        :param alpha_vec: confidence levels (also the result column header)
        :param n_sample: random points per day
        :param dim: number of assets
        :return: DataFrame, one row per day, one column per alpha
        """
        header = alpha_vec
        result = pd.DataFrame(columns=header)
        for i in range(vol_data.shape[0]):
            start_time = time.time()
            var_estimates = []
            vol_mat = np.diag(vol_data.iloc[i, :])
            cor_mat = preprocesser.construct_correlation_matrix(corr_vec=cor_data.iloc[i, :], n=dim)
            H = preprocesser.construct_covariance_matrix(vol_matrix=vol_mat, corr_matrix=cor_mat)
            r = np.random.randn(H.shape[0], n_sample)
            # u contains random points on the unit hypersphere
            u = r / np.linalg.norm(r, axis=0)
            for alpha in alpha_vec:
                y = np.sqrt(chi2.ppf(q=alpha, df=dim))
                # Transform points on the unit hypersphere to the hyperellipsoid
                # NOTE(review): y already is sqrt(chi2.ppf(...)); np.sqrt(y)
                # applies a second square root — this matches the
                # "_sqrt_chi2" output filename below, so presumably a
                # deliberate experiment; confirm before "fixing".
                xrandom = sqrtm(H).dot(np.sqrt(y) * u)
                # Compute the lowest (equally) weighted average of random points on the hyperellipsoid.
                # This is the maximum loss with alpha percent probability, i.e. Value-at-Risk
                # (despite the "_min" name this is a max of absolute means).
                xrandom_min = np.max(np.abs(np.array([np.mean(x) for x in xrandom.T])))
                var_estimates.append(xrandom_min)
            result = pd.merge(result, pd.DataFrame(np.asarray(var_estimates).reshape(1, -1), columns=header), how='outer')
            # Progress: (row index, seconds taken for this row).
            print((i, time.time() - start_time))
        return result

    ##################################################################################################################
    ###                                    Multivariate Quantile Computation                                       ###
    ##################################################################################################################
    dim = 30
    vol_data = mm.load_data('multivariate_analysis/volatilities_garch_norm_DJI30_2000_2001.pkl')
    #cor_data = mm.load_data('multivariate_analysis/cor_DCC_mvnorm_DJI30_1994_1995.pkl')
    cor_data = mm.load_data(
        'multivariate_analysis/pearson/pearson_cor_estimates/cor_knn5_pearson_10_DJI30_2000_2001.pkl'
    )
    result = generate_random_points_on_hyperellipsoid(vol_data=vol_data, cor_data=cor_data)
    print(result)
    #mm.save_data('multivariate_analysis/VaR/var_dcc_mvnorm_1994_1995_nsample_1e6.pkl', result)
    #mm.transform_pickle_to_csv('multivariate_analysis/VaR/var_dcc_mvnorm_1994_1995_nsample_1e6.pkl')
    mm.save_data(
        'multivariate_analysis/VaR/var_knn5_pearson_garch_2000_2001_nsample_1e5_sqrt_chi2.pkl', result)
    mm.transform_pickle_to_csv(
        'multivariate_analysis/VaR/var_knn5_pearson_garch_2000_2001_nsample_1e5_sqrt_chi2.pkl'
    )
# NOTE(review): this __init__ is textually identical to PreProcessor.__init__;
# its enclosing class is not visible in this chunk.
    def __init__(self):
        """Initializer PreProcessor object."""
        self.ta = TechnicalAnalyzer()  # correlation-estimation helpers
        self.mm = ModuleManager()      # pickle load/save helpers
class PreProcessor(object):
    """Preprocessor class. This class has the responsibility to preprocess the data. More specifically, the class
    has the task of simulating random correlated asset paths in the bivariate case. Additionally, the class has
    the responsibility for estimating the uncertainty in the output variable through a bootstrap resampling
    procedure."""

    def __init__(self):
        """Initializer PreProcessor object."""
        self.ta = TechnicalAnalyzer()  # correlation-estimation helpers
        self.mm = ModuleManager()      # pickle load/save helpers

    def simulate_random_correlation_ar(self, T, a0, a1):
        """Simulate a random correlation process with highly persistent time-varying correlations
        following an auto-regressive process, with additive Gaussian noise.

        :param T: simulation length
        :param a0: AR(1) intercept
        :param a1: AR(1) persistence parameter
        :return: random_corr: correlation process following specified dynamics."""
        eps = 1e-5  # keep the process strictly inside the open interval (-1, 1)
        random_corr = np.empty(T)
        random_corr[0] = a0 / (1 - a1)  # initialise at the unconditional mean
        for t in range(1, T):
            eta = np.random.normal(0, 0.2)  # innovation noise
            # Clamp each AR(1) step into the valid correlation range.
            random_corr[t] = np.maximum(
                -1 + eps, np.minimum(1 - eps, a0 + a1 * random_corr[t - 1] + eta))
        return random_corr

    def simulate_correlated_asset_paths(self, corr_vector, vol_matrix, T):
        """Simulate asset paths with specified time-varying correlation dynamics.

        :param corr_vector: time-varying correlation vector
        :param vol_matrix: volatility matrix
        :param T: simulation length
        :return: correlated_asset_paths: simulated asset paths with specified correlation dynamics."""
        if corr_vector.ndim == 1:
            size = 2  # a 1-d vector encodes a single (bivariate) correlation per step
        else:
            size = corr_vector.shape[1]  # no of columns, i.e. no of assets
        z = np.random.normal(0, 1, (T, size))  # T-by-number of assets draws from N(0,1)
        correlated_asset_paths = np.empty([T, size])  # T x size array for correlated asset paths
        for t, rho in enumerate(corr_vector):
            corr_matrix = self.construct_correlation_matrix(rho, size)
            cov_matrix = self.construct_covariance_matrix(vol_matrix, corr_matrix)
            cholesky_factor = self.cholesky_factorization(cov_matrix)  # Cholesky decomposition
            # Generating Y_t = H_t^(0.5) * z_t
            correlated_asset_paths[t] = np.dot(cholesky_factor, z[t].transpose())
        return correlated_asset_paths

    def construct_correlation_matrix(self, corr_vec, n):
        """Method for constructing time-varying correlation matrix given a time-varying
        correlations vector.

        :param corr_vec: time-varying correlation vector (upper-triangle entries)
        :param n: dimension correlation matrix
        :return corr_matrix: time-varying correlation matrix"""
        corr_triu = np.zeros((n, n))
        # Indices for the upper-triangular part with diagonal offset of 1.
        iu1 = np.triu_indices(n, 1)
        corr_triu[iu1] = corr_vec  # assign correlations to upper-triangle positions
        # Mirror the upper triangle and add a unit diagonal to obtain a symmetric matrix.
        corr_matrix = corr_triu + corr_triu.T + np.eye(n)
        return corr_matrix

    def construct_covariance_matrix(self, vol_matrix, corr_matrix):
        """Method for constructing time-varying covariance matrix given a time-varying
        correlation matrix and asset volatility matrix: H = V * R * V.

        :param vol_matrix: diagonal matrix containing asset volatilities
        :param corr_matrix: time-varying correlation matrix
        :return: cov_matrix: time-varying covariance matrix."""
        cov_matrix = np.dot(vol_matrix, np.dot(corr_matrix, vol_matrix))
        return cov_matrix

    def cholesky_factorization(self, cov_matrix):
        """Method for matrix decomposition through Cholesky factorization. The Cholesky factorization
        states that every symmetric positive definite matrix A has a unique factorization A = LL'
        where L is a lower-triangular matrix and L' is its conjugate transpose.

        :param cov_matrix: time-varying positive definite covariance matrix
        :return: cholesky_factor: lower-triangular matrix L such that LL' = cov_matrix"""
        cholesky_factor = np.linalg.cholesky(cov_matrix)
        return cholesky_factor

    def determinant_LU_factorization(self, corr_vec, n):
        """Method for determining the determinant of the correlation matrix implied by a
        correlation vector. Determinants are computed using LU factorization (np.linalg.det).

        :param corr_vec: time-varying correlation vector
        :param n: dimension correlation matrix
        :return: determinant."""
        cor_matrix = self.construct_correlation_matrix(corr_vec, n)
        det = np.linalg.det(cor_matrix)
        return det

    def generate_bivariate_dataset(self, ta, simulated_data_process, dt, proxy_type='pearson', T=500):
        """Method for generating a bivariate dataset with moving window correlation estimates in the
        covariate set and either the true correlation or the estimate itself as output variable.

        :param ta: technical analyzer object
        :param simulated_data_process: bivariate asset process with predefined correlation dynamics.
        :param dt: window length
        :param proxy_type: type definition of proxy for estimates of true correlation
        :param T: length test set
        :return: datasets with true correlation and proxy for output variable."""
        # BUG FIX: comparisons used `is 'pearson'` (identity, not equality) — replaced with `==`.
        if proxy_type == 'pearson':
            pearson_estimates = ta.moving_window_correlation_estimation(
                simulated_data_process.iloc[:, :2], dt)
            # Feature set: lagged asset prices and lagged mw correlation estimate, e.g. x_t = MW_t-1
            dataset = simulated_data_process.iloc[:, :2].shift(periods=1, axis='index')  # Dataframe
            dataset['MW_t-1'] = pearson_estimates.shift(periods=1, axis='index')
            dataset_proxy = dataset.copy()  # copy feature matrix
            # True correlations as target variable vs. proxy as target variable.
            dataset['rho_true'] = simulated_data_process['rho']
            dataset_proxy['rho_proxy'] = pearson_estimates
        else:  # Kendall as proxy
            kendall_estimates = ta.moving_window_correlation_estimation(
                simulated_data_process.iloc[:, :2], dt, proxy_type='kendall')
            # Feature set: lagged asset prices and lagged kendall estimate, e.g. x_t = kendall_t-1
            dataset = simulated_data_process.iloc[:, :2].shift(periods=1, axis='index')  # Dataframe
            dataset['Kendall_t-1'] = kendall_estimates.shift(periods=1, axis='index')
            dataset_proxy = dataset.copy()  # copy feature matrix
            dataset['rho_true'] = simulated_data_process['rho']
            dataset_proxy['rho_proxy'] = kendall_estimates
        return dataset, dataset_proxy

    def generate_multivariate_dataset(self, ta, data, dt, proxy_type='pearson'):
        """Method for generating a multivariate dataset with moving window estimates as approximation
        for true correlation constructing the set of covariates and output variable.

        :param ta: technical analyzer object
        :param data: dataframe with log returns
        :param dt: window length
        :param proxy_type: type definition of proxy for estimates of true correlation
        :return: dataset with approximated covariates and output variable."""
        correlation_estimates = ta.moving_window_correlation_estimation(data, dt, proxy_type=proxy_type)
        # Covariates: lagged correlation estimates and lagged min./max. asset returns.
        dataset = correlation_estimates.shift(periods=1, axis='index')
        dataset['r_min'] = np.min(data, axis=1).shift(periods=1, axis='index')
        dataset['r_max'] = np.max(data, axis=1).shift(periods=1, axis='index')
        # Output variables: the (unlagged) correlation estimates themselves.
        result = pd.concat([dataset, correlation_estimates], axis=1, join='inner')
        return result

    def bootstrap_moving_window_estimate(self, data, delta_t, T=500, reps=1000, ciw=99, proxy_type='pearson'):
        """Method for measuring the estimation uncertainty associated to the correlation coefficients
        when moving window estimates are used for approximating true correlations.

        :param data: dataset used for the task of bootstrap resampling
        :param delta_t: window length for moving window estimates of the correlation coefficient
        :param T: length of test set
        :param reps: number of bootstrap samples
        :param ciw: confidence interval width
        :param proxy_type: type definition of proxy for estimates of true correlation (pearson, emw, kendall)
        :return: correlation estimates with associated estimation uncertainty."""
        assets_price = data.tail(T + delta_t - 1).iloc[:, :-1]
        assets_price.reset_index(drop=True, inplace=True)
        rho_true = data.tail(T).iloc[:, -1]
        rho_true.reset_index(drop=True, inplace=True)
        rho_estimates = np.full(T, np.nan)
        sd_rho_estimates = np.full(T, np.nan)      # bootstrapped standard error of rho estimates
        lower_percentiles = np.full(T, np.nan)     # lower percentile values
        upper_percentiles = np.full(T, np.nan)     # upper percentile values
        p_low = (100 - ciw) / 2
        p_high = 100 - p_low
        for j, t in enumerate(range(delta_t, T + delta_t)):
            sampling_data = np.asarray(assets_price.iloc[t - delta_t:t, :])
            # Bootstrap resampling procedure: draw samples of size delta_t by randomly
            # extracting time units with uniform probability, with replacement.
            rho_bootstrapped = np.full(reps, np.nan)
            for rep in range(reps):
                indices = np.random.randint(low=0, high=sampling_data.shape[0], size=delta_t)
                sample = sampling_data[indices]
                # BUG FIX: string comparisons used `is` (identity) — replaced with `==`.
                if proxy_type == 'emw':
                    # Bootstrap for exponentially weighted moving window estimates.
                    w = self.ta.exponential_weights(delta_t, delta_t / 3)
                    weight_vec_raw = w[indices]
                    sum_w = np.sum(weight_vec_raw)
                    weight_vec_norm = [i / sum_w for i in weight_vec_raw]  # re-normalize weights to one
                    rho_bootstrapped[rep] = \
                        self.ta.pearson_weighted_correlation_estimation(sample[:, 0], sample[:, 1],
                                                                        delta_t, weight_vec_norm)
                elif proxy_type == 'pearson':
                    rho_bootstrapped[rep] = pearsonr(sample[:, 0], sample[:, 1])[0]
                elif proxy_type == 'kendall':
                    rho_bootstrapped[rep] = kendalltau(sample[:, 0], sample[:, 1])[0]
                else:
                    # BUG FIX: message was missing its closing parenthesis.
                    print('Please, choose an option from the supported set of proxies for true correlations '
                          '(Pearson moving window or Kendall moving window)')
            lower, upper = np.nanpercentile(rho_bootstrapped, [p_low, p_high])
            lower_percentiles[j] = lower
            upper_percentiles[j] = upper
            rho_estimates[j] = np.nanmean(rho_bootstrapped)
            sd_rho_estimates[j] = np.nanstd(rho_bootstrapped)
        return rho_estimates, lower_percentiles, upper_percentiles, sd_rho_estimates

    def bootstrap_learner_estimate(self, data, T=500, reps=1000, ciw=99, model='knn', n_neighbors=5):
        """Method for measuring the estimation uncertainty associated to the correlation coefficients
        when a learner model is used for approximating true correlations.

        :param data: dataset used for the task of bootstrap resampling (mutated in place: first 251
            rows are dropped and the index reset)
        :param T: length of test set
        :param reps: number of bootstrap samples
        :param ciw: confidence interval width
        :param model: learner model (e.g. nearest neighbour or random forest regressors)
        :param n_neighbors: number of multivariate neighbours
        :return: correlation estimates with associated estimation uncertainty."""
        rho_estimates = np.full(T, np.nan)
        sd_rho_estimates = np.full(T, np.nan)      # bootstrapped standard error of rho estimates
        lower_percentiles = np.full(T, np.nan)     # lower percentile values
        upper_percentiles = np.full(T, np.nan)     # upper percentile values
        p_low = (100 - ciw) / 2
        p_high = 100 - p_low
        data.drop(data.head(251).index, inplace=True)  # discard warm-up rows
        data.reset_index(drop=True, inplace=True)
        t_train_init = data.shape[0] - T  # 1000 for T = 500
        for j, t in enumerate(range(t_train_init, data.shape[0])):  # j = {0, 499}, t = {1000, 1499}
            sampling_data = np.asarray(data.iloc[:t, :])  # true rolling window is [j:t, :]
            x_test = np.asarray(data.iloc[t, 0:-1])  # this is in fact x_t+1
            y_test = np.asarray(data.iloc[t, -1])    # this is in fact y_t+1 (kept for reference)
            # Bootstrap resampling: draw a sample of size t with uniform probability, with replacement.
            rho_bootstrapped = np.full(reps, np.nan)
            for rep in range(reps):
                indices = np.random.randint(low=0, high=t, size=t)
                sample = sampling_data[indices]  # sample used to fit the learner model
                # Separate data into feature and response components (vectorized for speed).
                X = np.asarray(sample[:, 0:-1])  # feature matrix
                y = np.asarray(sample[:, -1])    # response vector
                X_train = X[0:t, :]
                y_train = y[0:t]
                # BUG FIX: comparisons used `is` (identity) — replaced with `==`.
                if model == 'knn':
                    # BUG FIX: n_neighbors was hard-coded to 5, silently ignoring the parameter.
                    knn = KNeighborsRegressor(n_neighbors=n_neighbors)
                    rho_bootstrapped[rep] = knn.fit(X_train, y_train).predict(x_test.reshape(1, -1))
                elif model == 'rf':
                    rf = RandomForestRegressor(n_jobs=1, n_estimators=10,
                                               max_features=1).fit(X_train, y_train)
                    rho_bootstrapped[rep] = rf.predict(x_test.reshape(1, -1))
                else:
                    print('Please, choose an option from the supported set of learner algorithms (nearest neighbour, '
                          'random forest)')
            lower, upper = np.nanpercentile(rho_bootstrapped, [p_low, p_high])
            lower_percentiles[j] = lower
            upper_percentiles[j] = upper
            rho_estimates[j] = np.nanmean(rho_bootstrapped)
            sd_rho_estimates[j] = np.nanstd(rho_bootstrapped)
        return rho_estimates, lower_percentiles, upper_percentiles, sd_rho_estimates

    def mse_knn_sensitivity_analysis(self, proxy_type='pearson', output_type='true'):
        """Method for creation of a dataframe containing information on MSE decomposition as a
        function of different parameterizations for knn learner model.

        :param proxy_type: type of moving window estimator used as covariate.
        :param output_type: output variable true correlation or proxy.
        :return: dataframe indexed by neighbourhood size with bias_squared/variance/MSE columns."""
        rho_bias_squared = np.full(1001, np.nan)
        rho_var_vec = np.full(1001, np.nan)
        rho_mse_vec = np.full(1001, np.nan)
        # Small neighbourhood sizes live in one file each; row 10 holds the relevant results.
        for k in (5, 10, 25, 50):
            mse_data = self.mm.load_data(
                'bivariate_analysis/%s_cor/mse_results_%s_cor/mse_knn%d_%s_%s_cor.pkl'
                % (output_type, output_type, k, proxy_type, output_type))
            rho_mse_vec[k], rho_bias_squared[k], rho_var_vec[k] = mse_data.iloc[10, :]
        # Sizes 100..1000 are stacked in a single file, one row per multiple of 100.
        mse_knn_100_to_1000 = self.mm.load_data(
            'bivariate_analysis/%s_cor/mse_results_%s_cor/'
            'mse_knn100_to_1000_%s_%s_cor.pkl' % (output_type, output_type, proxy_type, output_type))
        for row in range(1, 11):
            k = 100 * row
            rho_mse_vec[k], rho_bias_squared[k], rho_var_vec[k] = mse_knn_100_to_1000.iloc[row, :]
        # Dataframe with MSE decomposition as a function of the learner parameterization.
        data_frame = pd.DataFrame({
            'bias_squared': rho_bias_squared,
            'variance': rho_var_vec,
            'MSE': rho_mse_vec
        })
        return data_frame

    def mse_rf_sensitivity_analysis(self, rho_true, proxy_type='pearson', output_type='true', type='trees'):
        """Method for creation of a dataframe containing information on MSE decomposition as a
        function of different parameterizations for rf learner model.

        :param rho_true: vector containing true correlation
        :param proxy_type: type of moving window estimator used as covariate.
        :param output_type: output variable true correlation or proxy.
        :param type: 'trees' varies the forest size; any other value loads the covariate-count
            results. NOTE: the name shadows the `type` builtin but is kept for compatibility.
        :return: dataframe."""
        # BUG FIX: `type is 'trees'` compared identity — replaced with `==`.
        if type == 'trees':
            rho_bias_squared = np.full(1001, np.nan)
            rho_var_vec = np.full(1001, np.nan)
            rho_mse_vec = np.full(1001, np.nan)
            trees = [10, 100, 300, 600, 1000]
            # Recompute bias^2 and variance from raw bootstrap output, per forest size.
            for tree in trees:
                data = self.mm.load_data(
                    'bivariate_analysis/%s_cor/%s/results_rf_%s_%s_cor/'
                    'rf%i_%s_10_estimate_uncertainty_rep_100_%s_corr.pkl'
                    % (output_type, proxy_type, proxy_type, output_type, tree, proxy_type, output_type))
                rho_estimates = data['Rho_estimate']
                rho_bias_squared[tree] = np.mean(np.power(rho_estimates - rho_true, 2))
                rho_var_vec[tree] = np.power(np.mean(data['std rho estimate']), 2)
            rho_mse_vec = np.array(
                [np.sum(pair) for pair in zip(rho_bias_squared, rho_var_vec)])
            data_frame = pd.DataFrame({
                'bias_squared': rho_bias_squared,
                'variance': rho_var_vec,
                'MSE': rho_mse_vec
            })
            filename_save = 'mse_rf_%s_%s_cor_sensitivity_analysis_trees.pkl' % (proxy_type, output_type)
            self.mm.save_data(
                'bivariate_analysis/%s_cor/mse_results_%s_cor/' % (output_type, output_type)
                + filename_save, data_frame)
        else:
            rho_bias_squared = np.full(4, np.nan)
            rho_var_vec = np.full(4, np.nan)
            rho_mse_vec = np.full(4, np.nan)
            # Pre-computed decomposition for 1-3 covariates lives in a single file.
            mse_rf300_1_to_3 = self.mm.load_data(
                'bivariate_analysis/%s_cor/mse_results_%s_cor/'
                'mse_rf300_1_to_3_%s_%s_cor.pkl' % (output_type, output_type, proxy_type, output_type))
            for row in range(1, 4):
                rho_mse_vec[row], rho_bias_squared[row], rho_var_vec[row] = mse_rf300_1_to_3.iloc[row, :]
            data_frame = pd.DataFrame({
                'bias_squared': rho_bias_squared,
                'variance': rho_var_vec,
                'MSE': rho_mse_vec
            })
            filename_save = 'mse_rf_%s_%s_cor_sensitivity_analysis_covariates.pkl' % (proxy_type, output_type)
            self.mm.save_data(
                'bivariate_analysis/%s_cor/mse_results_%s_cor/' % (output_type, output_type)
                + filename_save, data_frame)
        return data_frame
# NOTE(review): enclosing class is outside this chunk; indentation reconstructed.
    def __init__(self):
        """Initialise manager objects and mark the instance as active."""
        self.module_manager = ModuleManager(self)  # passes self — presumably for callbacks; verify
        self.thread_manager = ThreadManager()
        self.active = True  # run flag, presumably cleared on shutdown — confirm against main loop
def main(): preprocesser = PreProcessor() mm = ModuleManager() ta = TechnicalAnalyzer() ################################################################################################################## ### Asset path simulation using Cholesky Factorization and predefined time-varying correlation dynamics ### ################## ############################################################################################### """ T = 1751 a0 = 0.1 a1 = 0.8 random_corr = preprocesser.simulate_random_correlation_ar(T, a0, a1) # Simple volatility matrix with randomly chosen volatilities for illustration purposes vol_matrix = np.array([[0.08, 0], [0, 0.1]]) correlated_asset_paths = preprocesser.simulate_correlated_asset_paths(random_corr, vol_matrix, T) data = pd.DataFrame(correlated_asset_paths) data['rho'] = random_corr mm.save_data('/bivariate_analysis/correlated_sim_data.pkl', data) # Figure correlated_asset_paths = mm.load_data('bivariate_analysis/correlated_sim_data.pkl') correlated_asset_paths = correlated_asset_paths.tail(500) correlated_asset_paths.reset_index(drop=True, inplace=True) plt.plot(correlated_asset_paths.iloc[:, 0], label='$y_{1,t}$', linewidth=1, color='black') plt.plot(correlated_asset_paths.iloc[:, 1], label='$y_{2,t}$', linewidth=1, linestyle='--', color='blue') plt.plot(correlated_asset_paths.iloc[:, -1], label='$\\rho_t$', linewidth=1, color='red') plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=True, edgecolor='black') plt.xlim(0, 500) plt.ylim(-0.5, 1) plt.show() """ ################################################################################################################## ### Estimation uncertainty in Pearson and Kendall correlation coefficient using moving window estimates ### ################################################################################################################## simulated_data_process = mm.load_data('/bivariate_analysis/correlated_sim_data.pkl') T = 500 delta_t = 
[21] #np.arange(3, 252) # 3, 4, 5, 6, 7, 8, 9, 10, 21, 42, 63, 84, 126, 251 proxy_type = ['pearson'] # kendall ['mw', 'emw', 'kendall'] ciw = 99 """ for dt, proxy_type in [(x, y) for x in delta_t for y in proxy_type]: start_time = time.time() print('(%s, %i)' % (proxy_type, dt)) rho_estimates, lower_percentiles, upper_percentiles, sd_rho_estimates = \ preprocesser.bootstrap_moving_window_estimate(data=simulated_data_process, delta_t=dt, T=T, ciw=ciw, proxy_type=proxy_type) data_frame = pd.DataFrame({'Percentile_low': lower_percentiles, 'Percentile_up': upper_percentiles, 'std rho estimate': sd_rho_estimates, 'Rho_estimate': rho_estimates}) filename = '%s_%i_estimate_uncertainty.pkl' % (proxy_type, dt) mm.save_data('bivariate_analysis/' + filename, data_frame) print("%s: %f" % ('Execution time:', (time.time() - start_time))) """ """ # Figures for dt, proxy_type in [(x, y) for x in delta_t for y in proxy_type]: data = mm.load_data('bivariate_analysis/results_%s/%s_%i_estimate_uncertainty.pkl' % (proxy_type, proxy_type, dt)) rho_estimates = data['Rho_estimate'] lower_percentiles = data['Percentile_low'] upper_percentiles = data['Percentile_up'] plt.figure() plt.plot(simulated_data_process['rho'], label='true correlation', linewidth=1, color='black') plt.plot(rho_estimates, label='%s correlation' % proxy_type.upper(), linewidth=1, color='red') plt.plot((upper_percentiles-lower_percentiles)-1, label='%d%% interval (bootstrap)' % ciw, linewidth=1, color='magenta') #plt.plot(lower_percentiles, label='%d%% interval (bootstrap)' % ciw, linewidth=1, color='magenta') #plt.plot(upper_percentiles, label="", linewidth=1, color='magenta') plt.xlabel('observation') plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=True, edgecolor='black') plt.xlim(0, T) plt.yticks(np.arange(-1, 1.00000001, 0.2)) plt.ylim(-1, 1) plt.show() """ ################################################################################################################## 
### Mean squared error of Pearson and Kendall correlation coefficient using moving window estimates ### ################################################################################################################## simulated_data_process = mm.load_data('/bivariate_analysis/correlated_sim_data.pkl') T = 500 rho_true = simulated_data_process.tail(T).iloc[:, -1] rho_true.reset_index(drop=True, inplace=True) delta_t_min, delta_t_max = 3, 252 delta_t = np.arange(3, 252) # dt = {[3, 10], 21, 42, 63, 126, 251} (and 84 possibly) proxy_type = ['pearson', 'emw', 'kendall'] # run proxies individually otherwise one saves dataframe over other. rho_bias_squared = np.full(delta_t_max, np.nan) rho_var_vec = np.full(delta_t_max, np.nan) """ # Create dataframe with (interpolated) mse results, squared bias, variance for varying window sizes for proxy_type, dt in [(x, y) for x in proxy_type for y in delta_t]: print('%s, %i' % (proxy_type, dt)) data = mm.load_data('bivariate_analysis/%s_%i_estimate_uncertainty.pkl' % (proxy_type, dt)) rho_estimates = data['Rho_estimate'] rho_bias_squared[dt] = np.mean(np.power(rho_estimates - rho_true, 2)) rho_var_vec[dt] = np.power(np.mean(data['std rho estimate']), 2) rho_mse_vec = np.array([np.sum(pair) for pair in zip(rho_bias_squared, rho_var_vec)]) data_frame = pd.DataFrame({'bias_squared': rho_bias_squared, 'variance': rho_var_vec, 'MSE': rho_mse_vec}) filename = 'mse_%s.pkl' % proxy_type mm.save_data('bivariate_analysis/' + filename, data_frame) """ """ # Kendall correlation estimate for col1, col2, in IT.combinations(simulated_data_process.columns[:-1], 2): def my_tau(idx): df_tau = simulated_data_process[[col1, col2]].iloc[idx+len(simulated_data_process)-T-dt+1] return kendalltau(df_tau[col1], df_tau[col2])[0] kendall_estimates = pd.rolling_apply(np.arange(T+dt-1), dt, my_tau) mse_kendall_vec[dt - 1] = mean_squared_error(rho_true, kendall_estimates[-T:]) mm.save_data('/bivariate_analysis/mse_kendall_true_corr.pkl', mse_kendall_vec) 
print("%s: %f" % ('Execution time:', (time.time() - start_time))) """ """ # Load MSE data Pearson/ Kendall mse_pearson_vec = mm.load_data('bivariate_analysis/mse_pearson.pkl') mse_kendall_vec = mm.load_data('bivariate_analysis/mse_kendall.pkl') """ """ # Figure without interpolation MSE plt.figure(1) plt.plot(mse_pearson_vec['MSE'], label='Pearson', color='indigo', linewidth=1) plt.plot(mse_kendall_vec['MSE'], label='Kendall', color='aquamarine', linewidth=1, linestyle='--') plt.xlabel('window length') plt.ylabel('MSE') plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=5, fancybox=True, edgecolor='black') plt.xlim(0, 250) plt.yticks(np.arange(0, 0.61, 0.1)) plt.ylim(0, 0.6) plt.show() """ """ # Figure without interpolation MSE decomposition plt.figure(2) plt.plot(mse_kendall_vec['bias_squared'], label='Squared Bias', color='blue', linewidth=1) plt.plot(mse_kendall_vec['variance'], label='Variance', color='red', linewidth=1) plt.plot(mse_kendall_vec['MSE'], label='MSE', color='black', linestyle='--', linewidth=1) plt.xlabel('window length') plt.ylabel('MSE') plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=5, fancybox=True, edgecolor='black') plt.xlim(0, 250) plt.yticks(np.arange(0, 0.61, 0.1)) plt.ylim(0, 0.6) plt.show() """ """ # Variance in MSE window sizes var_mse_pearson = np.nanvar(mse_pearson_vec['MSE']); print('mse_pearson_var: %f' % var_mse_pearson) var_mse_kendall = np.nanvar(mse_kendall_vec['MSE']); print('mse_kendall_var: %f' % var_mse_kendall) # Max-min in MSE window sizes print('mse_pearson_min_max: (%f, %f)' % (np.nanmin(mse_pearson_vec['MSE']), np.nanmax(mse_pearson_vec['MSE']))) print('mse_kendall_min_max: (%f, %f)' % (np.nanmin(mse_kendall_vec['MSE']), np.nanmax(mse_kendall_vec['MSE']))) """ ################################################################################################################## ### Minimum Determinant Pearson and Kendall Moving Window ### 
################################################################################################################## # Get information on the minimum determinants over all corrlation estimates for all window sizes [3, 100] delta_t = range(3, 101) det_min_vec = np.full(101, np.nan) proxy_type = 'pearson' """ for dt in delta_t: # Load data Pearson/ Kendall det_data_vec = np.full(501, np.nan) filename = '%s_%i_estimate_uncertainty.pkl' % (proxy_type, dt) data = mm.load_data('bivariate_analysis/results_%s/%s' % (proxy_type, filename)) # Compute determinants for every dataset for i, rho in enumerate(data['Rho_estimate']): det_data_vec[i+1] = preprocesser.determinant_LU_factorization(rho, 2) det_min_vec[dt] = np.nanmin(det_data_vec) mm.save_data('bivariate_analysis/determinant_min_%s.pkl' % proxy_type, det_min_vec) """ """ # Plot minimum determinants of Pearson and Kendal Moving Window estimates of correlation det_min_pearson = mm.load_data('bivariate_analysis/determinant_min_pearson.pkl') det_min_kendall = mm.load_data('bivariate_analysis/determinant_min_kendall.pkl') plt.figure(1) plt.plot(det_min_pearson, label='Pearson', linewidth=1, color='orange') plt.plot(det_min_kendall, label='Kendall', linewidth=1) plt.xlabel('window length') plt.ylabel('minimum det($R_t)$') plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=2, fancybox=True, edgecolor='black') plt.xlim(0, 100) plt.yticks(np.arange(-0.1, 1.1, 0.1)) plt.ylim(-0.1, 1) plt.show() """ ################################################################################################################## ### Dataset creation ### ################################################################################################################## # Pearson and Kendall correlation moving window estimates as covariate and true correlation or moving window # estimate as proxy for output variable simulated_data_process = mm.load_data('/bivariate_analysis/correlated_sim_data.pkl') delta_t_min = 5 
delta_t_max = 6 proxy_type = ['kendall'] # ['pearson', 'emw', 'kendall'] """ start_time = time.time() for dt, proxy_type in [(x,y) for x in range(delta_t_min, delta_t_max) for y in proxy_type]: print('(%i, %s)' % (dt, proxy_type)) dataset, dataset_proxy = \ preprocesser.generate_bivariate_dataset(ta, simulated_data_process, dt, proxy_type=proxy_type) mm.save_data('/bivariate_analysis/true_cor/%s/data/dataset_%s_%d.pkl' % (proxy_type, proxy_type, dt), dataset) mm.save_data('/bivariate_analysis/proxy_cor/%s/data/dataset_%s_%d.pkl' % (proxy_type, proxy_type, dt), dataset_proxy) print("%s: %f" % ('Execution time:', (time.time() - start_time))) """ ################################################################################################################## ### Estimation uncertainty in Pearson and Kendall correlation coefficient using machine learner estimates ### ################################################################################################################## simulated_data_process = mm.load_data('/bivariate_analysis/correlated_sim_data.pkl') T = 500 rho_true = simulated_data_process.tail(T).iloc[:, -1] rho_true.reset_index(drop=True, inplace=True) ciw = 99 reps = 1000 delta_t = [21] # dt = {[3, 10], 21, 42, 63, 126, 251} (and 84 possibly) model = ['knn'] # k-nearest neighbour: 'knn', random forest: 'rf' proxy_type = ['pearson', 'kendall'] output_type = ['true', 'proxy'] n_neighbours = [5] """ for dt, proxy_type, model, k, output_type in [(x, y, z, k, o) for x in delta_t for y in proxy_type for z in model for k in n_neighbours for o in output_type]: start_time = time.time() print('(%i, %s, %s, %i)' % (dt, proxy_type, model, k)) dataset = mm.load_data('bivariate_analysis/%s_cor/%s/data/dataset_mw_%i.pkl' % (output_type, proxy_type, dt)) rho_estimates, lower_percentiles, upper_percentiles, sd_rho_estimates = \ preprocesser.bootstrap_learner_estimate(data=dataset, reps=reps, model=model, n_neighbors=k) data_frame = pd.DataFrame({'Percentile_low': 
                               lower_percentiles, 'Percentile_up': upper_percentiles,
                               'std rho estimate': sd_rho_estimates, 'Rho_estimate': rho_estimates})
    filename = '%s5_%s_%i_estimate_uncertainty_%s_corr.pkl' % (model, proxy_type, dt, output_type)
    mm.save_data('bivariate_analysis/%s_cor/%s/results_%s_%s_%s_cor/' %
                 (output_type, proxy_type, model, proxy_type, output_type) + filename, data_frame)
print("%s: %f" % ('Execution time', (time.time() - start_time)))
"""

"""
# Figure with bootstrap uncertainty Nearest Neighbors
for dt, proxy_type in [(x, y) for x in delta_t for y in proxy_type]:
    print('(%s, %i)' % (proxy_type, dt))
    data = mm.load_data('bivariate_analysis/proxy_cor/%s/results_knn_%s_proxy_cor/'
                        'knn5_%s_%i_estimate_uncertainty_proxy_corr.pkl' % (proxy_type, proxy_type, proxy_type, dt))
    rho_estimates = data['Rho_estimate']
    lower_percentiles = data['Percentile_low']
    upper_percentiles = data['Percentile_up']

    plt.figure()
    plt.plot(simulated_data_process['rho'], label='true correlation', linewidth=1, color='black')
    plt.plot(rho_estimates, label='KNN correlation', linewidth=1, color='red')
    plt.plot((upper_percentiles - lower_percentiles) - 1, label='%d%% interval (bootstrap)' % ciw,
             linewidth=1, color='magenta')
    plt.xlabel('observation')
    plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=True,
               edgecolor='black')
    plt.xlim(0, T)
    plt.yticks(np.arange(-1, 1.00000001, 0.2))
    plt.ylim(-1, 1)
    plt.show()
"""

"""
# Figure with bootstrap uncertainty Random Forest
for proxy_type, output_type in [(x, y) for x in proxy_type for y in output_type]:
    filename = 'rf10_%s_21_estimate_uncertainty_rep_1000_%s_corr.pkl' % (proxy_type, output_type)
    print(filename)
    data = mm.load_data('bivariate_analysis/%s_cor/%s/results_rf_%s_%s_cor/%s' %
                        (output_type, proxy_type, proxy_type, output_type, filename))
    rho_estimates = data['Rho_estimate']
    lower_percentiles = data['Percentile_low']
    upper_percentiles = data['Percentile_up']

    plt.figure(1)
    plt.plot(simulated_data_process['rho'],
             label='true correlation', linewidth=1, color='black')
    plt.plot(rho_estimates, label='RF correlation', linewidth=1, color='red')
    plt.plot((upper_percentiles - lower_percentiles) - 1, label='%d%% interval (bootstrap)' % ciw,
             linewidth=1, color='magenta')
    #plt.plot(lower_percentiles, label='%d%% interval (bootstrap)' % ciw, linewidth=1, color='magenta')
    #plt.plot(upper_percentiles, label="", linewidth=1, color='magenta')
    plt.xlabel('observation')
    plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=True,
               edgecolor='black')
    plt.xlim(0, T)
    plt.yticks(np.arange(-1, 1.1, 0.2))
    plt.ylim(-1, 1)
    plt.show()
"""

##################################################################################################################
###     Mean squared error of Pearson/Kendall correlation coefficient using machine learner estimates          ###
##################################################################################################################
# Live code: load the simulated correlation process and fix the evaluation window / hyper-parameter
# grids consumed by the (mostly commented-out) experiment drivers below.
simulated_data_process = mm.load_data('/bivariate_analysis/correlated_sim_data.pkl')
T = 500  # number of trailing observations evaluated
# Last column of the simulated process is taken as the true correlation path; the figure code above
# plots simulated_data_process['rho'] as 'true correlation' -- presumably the same column.
rho_true = simulated_data_process.tail(T).iloc[:, -1]
rho_true.reset_index(drop=True, inplace=True)
ciw = 99     # interval width (%) quoted in the '%d%% interval (bootstrap)' figure labels
reps = 1000  # NOTE(review): matches the 'rep_1000' tag in result filenames -- confirm against generator
delta_t = [10]  # range(3, 101)  # dt = {[3, 10], 21, 42, 63, 126, 251} (and 84 possibly)
model = ['rf']  # k-nearest neighbour: 'knn', random forest: 'rf'
proxy_type = ['pearson']
output_type = ['true']
n_neighbour = [10, 100, 300, 600, 1000]  # 5, 10, 25, 50, 100, len_train, IDW
# Accumulators indexed directly by n_neighbour in the commented driver below, hence length 1001
# (largest neighbour count is 1000).
rho_bias_squared = np.full(1001, np.nan)
rho_var_vec = np.full(1001, np.nan)
rho_mse_vec = np.full(1001, np.nan)

"""
# Create dataframe with (interpolated) mse results, squared bias, variance for varying window lengths
for model, n_neighbour, proxy_type, dt, output_type in [(w, k, x, y, z) for w in model for k in n_neighbour
                                                        for x in proxy_type for y in delta_t
                                                        for z in output_type]:
    filename = '%s%i_%s_%i_estimate_uncertainty_rep_100_%s_corr.pkl' % (model, n_neighbour, proxy_type, dt,
                                                                        output_type)
    print(filename)
    data = 
        mm.load_data('bivariate_analysis/%s_cor/%s/results_%s_%s_%s_cor/' %
                     (output_type, proxy_type, model, proxy_type, output_type) + filename)
    rho_estimates = data['Rho_estimate']
    rho_bias_squared[n_neighbour] = np.mean(np.power(rho_estimates-rho_true, 2))
    rho_var_vec[n_neighbour] = np.power(np.mean(data['std rho estimate']), 2)

rho_mse_vec = np.array([np.sum(pair) for pair in zip(rho_bias_squared, rho_var_vec)])
data_frame = pd.DataFrame({'bias_squared': rho_bias_squared, 'variance': rho_var_vec, 'MSE': rho_mse_vec})
filename_save = 'mse_%s_%s_%s_cor_sensitivity_analysis_trees.pkl' % (model, proxy_type, output_type)
print(filename_save)
mm.save_data('bivariate_analysis/%s_cor/mse_results_%s_cor/' % (output_type, output_type) + filename_save,
             data_frame)
"""

## Load MSE data Pearson/ Kendall
# Live code: pre-computed MSE pickles for the plain rolling-window estimators.
mse_pearson_vec = mm.load_data('bivariate_analysis/mse_pearson.pkl')
mse_kendall_vec = mm.load_data('bivariate_analysis/mse_kendall.pkl')

## Load MSE data KNN
# Naming scheme: mse_knn<k>_<proxy>_<output>, where <k> is the neighbour count
# ('len_train' = all training points, 'IDW' = inverse-distance weighting),
# <proxy> is the covariate correlation type and <output> the target ('true' or 'proxy').
# True Correlation
mse_knn5_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn5_pearson_true_cor.pkl')
mse_knn10_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn10_pearson_true_cor.pkl')
mse_knn25_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn25_pearson_true_cor.pkl')
mse_knn50_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn50_pearson_true_cor.pkl')
mse_knn100_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn100_pearson_true_cor.pkl')
mse_knn_len_train_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn_len_train_pearson_true_cor.pkl')
mse_knn_IDW_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn_IDW_pearson_true_cor.pkl')

mse_knn5_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn5_kendall_true_cor.pkl')
mse_knn10_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn10_kendall_true_cor.pkl')
mse_knn25_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn25_kendall_true_cor.pkl')
mse_knn50_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn50_kendall_true_cor.pkl')
mse_knn100_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn100_kendall_true_cor.pkl')
mse_knn_len_train_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn_len_train_kendall_true_cor.pkl')
mse_knn_IDW_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn_IDW_kendall_true_cor.pkl')

# Proxy Correlation
mse_knn5_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn5_pearson_proxy_cor.pkl')
mse_knn10_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn10_pearson_proxy_cor.pkl')
mse_knn25_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn25_pearson_proxy_cor.pkl')
mse_knn50_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn50_pearson_proxy_cor.pkl')
mse_knn100_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn100_pearson_proxy_cor.pkl')
mse_knn_len_train_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn_len_train_pearson_proxy_cor.pkl')
mse_knn_IDW_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn_IDW_pearson_proxy_cor.pkl')

mse_knn5_kendall_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn5_kendall_proxy_cor.pkl')
mse_knn_len_train_kendall_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn_len_train_kendall_proxy_cor.pkl')
mse_knn_IDW_kendall_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn_IDW_kendall_proxy_cor.pkl')

## Load MSE data RF
# Naming scheme: mse_rf<n_trees>_<proxy>_<output>.
# True Correlation
mse_rf10_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf10_pearson_true_cor.pkl')
mse_rf100_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf100_pearson_true_cor.pkl')
mse_rf300_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf300_pearson_true_cor.pkl')
mse_rf1000_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf1000_pearson_true_cor.pkl')

mse_rf10_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf10_kendall_true_cor.pkl')
mse_rf100_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf100_kendall_true_cor.pkl')
mse_rf300_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf300_kendall_true_cor.pkl')
mse_rf1000_kendall_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf1000_kendall_true_cor.pkl')

# Proxy Correlation
mse_rf10_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_rf10_pearson_proxy_cor.pkl')
mse_rf10_kendall_proxy = mm.load_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_rf10_kendall_proxy_cor.pkl')

# Figure without interpolation MSE
"""
plt.figure(1)
plt.plot(mse_pearson_vec['MSE'], label='Pearson', color='indigo', linewidth=1)
#plt.plot(mse_kendall_vec['MSE'], label='Kendall', color='cyan', linestyle='--', linewidth=1)
plt.plot(mse_knn5_pearson_proxy['MSE'], label='KNN(5)-Pearson', linewidth=1, color='brown')
#plt.plot(mse_knn5_kendall_proxy['MSE'], label='KNN(5)-Kendall', linewidth=1, color='xkcd:azure')
#plt.plot(mse_knn10_pearson_proxy['MSE'], label='KNN(10)', linewidth=1)
#plt.plot(mse_knn25_pearson_proxy['MSE'], label='KNN(25)', linewidth=1)
#plt.plot(mse_knn50_pearson_proxy['MSE'], label='KNN(50)', 
             linewidth=1)
plt.plot(mse_knn100_pearson_proxy['MSE'], label='KNN(100)', linewidth=1)
plt.plot(mse_knn_IDW_pearson_proxy['MSE'], label='KNN(idw)-Pearson', color='black', linewidth=1)
plt.plot(mse_rf10_pearson_proxy['MSE'], label='RF(10)', linewidth=1)
#plt.plot(mse_knn_IDW_kendall_true['MSE'], label='KNN_kendall_idw', linewidth=1, color='xkcd:azure')
#plt.plot(mse_knn_len_train_pearson_true['MSE'], label='KNN_pearson_len_train', linewidth=1)
#plt.plot(mse_knn_len_train_pearson_proxy['MSE'], label='KNN_pearson_len_train', color='black', linewidth=1)
#plt.plot(mse_knn_IDW_pearson_proxy['MSE'], label='KNN_pearson_IDW', color='black', linewidth=1)
plt.xlabel('window length')
plt.ylabel('MSE')
plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=7, fancybox=True,
           edgecolor='black')
plt.xlim(0, 100)
plt.yticks(np.arange(0, 0.61, 0.1))
plt.ylim(0, 0.60)
plt.show()
"""

# Figure without interpolation MSE decomposition
"""
plt.figure(2)
plt.plot(mse_knn_IDW_kendall_true['bias_squared'], label='Squared Bias', color='blue', linewidth=1)
plt.plot(mse_knn_IDW_kendall_true['variance'], label='Variance', color='red', linewidth=1)
plt.plot(mse_knn_IDW_kendall_true['MSE'], label='MSE', color='black', linestyle='--', linewidth=1)
plt.xlabel('window length')
plt.ylabel('MSE')
plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=True,
           edgecolor='black')
plt.xlim(0, 100)
plt.yticks(np.arange(0, 0.31, 0.02))
plt.ylim(0, 0.2)
plt.show()
"""

# Figure with interpolation MSE decomposition sensitivity analysis
"""
mse_knn_pearson_true_cor_sa = preprocesser.mse_knn_sensitivity_analysis()
mm.save_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn_pearson_true_cor_sensitivity_analysis.pkl',
             mse_knn_pearson_true_cor_sa)
mse_knn_kendall_true_cor_sa = preprocesser.mse_knn_sensitivity_analysis(proxy_type='kendall')
mm.save_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_knn_kendall_true_cor_sensitivity_analysis.pkl',
             mse_knn_kendall_true_cor_sa)
"""

"""
mse_knn_pearson_proxy_cor_sa = preprocesser.mse_knn_sensitivity_analysis(output_type='proxy')
mm.save_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn_pearson_proxy_cor_sensitivity_analysis.pkl',
             mse_knn_pearson_proxy_cor_sa)
mse_knn_kendall_proxy_cor_sa = preprocesser.mse_knn_sensitivity_analysis(proxy_type='kendall', output_type='proxy')
mm.save_data('bivariate_analysis/proxy_cor/mse_results_proxy_cor/mse_knn_kendall_proxy_cor_sensitivity_analysis.pkl',
             mse_knn_kendall_proxy_cor_sa)
"""

"""
plt.figure(3)
xs = np.arange(1001)
s1mask = np.isfinite(mse_knn_pearson_proxy_cor_sa['bias_squared'])
s2mask = np.isfinite(mse_knn_pearson_proxy_cor_sa['variance'])
s3mask = np.isfinite(mse_knn_pearson_proxy_cor_sa['MSE'])
plt.plot(xs[s1mask], mse_knn_pearson_proxy_cor_sa['bias_squared'][s1mask], label='Squared Bias', color='blue',
         linestyle='-', linewidth=1, marker='.')
plt.plot(xs[s2mask], mse_knn_pearson_proxy_cor_sa['variance'][s2mask], label='Variance', color='red',
         linestyle='-', linewidth=1, marker='.')
plt.plot(xs[s3mask], mse_knn_pearson_proxy_cor_sa['MSE'][s3mask], label='MSE', color='black',
         linestyle='--', linewidth=1, marker='.')
plt.xlabel('number of neighbours')
plt.ylabel('MSE')
plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=True,
           edgecolor='black')
plt.xlim(0, 100)
plt.xticks([5, 10, 25, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000])
plt.yticks(np.arange(0, 0.21, 0.02))
plt.ylim(0, 0.2)
plt.show()
"""

"""
# Variance in MSE window sizes for KNN with Pearson/ Kendall as covariates.
# True Correlation
#var_mse_knn5_pearson_true = np.nanvar(mse_knn5_pearson_true['MSE']); print('mse_knn5_pearson_var: %.8f' % var_mse_knn5_pearson_true)
#var_mse_knn5_kendall_true = np.nanvar(mse_knn5_kendall_true['MSE']); print('mse_knn5_kendall_var: %.8f' % var_mse_knn5_kendall_true)
#var_mse_knn_len_train_pearson_true = np.nanvar(mse_knn_len_train_pearson_true['MSE']); print('mse_knn_len_train_pearson_var: %.13f' % var_mse_knn_len_train_pearson_true)
#var_mse_knn_IDW_pearson_true = np.nanvar(mse_knn_IDW_pearson_true['MSE']); print('mse_knn_IDW_pearson_var: %.9f' % var_mse_knn_IDW_pearson_true)
#var_mse_knn_len_train_kendall_true = np.nanvar(mse_knn_len_train_kendall_true['MSE']); print('mse_knn_len_train_pearson_var: %f' % var_mse_knn_len_train_kendall_true)
#var_mse_knn_IDW_kendall_true = np.nanvar(mse_knn_IDW_kendall_true['MSE']); print('mse_knn_IDW_pearson_var: %f' % var_mse_knn_IDW_kendall_true)

# Proxy Correlation
#var_mse_knn5_pearson_proxy = np.nanvar(mse_knn5_pearson_proxy['MSE']); print('mse_knn5_pearson_proxy_var: %.6f' % var_mse_knn5_pearson_proxy)
#var_mse_knn5_kendall_proxy = np.nanvar(mse_knn5_kendall_proxy['MSE']); print('mse_knn5_kendall_proxy_var: %.6f' % var_mse_knn5_kendall_proxy)
#var_mse_knn_len_train_pearson_proxy = np.nanvar(mse_knn_len_train_pearson_proxy['MSE']); print('mse_knn_len_train_pearson_proxy_var: %.8f' % var_mse_knn_len_train_pearson_proxy)
#var_mse_knn_len_train_kendall_proxy = np.nanvar(mse_knn_len_train_kendall_proxy['MSE']); print('mse_knn_len_train_kendall_proxy_var: %.9f' % var_mse_knn_len_train_kendall_proxy)
#var_mse_knn_IDW_pearson_proxy = np.nanvar(mse_knn_IDW_pearson_proxy['MSE']); print('mse_knn_IDW_pearson_proxy_var: %.8f' % var_mse_knn_IDW_pearson_proxy)
#var_mse_knn_IDW_kendall_proxy = np.nanvar(mse_knn_IDW_kendall_proxy['MSE']); print('mse_knn_IDW_kendall_proxy_var: %.8f' % var_mse_knn_IDW_kendall_proxy)

# Max-min in MSE window sizes for KNN with Pearson/ Kendall as covariates.
# True Correlation
#print('mse_knn5_pearson_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn5_pearson_true['MSE']), np.nanmax(mse_knn5_pearson_true['MSE'])))
#print('mse_knn5_kendall_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn5_kendall_true['MSE']), np.nanmax(mse_knn5_kendall_true['MSE'])))
#print('mse_knn_len_train_pearson_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn_len_train_pearson_true['MSE']), np.nanmax(mse_knn_len_train_pearson_true['MSE'])))
#print('mse_knn_IDW_pearson_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn_IDW_pearson_true['MSE']), np.nanmax(mse_knn_IDW_pearson_true['MSE'])))
#print('mse_knn_len_train_kendall_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn_len_train_kendall_true['MSE']), np.nanmax(mse_knn_len_train_kendall_true['MSE'])))
#print('mse_knn_IDW_kendall_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn_IDW_kendall_true['MSE']), np.nanmax(mse_knn_IDW_kendall_true['MSE'])))

# Proxy Correlation
#print('mse_knn5_pearson_proxy_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn5_pearson_proxy['MSE']), np.nanmax(mse_knn5_pearson_proxy['MSE'])))
#print('mse_knn5_kendall_proxy_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn5_kendall_proxy['MSE']), np.nanmax(mse_knn5_kendall_proxy['MSE'])))
#print('mse_knn_len_train_pearson_proxy_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn_len_train_pearson_proxy['MSE']), np.nanmax(mse_knn_len_train_pearson_proxy['MSE'])))
#print('mse_knn_len_train_kendall_proxy_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn_len_train_kendall_proxy['MSE']), np.nanmax(mse_knn_len_train_kendall_proxy['MSE'])))
#print('mse_knn_IDW_pearson_proxy_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn_IDW_pearson_proxy['MSE']), np.nanmax(mse_knn_IDW_pearson_proxy['MSE'])))
#print('mse_knn_IDW_kendall_proxy_min_max: (%.4f, %.4f)' % (np.nanmin(mse_knn_IDW_kendall_proxy['MSE']), np.nanmax(mse_knn_IDW_kendall_proxy['MSE'])))
"""

"""
# Variance in MSE window sizes for RF with Pearson/ Kendall as covariates.
# True Correlation
#var_mse_rf10_pearson_true = np.nanvar(mse_rf10_pearson_true['MSE']); print('var_mse_rf10_pearson_true: %.8f' % var_mse_rf10_pearson_true)
#var_mse_rf10_kendall_true = np.nanvar(mse_rf10_kendall_true['MSE']); print('var_mse_rf10_kendall_true: %.8f' % var_mse_rf10_kendall_true)

# Proxy Correlation
var_mse_rf10_pearson_proxy = np.nanvar(mse_rf10_pearson_proxy['MSE']); print('var_mse_rf10_pearson_proxy: %.6f' % var_mse_rf10_pearson_proxy)
var_mse_rf10_kendall_proxy = np.nanvar(mse_rf10_kendall_proxy['MSE']); print('var_mse_rf10_kendall_proxy: %.6f' % var_mse_rf10_kendall_proxy)

# Max-min in MSE window sizes for RF with Pearson/ Kendall as covariates.
# True Correlation
#print('mse_rf10_pearson_min_max: (%.4f, %.4f)' % (np.nanmin(mse_rf10_pearson_true['MSE']), np.nanmax(mse_rf10_pearson_true['MSE'])))
#print('mse_rf10_kendall_min_max: (%.4f, %.4f)' % (np.nanmin(mse_rf10_kendall_true['MSE']), np.nanmax(mse_rf10_kendall_true['MSE'])))

# Proxy Correlation
print('mse_rf10_pearson_proxy_min_max: (%.4f, %.4f)' % (np.nanmin(mse_rf10_pearson_proxy['MSE']), np.nanmax(mse_rf10_pearson_proxy['MSE'])))
print('mse_rf10_kendall_proxy_min_max: (%.4f, %.4f)' % (np.nanmin(mse_rf10_kendall_proxy['MSE']), np.nanmax(mse_rf10_kendall_proxy['MSE'])))
"""

"""
# Figure without interpolation MSE
plt.figure(4)
plt.plot(mse_knn10_pearson_proxy['MSE'], label='KNN(10)-Pearson', linewidth=1)
plt.plot(mse_pearson_vec['MSE'], label='Pearson', color='indigo', linewidth=1)
#plt.plot(mse_kendall_vec['MSE'], label='Kendall', color='cyan', linestyle='--', linewidth=1)
plt.plot(mse_knn_IDW_pearson_proxy['MSE'], label='KNN(idw)-Pearson', color='black', linewidth=1)
plt.plot(mse_knn100_pearson_proxy['MSE'], label='KNN(100)-Pearson', color='red', linewidth=1)
plt.plot(mse_rf10_pearson_proxy['MSE'], label='RF(10)-Pearson', color='goldenrod', linewidth=1)
#plt.plot(mse_rf10_kendall_proxy['MSE'], label='RF(10)-Kendall', color='xkcd:teal', linewidth=1)
plt.xlabel('window length')
plt.ylabel('MSE')
plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.13), ncol=3, fancybox=True,
           edgecolor='black')
plt.xlim(0, 100)
plt.yticks(np.arange(0, 0.61, 0.1))
plt.ylim(0, 0.6)
plt.show()
"""

# Figure without interpolation MSE decomposition
"""
mse_dt_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_dt_pearson_true_cor.pkl')
mse_rf10_2_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf10_2_pearson_true_cor.pkl')
mse_rf10_3_pearson_true = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf10_3_pearson_true_cor.pkl')

plt.figure(5)
plt.plot(mse_rf10_kendall_proxy['bias_squared'], label='Squared Bias', color='blue', linewidth=1)
plt.plot(mse_rf10_kendall_proxy['variance'], label='Variance', color='red', linewidth=1)
plt.plot(mse_rf10_kendall_proxy['MSE'], label='MSE', color='black', linestyle='--', linewidth=1)
#plt.plot(mse_dt_pearson_true, label='dt_squared_bias', linewidth=1)
plt.xlabel('window length')
plt.ylabel('MSE')
plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=True,
           edgecolor='black')
plt.xlim(0, 100)
plt.yticks(np.arange(0, 0.61, 0.02))
plt.ylim(0, 0.3)
plt.show()
"""

"""
# Figure with interpolation MSE decomposition sensitivity analysis number of covariates
mse_rf_pearson_true_cor_sa = mm.load_data('bivariate_analysis/true_cor/mse_results_true_cor/mse_rf300_1_to_3_pearson_true_cor.pkl')

plt.figure(3)
xs = np.arange(4)
s1mask = np.isfinite(mse_rf_pearson_true_cor_sa['bias_squared'])
s2mask = np.isfinite(mse_rf_pearson_true_cor_sa['variance'])
s3mask = np.isfinite(mse_rf_pearson_true_cor_sa['MSE'])
plt.plot(xs[s1mask], mse_rf_pearson_true_cor_sa['bias_squared'][s1mask], label='Squared Bias', color='blue',
         linestyle='-', linewidth=1, marker='.')
plt.plot(xs[s2mask], mse_rf_pearson_true_cor_sa['variance'][s2mask], label='Variance', color='red',
         linestyle='-', linewidth=1, marker='.')
plt.plot(xs[s3mask], mse_rf_pearson_true_cor_sa['MSE'][s3mask], label='MSE', color='black',
         linestyle='--', linewidth=1, marker='.')
plt.xlabel('number of covariates')
plt.ylabel('MSE')
plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=True,
           edgecolor='black')
plt.xlim(0, 3)
plt.xticks([0, 1, 2, 3])
plt.yticks(np.arange(0, 0.21, 0.02))
plt.ylim(0, 0.2)
plt.show()
"""

"""
# Figure with interpolation MSE decomposition sensitivity analysis number of trees
mse_rf_pearson_true_cor_sa_trees = preprocesser.mse_rf_sensitivity_analysis(rho_true=rho_true)
mse_rf_kendall_true_cor_sa_trees = preprocesser.mse_rf_sensitivity_analysis(
    rho_true=rho_true, proxy_type='kendall', output_type='true', type='trees')
mse_rf_pearson_proxy_cor_sa_trees = preprocesser.mse_rf_sensitivity_analysis(rho_true=rho_true, output_type='proxy')
mse_rf_kendall_proxy_cor_sa_trees = preprocesser.mse_rf_sensitivity_analysis(
    rho_true=rho_true, proxy_type='kendall', output_type='proxy', type='trees')

plt.figure(4)
xs = np.arange(1001)
s1mask = np.isfinite(mse_rf_kendall_true_cor_sa_trees['bias_squared'])
s2mask = np.isfinite(mse_rf_kendall_true_cor_sa_trees['variance'])
s3mask = np.isfinite(mse_rf_kendall_true_cor_sa_trees['MSE'])
plt.plot(xs[s1mask], mse_rf_kendall_true_cor_sa_trees['bias_squared'][s1mask], label='Squared Bias', color='blue',
         linestyle='-', linewidth=1, marker='.')
plt.plot(xs[s2mask], mse_rf_pearson_true_cor_sa_trees['variance'][s2mask], label='Variance', color='red',
         linestyle='-', linewidth=1, marker='.')
plt.plot(xs[s3mask], mse_rf_pearson_true_cor_sa_trees['MSE'][s3mask], label='MSE', color='black',
         linestyle='--', linewidth=1, marker='.')
plt.xlabel('number of estimators')
plt.ylabel('MSE')
plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=3, fancybox=True,
           edgecolor='black')
plt.xlim(0, 1000)
plt.xticks([10, 100, 300, 600, 1000])
plt.yticks(np.arange(0, 0.21, 0.02))
plt.ylim(0, 0.2)
plt.show()
"""
##################################################################################################################
###                               Minimum Determinant Learning Algorithms                                      ###
##################################################################################################################
# Rho_estimate
# Get information on the minimum determinants over all correlation estimates for all window sizes [3, 100]
# Live code: parameters for the (commented-out) determinant sweep below; det_min_vec is indexed by
# window size dt, hence length 101 for dt in [3, 100].
delta_t = range(3, 101)
det_min_vec = np.full(101, np.nan)
proxy_type = 'pearson'
output_type = 'true'
learner = 'rf'

"""
for dt in delta_t:
    # Load data Pearson/ Kendall
    det_data_vec = np.full(501, np.nan)
    filename = '%s10_%s_%i_estimate_uncertainty_rep_1000_%s_corr.pkl' % (learner, proxy_type, dt, output_type)
    print(filename)
    data = mm.load_data('bivariate_analysis/%s_cor/%s/results_%s_%s_%s_cor/%s' %
                        (output_type, proxy_type, learner, proxy_type, output_type, filename))

    # Compute determinants for every dataset
    for i, rho in enumerate(data['Rho_estimate']):
        det_data_vec[i+1] = preprocesser.determinant_LU_factorization(rho, 2)
    det_min_vec[dt] = np.nanmin(det_data_vec)

filename_save = 'determinant_min_%s10_%s_%s_cor.pkl' % (learner, proxy_type, output_type)
mm.save_data('bivariate_analysis/%s_cor/det_results_%s_cor/%s' % (output_type, output_type, filename_save),
             det_min_vec)
"""

# Plot minimum determinants of KNN estimates of correlation
# Live code: load the pre-computed per-window minimum determinants for each KNN variant.
# True Cor
det_min_knn5_pearson = mm.load_data('bivariate_analysis/true_cor/det_results_true_cor/determinant_min_knn5_pearson_true_cor.pkl')
det_min_knn5_kendall = mm.load_data('bivariate_analysis/true_cor/det_results_true_cor/determinant_min_knn5_kendall_true_cor.pkl')
det_min_knn_len_train_pearson = mm.load_data('bivariate_analysis/true_cor/det_results_true_cor/determinant_min_knn_len_train_pearson_true_cor.pkl')
det_min_knn_len_train_kendall = mm.load_data('bivariate_analysis/true_cor/det_results_true_cor/determinant_min_knn_len_train_kendall_true_cor.pkl')
det_min_knn_IDW_pearson = mm.load_data('bivariate_analysis/true_cor/det_results_true_cor/determinant_min_knn_IDW_pearson_true_cor.pkl')
det_min_knn_IDW_kendall = mm.load_data('bivariate_analysis/true_cor/det_results_true_cor/determinant_min_knn_IDW_kendall_true_cor.pkl')

# Proxy Cor
det_min_knn5_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/det_results_proxy_cor/determinant_min_knn5_pearson_proxy_cor.pkl')
det_min_knn5_kendall_proxy = mm.load_data('bivariate_analysis/proxy_cor/det_results_proxy_cor/determinant_min_knn5_kendall_proxy_cor.pkl')
det_min_knn_len_train_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/det_results_proxy_cor/determinant_min_knn_len_train_pearson_proxy_cor.pkl')
det_min_knn_len_train_kendall_proxy = mm.load_data('bivariate_analysis/proxy_cor/det_results_proxy_cor/determinant_min_knn_len_train_kendall_proxy_cor.pkl')
det_min_knn_IDW_pearson_proxy = mm.load_data('bivariate_analysis/proxy_cor/det_results_proxy_cor/determinant_min_knn_IDW_pearson_proxy_cor.pkl')
det_min_knn_IDW_kendall_proxy = mm.load_data('bivariate_analysis/proxy_cor/det_results_proxy_cor/determinant_min_knn_IDW_kendall_proxy_cor.pkl')

"""
plt.figure(1)
plt.plot(det_min_knn_IDW_pearson_proxy, label='KNN(idw)-Pearson', linewidth=1, color='orange')
plt.plot(det_min_knn_IDW_kendall_proxy, label='KNN(idw)-Kendall', linewidth=1)
plt.plot(det_min_knn_len_train_pearson_proxy, label='KNN(unif)-Pearson', linewidth=1)
plt.plot(det_min_knn_len_train_kendall_proxy, label='KNN(unif)-Kendall', linewidth=1)
plt.xlabel('window length')
plt.ylabel('minimum det($R_t)$')
plt.legend(fontsize='small', loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=2, fancybox=True,
           edgecolor='black')
plt.xlim(0, 100)
plt.yticks(np.arange(-0.1, 1.1, 0.1))
plt.ylim(-0.1, 1)
plt.show()
"""

"""