def feed(self, pool, dbi):
    from lib import db, logger

    Ctlr_Base.feed(self, pool, dbi)

    extra = {'classname': self.__class__}

    for rss in self._my_feeds:
        if 'title' not in rss:
            rss['title'] = None

        if not ('url' in rss and rss['url']):
            logger.warning('Bad rss host url for %s(%s)',
                           rss['title'], rss.get('url', None), extra=extra)
            continue

        if 'host_url' not in rss:
            rss['host_url'] = self.get_host()['url']

        db.save_feed(rss, dbi=dbi)
        db.save_ctlr_feed({
            'url': rss['url'],
            'classname': str(self.__class__)
        }, dbi=dbi)

        logger.info('%s queued', rss['url'], extra=extra)

        pool.put(rss['url'], self.dispatch_rss_2_0,
                 category=self._parser['format'])

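
# Usage sketch (hypothetical, not from the original source): a minimal
# _my_feeds / _parser definition consistent with the keys feed() reads above
# ('title', 'url', 'host_url', and self._parser['format']). The URL and names
# are placeholders.
_my_feeds = [
    {
        'title': 'Front Page',                      # optional; feed() defaults it to None
        'url': 'http://example.com/rss/index.xml',  # required; entries without it are skipped
        # 'host_url' may be omitted; feed() falls back to self.get_host()['url']
    },
]

_parser = {
    'format': 'rss_2_0',  # used as the pool category when queueing dispatch_rss_2_0
}
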
def detect_word():
    global previous_word

    if 'template.png' in hook.get_image_url():
        return ''

    if hook.get_image_url() not in word_dictionary:
        logger.warning('Unregistered word image: ' + hook.get_image_url())
        return ''

    logger.info('Get word ' + word_dictionary[hook.get_image_url()])
    return word_dictionary[hook.get_image_url()]

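
# Illustrative shape of word_dictionary as used by detect_word() above: it maps
# a word-image URL (as returned by hook.get_image_url()) to the word rendered
# in that image. The URL and word below are placeholders.
word_dictionary = {
    'http://s0urce.io/img/words/example.png': 'example',
}
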
def fetch(payload, dbi=None):
    """Fetch the document at payload['url'].

    The URL that is finally read is written to payload['url_read'] and the
    response body to payload['src'].
    """
    import re
    from lxml.html import fromstring
    from lib import db, DB, logger
    from lib.util.text import to_unicode

    extra = {'classname': 'util.net.fetch()'}

    try:
        uo = urlopen(payload['url'], timeout=HTTP_TIMEOUT)
        if uo.code != 200:
            raise IOError("HTTP response code=%d from %s" % (uo.code, uo.url))

        portal = get_portal(uo.url)
        if portal:
            break_portal(portal, payload, uo)
        else:
            payload['src'] = uo.read()
            payload['url_read'] = uo.url
    except Exception as e:
        # fetch failed; keep the error so it is recorded below via save_fetch
        payload['src'] = 'error ' + unicode(e)
        payload['category'] = 'error'
        payload['exception'] = e

    if 'url_read' not in payload:
        payload['url_read'] = payload['url']

    if dbi is None:
        _dbi = DB()
    else:
        _dbi = dbi

    try:
        db.save_fetch(payload['url'], to_unicode(payload['src']),
                      payload['category'], dbi=_dbi)
    except Exception as e:
        logger.warning('DB save_fetch failed for url %s' % payload['url'],
                       extra=extra)
        logger.debug(e)

    if dbi is None:
        _dbi.disconnect()

    if 'error' == payload['category']:
        # raise the exception to skip the parsing process
        logger.info("failed fetching %s" % payload['url'], extra=extra)
        raise payload['exception']

    return payload

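
# Usage sketch (hypothetical): calling fetch() with a minimal payload. The URL
# and the 'news' category label are placeholders; with dbi=None, fetch() opens
# and closes its own DB() connection as shown above.
payload = {
    'url': 'http://example.com/news/12345.html',  # placeholder URL
    'category': 'news',                           # placeholder category label
}

result = fetch(payload)
# result['url_read']  -> the URL actually read after redirects / portal breaking
# result['src']       -> raw response body (also stored via db.save_fetch)
# On download failure, fetch() re-raises the original exception after logging.
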
def dispatch_rss_2_0(self, payload, pool, dbi):
    """Parse an RSS feed in XML format and hand each entry to the fetcher,
    packing its meta as

    {
      "feed_url": '',
      "title": '',
      "pub_date": ''
    }

    @endpoint
    """
    from xml.dom import minidom
    from lib import logger, db

    try:
        dom = minidom.parseString(payload['src'])
    except:
        logger.warning('failed parsing %s', payload['url'],
                       extra={'classname': self.__class__})
        pool.log_stats('error_parse')
        return

    proc_list = []
    urls = []

    for entry in dom.getElementsByTagName(self._parser['holder']):
        meta = {"feed_url": payload['url']}

        for tag in self._parser['extracts']:
            txt = self.getTextByTagName(entry, tag)
            if txt:
                key = self._parser['extracts'][tag]["key"]
                if 'callback' in self._parser['extracts'][tag]:
                    meta[key] = self._parser['extracts'][tag]['callback'](txt)
                else:
                    meta[key] = txt

        url = meta['url'].encode('utf-8')
        del meta['url']

        # make sure the url is properly percent-encoded (urlencode)
        if any(ord(x) > 127 for x in url):
            if url.startswith('http://') or url.startswith('https://'):
                url = url[:7] + quote(url[7:])
            else:
                url = quote(url)

        proc_list.append({'url': url, 'meta': meta})
        urls.append(url)

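
# Hypothetical _parser configuration matching the lookups in dispatch_rss_2_0()
# above: 'holder' names the per-entry XML element, and each entry in 'extracts'
# maps a child tag to a meta key, optionally with a 'callback' applied to the
# extracted text. The tag names below are illustrative only.
_parser = {
    'format': 'rss_2_0',
    'holder': 'item',                    # one <item> element per feed entry
    'extracts': {
        'title': {'key': 'title'},
        'link': {'key': 'url'},          # becomes meta['url'], then popped and queued
        'pubDate': {'key': 'pub_date'},  # add 'callback': fn to normalize the date string
    },
}
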
def postprocess(
    results: List[Tuple[Strace, List[ScoringResult]]]
) -> List[Tuple[MigrationResult, ValidationResult]]:
    """Postprocess scoring results.

    Parameters
    ----------
    results : List[Tuple[Strace, List[ScoringResult]]]
        Results as returned by ``run``.

    Returns
    -------
    List[Tuple[MigrationResult, ValidationResult]]
    """
    # Process each set of results.
    search_results = []
    for strace, scores in results:
        logger.info(f'Postprocessing: {strace.executable_repr}')
        try:
            # Get setup used for original executable tracing.
            setup = next(
                exe for exe in DOCKERFILE_EXECUTABLES
                if (exe.system == strace.system
                    and exe.executable == strace.executable
                    and exe.arguments == strace.arguments)
            ).setup
            search_result = search.search_for_migration(strace, scores,
                                                        setup=setup)
            if search_result is None:
                logger.warning(
                    f'No migrations were returned while postprocessing '
                    f'`{strace.executable_repr}`, it will not be included in '
                    f'results.')
            else:
                search_results.append(search_result)
        except Exception:
            logger.exception(
                f'Encountered exception while postprocessing '
                f'`{strace.executable_repr}`, it will not be included in '
                f'results')
    return search_results

def loop(gui: GUI):
    while not gui.stopped:
        try:
            if 's0urce.io' not in hook.driver.current_url:
                hook.driver.get("http://s0urce.io")

            if gui.AutoTypeEnable.enabled:
                AutoType.call(gui.AutoTypeKeyDelay.value,
                              gui.AutoTypeReturnDelay.value)

            if gui.AutoHackMsgEnable.enabled and not AutoHackMsg.sent_msg:
                AutoHackMsg.call(gui.AutoHackMsgType.get_selected_index())

            """
            if gui.AutoTarget.enabled:
                AutoTarget.call(gui.TargetPriority.get_selected_index())
            """

            if gui.AutoPort.enabled and not AutoPort.port_clicked:
                AutoPort.call(gui.PortSelection.get_selected_index(),
                              gui.PortDelay.value)
        except selenium_exceptions.WebDriverException as e:
            logger.warning("Ignoring WebDriverException: {}".format(e))

def parse(*args, **kwargs) -> Generator[Strace, None, None]:
    """Parse straces.

    Yields
    ------
    Strace
        Parsed strace.
    """
    # Get all trace directories
    strace_dirs = list(OUTPUT_DIR.glob('*/'))

    # Do nothing if none exist.
    if not strace_dirs:
        return

    # Process each output.
    logger.info(f'Parsing traces for {COLLECTOR_NAME}...')
    for strace_dir in OUTPUT_DIR.glob('*/'):

        # Get path to strace file.
        strace_file = strace_dir / 'strace.txt'
        if not strace_file.exists():
            logger.warning(f'No strace file for {strace_dir.name}')
            continue

        # Get path to metadata file.
        metadata_file = strace_dir / 'metadata.json'
        if not metadata_file.exists():
            logger.warning(f'No metadata file for {strace_dir.name}')
            continue

        # Load metadata.
        with open(metadata_file) as fd:
            metadata = json.load(fd)

        # Skip parsing if execution failed.
        if metadata['returncode']:
            logger.warning('Executable execution failed, skipping')
            continue

        # Parse.
        logger.info(f'Parsing {strace_dir.name}')
        yield (
            parser.parse(
                strace_file,
                system=metadata['system'],
                executable=metadata['executable'],
                arguments=metadata['arguments'],
                collector=COLLECTOR_NAME,
                collector_assigned_id=strace_dir.name,
                strace_file=strace_file,
            )
            .normalize()
        )

    logger.info('Done.')

def dispatch_response(self, payload, pool, dbi):
    """Handle data returned by the fetcher: call parse_response to parse the
    content and store it.

    Input payload format:
    {
      'src': 'RESPONSE_BODY',
      'meta': {
        'feed_url': '',
        'pub_date': 'str'
      }
    }

    Output:
    {
      'html': lxml tree
    }

    @endpoint
    """
    import lxml.html
    from lib import logger, util, db
    from lib.util.dt import to_timestamp
    from lib.util.text import to_unicode

    if not payload:
        pool.log_stats('error_fetch')
        return

    try:
        payload['pub_ts'] = to_timestamp(payload['meta']['pub_date'])
    except KeyError:
        pass

    # dom tree preprocessing
    try:
        html = lxml.html.fromstring(payload['src'])  # lxml handles html encoding
        payload['src'] = to_unicode(payload['src'])  # convert to unicode before storing
    except:
        extra = {'classname': self.__class__}
        logger.warning("HTML parse error, url: %s", payload['url_read'],
                       extra=extra)
        logger.info("Got: %s", payload['src'], extra=extra)
        pool.log_stats('error_parse')
        return

    # canonical url
    url_canonical = html.cssselect('link[rel=canonical]')
    payload['url_canonical'] = url_canonical[0].attrib['href'] \
        if len(url_canonical) > 0 else payload['url_read']

    # drop the charset declaration: the content is guaranteed to be unicode,
    # and leaving it in may mislead the html parser
    tags = html.cssselect('meta[http-equiv=Content-Type]')
    if len(tags) > 0:
        payload['meta']['Content-Type'] = tags[0].attrib['content']
        for x in tags:
            x.drop_tree()

    payload['html'] = html

    self.move_out_of_meta(payload, 'feed_url')

    article = self.parse_response(payload)

    if article:
        # parsed successfully
        self._decorate_article(article)
        db.save_article(article, dbi=dbi)
        pool.log_stats('done_article')
    else:
        # TODO: still write to the article table
        db.save_response(payload, dbi=dbi)
        pool.log_stats('error_parse')

def collect_straces(untraced: Optional[list] = None):
    """Collect straces for all untraced executables.

    By default, collect_straces will collect straces for all executables in
    the `untraced_executables` table, provided that there is not a matching
    definition in the `executables` table that has at least one strace.

    Parameters
    ----------
    untraced : Optional[list]
        List of executable definitions. Each object must have `system`,
        `executable`, `arguments_hash`, and `arguments` attributes. They may
        also have a `setup` attribute, containing a shell script to be run in
        the trace container for setup prior to running the trace. If provided,
        it will be used as the source list of untraced executables. It will
        still be filtered to exclude those that already have a trace in the
        cartographer database.
    """
    # Get filtered executable definitions.
    # Convert to a list of RowProxy so we can use len.
    logger.info('Getting untraced executables...')
    untraced = _get_untraced_executables(subset=untraced)
    logger.info(f'{len(untraced)} unique untraced executables discovered.')

    # Clean output directory.
    if OUTPUT_DIR.exists():
        shutil.rmtree(OUTPUT_DIR)
    OUTPUT_DIR.mkdir(parents=True)

    # If there are no untraced executables, exit after cleaning the output
    # directory. This means that a subsequent call to parse will be a noop.
    if not untraced:
        return

    # Trace each executable.
    logger.info('Tracing executables...')
    for executable in untraced:

        # Unpack
        system = executable.system
        binary = executable.executable
        arguments_hash = executable.arguments_hash.hex()
        arguments = executable.arguments
        setup = getattr(executable, 'setup', '')

        # Get command string.
        cmd_str = shell.join([binary, *arguments])

        # Path for output files.
        strace_dir = (
            OUTPUT_DIR
            / f'{binary.strip("/").replace("/", "_")}_{arguments_hash}'
        )
        strace_dir.mkdir()
        metadata_file = strace_dir / 'metadata.json'

        # Trace.
        logger.info(
            f'Tracing {executable.executable} {executable.arguments} '
            f'({strace_dir.name}).'
        )
        logger.info(f'  Command string: {cmd_str}')
        try:
            subprocess.run(
                [
                    'docker', 'run', '--privileged', '--rm', '-it',
                    '-v', f'{strace_dir}:/traces',
                    DOCKER_IMAGE,
                    cmd_str, 'strace.txt', setup
                ],
                capture_output=True,
                check=True
            )
            returncode = 0
        except subprocess.CalledProcessError as e:
            logger.warning(
                f'Trace failed.\n'
                f'  stdout: {e.stdout}\n'
                f'  stderr: {e.stderr}'
            )
            returncode = e.returncode

        # Write metadata.
        with open(metadata_file, 'w') as fd:
            json.dump(
                {
                    'returncode': returncode,
                    'system': system,
                    'executable': binary,
                    'arguments_hash': arguments_hash,
                    'arguments': arguments,
                },
                fd
            )

    logger.info('Done.')

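
# Illustrative contents of <strace_dir>/metadata.json as written by
# collect_straces() above and consumed by parse(); all values are placeholders.
example_metadata = {
    'returncode': 0,               # non-zero makes parse() skip the trace
    'system': 'debian',            # placeholder system name
    'executable': '/usr/bin/ls',   # placeholder binary path
    'arguments_hash': 'deadbeef',  # hex digest of the argument list
    'arguments': ['-la', '/tmp'],  # placeholder argument list
}
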
def parse(output_dir: Path,
          start_at: str = None) -> Generator[Strace, None, None]:
    """Parse traces from an ansible playbook.

    Parameters
    ----------
    output_dir : Path
        Location of the collection output directory.
    start_at : str
        Dataset to start at.

    Yields
    ------
    Strace
        Parsed strace.
    """
    # Get the final output directory.
    output_dir = TRACE_DIR / output_dir
    output_dir_name = output_dir.stem
    logger.info(f'Parsing straces for {COLLECTOR_NAME} at {output_dir}')

    # Module output directories in output_dir
    module_directories = list(
        sorted((d for d in output_dir.glob('*') if d.is_dir()),
               key=lambda d: int(d.stem)))

    # Advance until finding the start at position
    if start_at is not None:
        while module_directories and module_directories[0].name != start_at:
            module_directories.pop(0)
        if not module_directories:
            logger.warning('Start-at skipped all traces.')

    # Process each module
    for module_dir in module_directories:

        # Read module metadata
        with open(module_dir / 'metadata.json') as metadata_fd:
            metadata = json.load(metadata_fd)
        logger.info(f'Parsing strace {metadata["index"]}: {metadata["name"]}')

        # Get execution result
        result = metadata['result']

        # Skip failed modules
        if result.get('rc', 0):
            logger.warning('Module execution failed, skipping.')
            continue

        # Warn if not changed. Still parsing these for now.
        stdout = result.get('stdout', None)
        if isinstance(stdout, dict) and not stdout.get('changed', False):
            logger.warning('Module execution did not change system state.')

        # Log arguments.
        arg_str = json.dumps(metadata["args"], indent=4, sort_keys=True)
        logger.info(f'Definition:\n{metadata["module"]} {arg_str}')

        # Get strace file path
        strace_file = module_dir / 'strace.txt'

        # Skip if an strace file is not available.
        # This can happen due to permissions issues in tracing.
        if not os.access(strace_file, os.R_OK):
            logger.warning('Cannot read strace file, skipping')
            continue

        # Parse
        logger.info('Parsing strace file...')
        strace = parser.parse(
            strace_file,
            system='ansible',
            executable=metadata['module'],
            arguments=metadata['args'],
            collector=COLLECTOR_NAME,
            collector_assigned_id=f'{output_dir_name}/{metadata["index"]}',
            strace_file=strace_file,
            metadata=metadata,
        )

        # Normalize
        logger.info('Normalizing strace...')
        strace = strace.normalize()

        # Log parsing completion and yield parsed and normalized trace
        logger.info('Done')
        yield strace

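
# Illustrative per-module metadata.json layout consumed by the ansible parse()
# above; only the keys it reads are shown and all values are placeholders.
example_module_metadata = {
    'index': 3,                       # position within the playbook run
    'name': 'Install packages',       # task name (logged only)
    'module': 'apt',                  # becomes the executable field
    'args': {'name': 'curl'},         # becomes the arguments field
    'result': {
        'rc': 0,                      # non-zero -> module is skipped
        'stdout': {'changed': True},  # 'changed' False only triggers a warning
    },
}
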
from lib import logger
import lib
import sys

logger.error('param:%s', sys.argv[0])
logger.error('Hello')
logger.info('creating an instance of auxiliary_module.Auxiliary')
logger.error('created an instance of auxiliary_module.Auxiliary')
logger.info('calling auxiliary_module.Auxiliary.do_something')
logger.debug('finished auxiliary_module--989-')
logger.warning('calling auxiliary_module.some_function()')

for s in range(1):
    logger.info('This is message %d' % s)

logger.info('done with auxiliary_module.some_function()')
logger.debug('This is debug log.')
logger.info('---------------------\r\n')


def test():
    logger.error('Shuo dian sha hao ne?')
    return


def hi():
    test()
    return


print "1"
