Code Example #1
    def feed(self, pool, dbi):
        from lib import db, logger

        Ctlr_Base.feed(self, pool, dbi)
        extra = {'classname': self.__class__}

        for rss in self._my_feeds:
            if ('title' not in rss):
                rss['title'] = None

            if not ('url' in rss and rss['url']):
                logger.warning('Bad rss host url for %s(%s)',
                               rss['title'],
                               rss.get('url', None),
                               extra=extra)
                continue

            if ('host_url' not in rss):
                rss['host_url'] = self.get_host()['url']

            db.save_feed(rss, dbi=dbi)
            db.save_ctlr_feed(
                {
                    'url': rss['url'],
                    'classname': str(self.__class__)
                }, dbi=dbi)

            logger.info('%s queued', rss['url'], extra=extra)

            pool.put(rss['url'],
                     self.dispatch_rss_2_0,
                     category=self._parser['format'])
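The loop above validates each feed dict before queueing it. A minimal sketch of the shape one self._my_feeds entry is expected to have (the field values here are hypothetical):

    rss = {
        'title': 'Front Page',             # optional; defaulted to None above
        'url': 'http://example.com/rss',   # required and must be non-empty
        # 'host_url' is optional; filled from self.get_host()['url'] when missing
    }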
Code Example #2
File: rss_2_0.py Project: clifflu/news-diff
  def feed(self, pool, dbi):
    from lib import db, logger

    Ctlr_Base.feed(self, pool, dbi)
    extra={'classname': self.__class__}

    for rss in self._my_feeds:
      if ('title' not in rss):
        rss['title'] = None
      
      if not ('url' in rss and rss['url']):
        logger.warning('Bad rss host url for %s(%s)', rss['title'], rss.get('url', None), extra=extra)
        continue

      if ('host_url' not in rss):
        rss['host_url'] = self.get_host()['url']

      db.save_feed(rss, dbi = dbi)
      db.save_ctlr_feed({
        'url': rss['url'],
        'classname': str(self.__class__)
      }, dbi = dbi)

      logger.info('%s queued', rss['url'], extra=extra)

      pool.put(rss['url'], self.dispatch_rss_2_0, category = self._parser['format'])
Code Example #3
File: AutoType.py Project: sserve-kr/TypeSense
def detect_word():
    global previous_word
    if 'template.png' in hook.get_image_url():
        return ''
    if hook.get_image_url() not in word_dictionary:
        logger.warning('Unregistered word image: ' + hook.get_image_url())
        return ''
    logger.info('Get word ' + word_dictionary[hook.get_image_url()])
    return word_dictionary[hook.get_image_url()]
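detect_word is essentially a guarded dictionary lookup keyed by the current image URL. A minimal stand-alone sketch of that flow, with hypothetical stand-ins for hook and word_dictionary:

    word_dictionary = {'https://s0urce.io/img/w1.png': 'password'}  # hypothetical entry

    def lookup(image_url: str) -> str:
        if 'template.png' in image_url:            # placeholder image, no word shown yet
            return ''
        return word_dictionary.get(image_url, '')  # '' for unregistered images

    print(lookup('https://s0urce.io/img/w1.png'))  # -> password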
Code Example #4
def fetch(payload, dbi=None):
    """抓取 payload['url'] 的檔案
  並將最終讀取到的 url 寫入 payload['url_read'], response 寫入 payload['src']
  """
    import re
    from lxml.html import fromstring

    from lib import db, DB, logger
    from lib.util.text import to_unicode

    extra = {'classname': 'util.net.fetch()'}

    try:
        uo = urlopen(payload['url'], timeout=HTTP_TIMEOUT)
        if (uo.code != 200):
            raise IOError("HTTP response code=%d from %s" % (uo.code, uo.url))

        portal = get_portal(uo.url)
        if portal:
            break_portal(portal, payload, uo)
        else:
            payload['src'] = uo.read()
            payload['url_read'] = uo.url
    except Exception as e:
        # Fetch failed; keep the error to be recorded later (save_fetch)
        payload['src'] = 'error ' + unicode(e)
        payload['category'] = 'error'
        payload['exception'] = e

    if 'url_read' not in payload:
        payload['url_read'] = payload['url']

    if dbi is None: _dbi = DB()
    else: _dbi = dbi

    try:
        db.save_fetch(payload['url'],
                      to_unicode(payload['src']),
                      payload['category'],
                      dbi=_dbi)
    except Exception as e:
        logger.warning('DB save_fetch failed for url %s' % payload['url'],
                       extra=extra)
        logger.debug(e)

    if dbi is None: _dbi.disconnect()

    if 'error' == payload['category']:
        # raise the exception to skip the parsing process
        logger.info("failed fetching %s" % payload['url'], extra=extra)
        raise payload['exception']

    return payload
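A hedged usage sketch of fetch(): it mutates and returns the payload dict, and re-raises the stored exception on download errors (the URL and category below are made up):

    payload = {'url': 'http://example.com/feed', 'category': 'rss_2_0'}
    try:
        result = fetch(payload)      # opens a default DB() connection when dbi is None
        print(result['url_read'])    # final URL after any portal/redirect handling
    except Exception:
        print('fetch failed; the error was still recorded via save_fetch')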
Code Example #5
File: net.py Project: dehao/news-diff
def fetch(payload, dbi = None):
  """抓取 payload['url'] 的檔案
  並將最終讀取到的 url 寫入 payload['url_read'], response 寫入 payload['src']
  """
  import re
  from lxml.html import fromstring

  from lib import db, DB, logger
  from lib.util.text import to_unicode

  extra = {'classname': 'util.net.fetch()'}

  try:
    uo = urllib.urlopen(payload['url'])
    if (uo.code != 200):
      raise IOError("HTTP response code=%d from %s" % (uo.code, uo.url))

    portal = get_portal(uo.url)
    if portal:
      break_portal(portal, payload, uo)
    else:
      payload['src'] = uo.read()
      payload['url_read'] = uo.url
  except Exception as e:
    # Fetch failed; keep the error to be recorded later (save_fetch)
    payload['src'] = 'error ' + unicode(e)
    payload['category'] = 'error'
    payload['exception'] = e

  if 'url_read' not in payload:
    payload['url_read'] = payload['url']

  if dbi is None: _dbi = DB()
  else: _dbi = dbi
  
  try:
    db.save_fetch(payload['url'], to_unicode(payload['src']), payload['category'], dbi = _dbi)
  except Exception as e:
    logger.warning('DB save_fetch failed for url %s' % payload['url'], extra=extra)
    logger.debug(e)
  
  if dbi is None: _dbi.disconnect()

  if 'error' == payload['category']:
    # raise the exception to skip the parsing process
    logger.warning("failed fetching %s" % payload['url'], extra=extra)
    raise payload['exception']

  return payload
Code Example #6
    def dispatch_rss_2_0(self, payload, pool, dbi):
        """解析 XML 格式的 RSS feed, 打包 meta 轉送給 fetcher 之格式為 {
      "feed_url": '',
      "title": '',
      "pub_date": ''
    }

    @endpoint
    """
        from xml.dom import minidom

        from lib import logger, db

        try:
            dom = minidom.parseString(payload['src'])
        except:
            logger.warning('failed parsing %s',
                           payload['url'],
                           extra={'classname': self.__class__})
            pool.log_stats('error_parse')
            return

        proc_list = []
        urls = []

        for entry in dom.getElementsByTagName(self._parser['holder']):
            meta = {"feed_url": payload['url']}
            for tag in self._parser['extracts']:
                txt = self.getTextByTagName(entry, tag)
                if (txt):
                    key = self._parser['extracts'][tag]["key"]

                    if 'callback' in self._parser['extracts'][tag]:
                        meta[key] = self._parser['extracts'][tag]['callback'](
                            txt)
                    else:
                        meta[key] = txt

            url = meta['url'].encode('utf-8')
            del (meta['url'])

            # Check whether the url is properly percent-encoded (urlencode)
            if (any(map(lambda (x): x > 127, [ord(x) for x in url]))):
                if (url.startswith('http://') or url.startswith('https://')):
                    url = url[:7] + quote(url[7:])
                else:
                    url = quote(url)
            proc_list.append({'url': url, 'meta': meta})
            urls.append(url)
Code Example #7
File: rss_2_0.py Project: clifflu/news-diff
  def dispatch_rss_2_0(self, payload, pool, dbi):
    """解析 XML 格式的 RSS feed, 打包 meta 轉送給 fetcher 之格式為 {
      "feed_url": '',
      "title": '',
      "pub_date": ''
    }

    @endpoint
    """
    from xml.dom import minidom

    from lib import logger, db

    try:
      dom = minidom.parseString(payload['src'])
    except:
      logger.warning('failed parsing %s', payload['url'], extra={'classname': self.__class__})
      pool.log_stats('error_parse')
      return

    proc_list = []
    urls = []

    for entry in dom.getElementsByTagName(self._parser['holder']):
      meta = {"feed_url": payload['url']}
      for tag in self._parser['extracts']:
        txt = self.getTextByTagName(entry, tag)
        if (txt):
          key = self._parser['extracts'][tag]["key"]

          if 'callback' in self._parser['extracts'][tag]:
            meta[key] = self._parser['extracts'][tag]['callback'](txt)
          else:
            meta[key] = txt

      url = meta['url'].encode('utf-8')
      del (meta['url'])

      # Check whether the url is properly percent-encoded (urlencode)
      if (any(map(lambda(x): x > 127, [ord(x) for x in url]))):
        if (url.startswith('http://') or url.startswith('https://')):
          url = url[:7] + quote(url[7:])
        else:
          url = quote(url)
      proc_list.append({'url': url, 'meta': meta})
      urls.append(url)
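The non-ASCII check above works on the raw URL bytes. A Python 3 equivalent of the same percent-encoding fix (the Python 2 code uses urllib.quote; the URL is made up):

    from urllib.parse import quote

    raw = 'http://example.com/新聞'.encode('utf-8')
    if any(b > 127 for b in raw):                       # bytes outside ASCII
        raw = raw[:7] + quote(raw[7:]).encode('ascii')  # keep 'http://' as-is
    print(raw)  # b'http://example.com/%E6%96%B0%E8%81%9E'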
Code Example #8
def postprocess(
    results: List[Tuple[Strace, List[ScoringResult]]]
) -> List[Tuple[MigrationResult, ValidationResult]]:
    """Postprocess scoring results.

    Parameters
    ----------
    results : List[Tuple[Strace, List[ScoringResult]]]
        Results as returned by ``run``.

    Returns
    -------
    List[Tuple[MigrationResult, ValidationResult]]
    """
    # Process each set of results.
    search_results = []
    for strace, scores in results:

        logger.info(f'Postprocessing: {strace.executable_repr}')

        try:
            # Get setup used for original executable tracing.
            setup = next(
                exe for exe in DOCKERFILE_EXECUTABLES
                if (exe.system == strace.system
                    and exe.executable == strace.executable
                    and exe.arguments == strace.arguments)).setup
            search_result = search.search_for_migration(strace,
                                                        scores,
                                                        setup=setup)
            if search_result is None:
                logger.warning(
                    f'No migrations were returned while postprocessing '
                    f'`{strace.executable_repr}`, it will not be included in '
                    f'results.')
            else:
                search_results.append(search_result)
        except Exception:
            logger.exception(
                f'Encountered exception while postprocessing '
                f'`{strace.executable_repr}`, it will not be included in '
                f'results')

    return search_results
Code Example #9
def loop(gui: GUI):
    while not gui.stopped:
        try:
            if 's0urce.io' not in hook.driver.current_url:
                hook.driver.get("http://s0urce.io")
            if gui.AutoTypeEnable.enabled:
                AutoType.call(gui.AutoTypeKeyDelay.value,
                              gui.AutoTypeReturnDelay.value)
            if gui.AutoHackMsgEnable.enabled and not AutoHackMsg.sent_msg:
                AutoHackMsg.call(gui.AutoHackMsgType.get_selected_index())
            """
            if gui.AutoTarget.enabled:
                AutoTarget.call(gui.TargetPriority.get_selected_index())
            """
            if gui.AutoPort.enabled and not AutoPort.port_clicked:
                AutoPort.call(gui.PortSelection.get_selected_index(),
                              gui.PortDelay.value)
        except selenium_exceptions.WebDriverException as e:
            logger.warning("Ignoring WebDriverException:{}".format(e))
Code Example #10
File: __init__.py Project: newcanopies/cartographer
def parse(*args, **kwargs) -> Generator[Strace, None, None]:
    """Parse straces.

    Yields
    ------
    Strace
        Parsed strace.
    """
    # Get all trace directories
    strace_dirs = list(OUTPUT_DIR.glob('*/'))

    # Do nothing if none exist.
    if not strace_dirs:
        return

    # Process each output.
    logger.info(f'Parsing traces for {COLLECTOR_NAME}...')
    for strace_dir in strace_dirs:

        # Get path to strace file.
        strace_file = strace_dir / 'strace.txt'
        if not strace_file.exists():
            logger.warning(f'No strace file for {strace_dir.name}')
            continue

        # Get path to metadata file.
        metadata_file = strace_dir / 'metadata.json'
        if not metadata_file.exists():
            logger.warning(f'No metadata file for {strace_dir.name}')
            continue

        # Load metadata.
        with open(metadata_file) as fd:
            metadata = json.load(fd)

        # Skip parsing if execution failed.
        if metadata['returncode']:
            logger.warning('Executable execution failed, skipping')
            continue

        # Parse.
        logger.info(f'Parsing {strace_dir.name}')
        yield (
            parser.parse(
                strace_file,
                system=metadata['system'],
                executable=metadata['executable'],
                arguments=metadata['arguments'],
                collector=COLLECTOR_NAME,
                collector_assigned_id=strace_dir.name,
                strace_file=strace_file,
            )
            .normalize()
        )

    logger.info('Done.')
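Since parse() is a generator, traces can be consumed lazily. A hedged usage sketch (the attribute names follow the parser.parse keywords above and are otherwise assumed):

    for strace in parse():
        print(strace.executable, strace.collector_assigned_id)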
Code Example #11
    def dispatch_response(self, payload, pool, dbi):
        """
    處理 fetcher 傳回之資料,調用 parse_response 解析其內容並儲存。

    輸入 payload 格式為 {
      'src': 'RESPONSE_BODY',
      'meta': {
        'feed_url': '',
        'pub_date': 'str'
      }
    }
    輸出為 {
      'html': lxml tree
    }

    @endpoint
    """
        import lxml.html
        from lib import logger, util, db
        from lib.util.dt import to_timestamp
        from lib.util.text import to_unicode

        if not payload:
            pool.log_stats('error_fetch')
            return

        try:
            payload['pub_ts'] = to_timestamp(payload['meta']['pub_date'])
        except KeyError:
            pass

        # dom tree preprocessing
        try:
            # lxml handles html encoding
            html = lxml.html.fromstring(payload['src'])
            # convert to unicode before storing
            payload['src'] = to_unicode(payload['src'])
        except:
            extra = {'classname': self.__class__}
            logger.warning("HTML parse error, url: %s",
                           payload['url_read'],
                           extra=extra)
            logger.info("Got: %s", payload['src'], extra=extra)
            pool.log_stats('error_parse')
            return

        # canonical url
        url_canonical = html.cssselect('link[rel=canonical]')
        payload['url_canonical'] = url_canonical[0].attrib['href'] \
          if len(url_canonical) > 0 else payload['url_read']

        # Remove the charset meta: content is guaranteed to be unicode, and keeping it could mislead the html parser
        tags = html.cssselect('meta[http-equiv=Content-Type]')
        if (len(tags) > 0):
            payload['meta']['Content-Type'] = tags[0].attrib['content']
            for x in tags:
                x.drop_tree()

        payload['html'] = html

        self.move_out_of_meta(payload, 'feed_url')

        article = self.parse_response(payload)

        if article:
            # parsed successfully
            self._decorate_article(article)
            db.save_article(article, dbi=dbi)
            pool.log_stats('done_article')
        else:
            # TODO: still write to the article table
            db.save_response(payload, dbi=dbi)
            pool.log_stats('error_parse')
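A minimal stand-alone sketch of the canonical-URL fallback above (requires lxml with the cssselect package installed; the URLs are made up):

    import lxml.html

    doc = lxml.html.fromstring(
        '<html><head><link rel="canonical" href="http://example.com/a">'
        '</head><body></body></html>')
    links = doc.cssselect('link[rel=canonical]')
    url_read = 'http://example.com/a?utm_source=feed'   # pretend fetch URL
    url_canonical = links[0].attrib['href'] if links else url_read
    print(url_canonical)  # http://example.com/a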
Code Example #12
File: __init__.py Project: newcanopies/cartographer
def collect_straces(untraced: Optional[list] = None):
    """Collect straces for all untraced executables.

    By default, collect_straces will collect straces for all executables in
    the `untraced_executables` table, provided that there is not a matching
    definition in the `executables` table that has at least one strace.

    Parameters
    ----------
    untraced : Optional[list]
        List of executable definitions. Each object must have `system`,
        `executable`, `arguments_hash`, and `arguments` attributes. They may
        also have a `setup` attribute, containing a shell script to be run in
        the trace container for setup prior to running the trace.

        If provided, it will be used as the source list of untraced
        executables. It will still be filtered to exclude those that already
        have a trace in the cartographer database.
    """
    # Get filtered executable definitions.
    # Convert to a list of RowProxy so we can use len.
    logger.info('Getting untraced executables...')
    untraced = _get_untraced_executables(subset=untraced)
    logger.info(f'{len(untraced)} unique untraced executables discovered.')

    # Clean output directory.
    if OUTPUT_DIR.exists():
        shutil.rmtree(OUTPUT_DIR)
    OUTPUT_DIR.mkdir(parents=True)

    # If there are no untraced executables, exit after cleaning the output
    # directory. This means that a subsequent call to parse will be a noop.
    if not untraced:
        return

    # Trace each executable.
    logger.info('Tracing executables...')
    for executable in untraced:

        # Unpack
        system = executable.system
        binary = executable.executable
        arguments_hash = executable.arguments_hash.hex()
        arguments = executable.arguments
        setup = getattr(executable, 'setup', '')

        # Get command string.
        cmd_str = shell.join([binary, *arguments])

        # Path for output files.
        strace_dir = (
            OUTPUT_DIR
            / f'{binary.strip("/").replace("/", "_")}_{arguments_hash}'
        )
        strace_dir.mkdir()
        metadata_file = strace_dir / 'metadata.json'

        # Trace.
        logger.info(
            f'Tracing {executable.executable} {executable.arguments} '
            f'({strace_dir.name}).'
        )
        logger.info(f'    Command string: {cmd_str}')
        try:
            subprocess.run(
                [
                    'docker', 'run', '--privileged', '--rm', '-it',
                    '-v', f'{strace_dir}:/traces', DOCKER_IMAGE,
                    cmd_str, 'strace.txt', setup
                ],
                capture_output=True,
                check=True
            )
            returncode = 0
        except subprocess.CalledProcessError as e:
            logger.warning(
                f'Trace failed.\n'
                f'    stdout: {e.stdout}\n'
                f'    stderr: {e.stderr}'
            )
            returncode = e.returncode

        # Write metadata.
        with open(metadata_file, 'w') as fd:
            json.dump(
                {
                    'returncode': returncode,
                    'system': system,
                    'executable': binary,
                    'arguments_hash': arguments_hash,
                    'arguments': arguments,
                },
                fd
            )

    logger.info('Done.')
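A worked example of the trace-directory naming scheme used above (the values are hypothetical):

    binary = '/usr/bin/apt-get'          # hypothetical executable
    arguments_hash = 'deadbeef'          # hex digest of the arguments
    print(f'{binary.strip("/").replace("/", "_")}_{arguments_hash}')
    # -> usr_bin_apt-get_deadbeef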
Code Example #13
def parse(output_dir: Path,
          start_at: str = None) -> Generator[Strace, None, None]:
    """Parse traces from an ansible playbook.

    Parameters
    ----------
    output_dir : Path
        Location of the collection output directory.
    start_at : str
        Dataset to start at.

    Yields
    ------
    Strace
        Parsed strace.
    """
    # Get the final output directory.
    output_dir = TRACE_DIR / output_dir
    output_dir_name = output_dir.stem

    logger.info(f'Parsing straces for {COLLECTOR_NAME} at {output_dir}')

    # Module output directories in output_dir
    module_directories = list(
        sorted((d for d in output_dir.glob('*') if d.is_dir()),
               key=lambda d: int(d.stem)))

    # Advance until finding the start at position
    if start_at is not None:
        while module_directories and module_directories[0].name != start_at:
            module_directories.pop(0)
        if not module_directories:
            logger.warning('Start-at skipped all traces.')

    # Process each module
    for module_dir in module_directories:

        # Read module metadata
        with open(module_dir / 'metadata.json') as metadata_fd:
            metadata = json.load(metadata_fd)

        logger.info(f'Parsing strace {metadata["index"]}: {metadata["name"]}')

        # Get execution result
        result = metadata['result']

        # Skip failed modules
        if result.get('rc', 0):
            logger.warning('Module execution failed, skipping.')
            continue

        # Warn if not changed. Still parsing these for now.
        stdout = result.get('stdout', None)
        if isinstance(stdout, dict) and not stdout.get('changed', False):
            logger.warning('Module execution did not change system state.')

        # Log arguments.
        arg_str = json.dumps(metadata["args"], indent=4, sort_keys=True)
        logger.info(f'Definition:\n{metadata["module"]} {arg_str}')

        # Get strace file path
        strace_file = module_dir / 'strace.txt'

        # Skip if an strace file is not available.
        # This can happen due to permissions issues in tracing.
        if not os.access(strace_file, os.R_OK):
            logger.warning('Cannot read strace file, skipping')
            continue

        # Parse
        logger.info('Parsing strace file...')
        strace = parser.parse(
            strace_file,
            system='ansible',
            executable=metadata['module'],
            arguments=metadata['args'],
            collector=COLLECTOR_NAME,
            collector_assigned_id=f'{output_dir_name}/{metadata["index"]}',
            strace_file=strace_file,
            metadata=metadata,
        )

        # Normalize
        logger.info('Normalizing strace...')
        strace = strace.normalize()

        # Log parsing completion and yield parsed and normalized trace
        logger.info('Done')
        yield strace
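A hedged usage sketch: resume parsing of a prior run at a given module directory (the directory names are hypothetical; output_dir is resolved relative to TRACE_DIR inside parse):

    from pathlib import Path

    for strace in parse(Path('ansible_run_01'), start_at='42'):
        print(strace.executable)  # attribute name assumed from parser.parse above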
Code Example #14
from lib import logger
import lib
import sys


logger.error('param:%s', sys.argv[0])


logger.error('Hello')
logger.info('creating an instance of auxiliary_module.Auxiliary')
logger.error('created an instance of auxiliary_module.Auxiliary')
logger.info('calling auxiliary_module.Auxiliary.do_something')
logger.debug('finished auxiliary_module--989-')
logger.warning('calling auxiliary_module.some_function()')
for s in range(1):
    logger.info('This is message %d' % s)
logger.info('done with auxiliary_module.some_function()')
logger.debug('This is debug log.')
logger.info('---------------------\r\n')

def test():
    logger.error('Shuo dian sha hao ne?')
    return
def hi():
    test()
    return

print "1"

Code Example #15
File: base.py Project: dehao/news-diff
  def dispatch_response(self, payload, pool, dbi):
    """
    處理 fetcher 傳回之資料,調用 parse_response 解析其內容並儲存。

    輸入 payload 格式為 {
      'src': 'RESPONSE_BODY',
      'meta': {
        'feed_url': '',
        'pub_date': 'str'
      }
    }
    輸出為 {
      'html': lxml tree
    }

    @endpoint
    """
    import lxml.html
    from lib import logger, util, db
    from lib.util.dt import to_timestamp
    from lib.util.text import to_unicode

    try: payload['pub_ts'] = to_timestamp(payload['meta']['pub_date'])
    except KeyError: pass

    # dom tree preprocessing
    try:
      html = lxml.html.fromstring(payload['src']) # lxml handles html encoding
      payload['src'] = to_unicode(payload['src']) # convert to unicode before storing
    except:
      extra = {'classname': self.__class__}
      logger.warning("HTML parse error, url: %s", payload['url_read'], extra=extra)
      logger.info("Got: %s", payload['src'], extra=extra)
      pool.log_stats('error_parse')
      return

    # canonical url
    url_canonical = html.cssselect('link[rel=canonical]')
    payload['url_canonical'] = url_canonical[0].attrib['href'] \
      if len(url_canonical) > 0 else payload['url_read']

    # Remove the charset meta: content is guaranteed to be unicode, and keeping it could mislead the html parser
    tags = html.cssselect('meta[http-equiv=Content-Type]')
    if (len(tags) > 0):
      payload['meta']['Content-Type'] = tags[0].attrib['content']
      for x in tags: x.drop_tree()

    payload['html'] = html

    self.move_out_of_meta(payload, 'feed_url')

    article = self.parse_response(payload)

    if article:
      # parsed successfully
      self._decorate_article(article)
      db.save_article(article, dbi = dbi)
      pool.log_stats('done_article')
    else:
      # TODO: still write to the article table
      db.save_response(payload, dbi = dbi)
      pool.log_stats('error_parse')