Ejemplo n.º 1
0
    def createGFSChunkSubDirs(self, reset_index=0):
        """
    Creates GFS subdirectories for base indexer and anchor processing
    Args:
      reset_index: 1, if this is for resetting index. Some dirs may exist
                   0, otherwise.
    Return:
      true  - successful
      false - otherwise
    """

        ##
        def execMultiTimes(self, to_exec, max_tries=10):
            """ exec a gfs command by trying multiple times

            Logs every failed attempt and waits 10 seconds between
            attempts (but not after the final failure).
            """
            try_num = 0
            while try_num < max_tries:
                err, out = E.run_fileutil_command(self.globalParams, to_exec)
                if E.ERR_OK == err:
                    return true
                # Log the failure right away, then back off before retrying.
                # Skip the pointless 10s sleep once no tries remain.
                logging.error("Error executing %s" % to_exec)
                try_num = try_num + 1
                if try_num < max_tries:
                    time.sleep(10)
            return false

        ##

        def createGFSDir(self, dir_name, reset_index):
            """ create a GFS Dir.

      if this is for reset index, the dir may already
      exist. So just try it once and ignore any error.
      Args:
        dir_name: '/gfs/ent/feeds'
        reset_index: 1 - during reset index
                     0 - otherwise
      Return:
        true  - successful
        false - otherwise
      """
            if reset_index:
                # Dir may already exist; one attempt, errors ignored.
                execMultiTimes(self, "mkdir -p %s" % dir_name, max_tries=1)
                return true
            else:
                return execMultiTimes(self, "mkdir -p %s" % dir_name)

        num_shards = self.globalParams.GetEntNumShards()
        namespace_prefix = self.getGlobalParam('NAMESPACE_PREFIX')
        pr_namespace_prefix = self.getGlobalParam(
            'OUTPUT_NAMESPACE_PREFIX')['pr_main']

        # This also waits for the gfs master to come up.
        if not execMultiTimes(
                self, "mkdir -p %s" % E.normpath("%s/tmp" % namespace_prefix),
                30):
            return false

        # Create the pr_main subdir for pr_main to write to.
        if not execMultiTimes(
                self, "mkdir -p %s" %
                E.normpath("%s/pr_main" % pr_namespace_prefix), 30):
            return false

        # Create FEEDS_DIR (feeds), a directory where uploaded feeds will get stored
        if not createGFSDir(self, self.getGlobalParam('FEEDS_DIR'),
                            reset_index):
            return false

        # Create FEED_STATUS_DIR (feedstatus), a directory to store feed status
        if not createGFSDir(self, self.getGlobalParam('FEED_STATUS_DIR'),
                            reset_index):
            return false

        # Create SSO_LOG_DIR, a directory to store sso logging output files
        if not createGFSDir(self, self.getGlobalParam('SSO_LOG_DIR'),
                            reset_index):
            return false

        # Create LOG_REPORT_DIR (log_report), a directory to
        # store raw and summary reports
        if not createGFSDir(self, self.getGlobalParam('LOG_REPORT_DIR'),
                            reset_index):
            return false

        # Create CRAWLQUEUE_DIR (crawlqueue), a directory to store crawlqueue data
        if not createGFSDir(self, self.getGlobalParam('CRAWLQUEUE_DIR'),
                            reset_index):
            return false

        # Create SYSLOG_CHECKPOINTS_DIR (syslog_checkpoints)
        # a directory to store syslog checkpoint files
        if not createGFSDir(self,
                            self.getGlobalParam('SYSLOG_CHECKPOINTS_DIR'),
                            reset_index):
            return false

        # Create directories for base_indexer, global_anchor and
        # global-link, one per index shard.
        dirs = [
            'base-indexer', 'global-anchor', 'global-link',
            'urlhistory-processor'
        ]
        for i in range(num_shards):
            for dir in dirs:
                if not execMultiTimes(
                        self, "mkdir -p %s%s%03d-of-%03d" %
                    (namespace_prefix, dir, i, num_shards)):
                    return false

            # Inside the base-indexer directories we create directories for
            # global anchor command  logs
            for j in range(num_shards):
                if not execMultiTimes(
                        self,
                        "mkdir -p %sbase-indexer%03d-of-%03d/global-anchor%03d-of-%03d"
                        % (namespace_prefix, i, num_shards, j, num_shards)):
                    return false

        num_tracker_shards = self.globalParams.GetNumShards(
            'urltracker_server')
        for i in range(num_tracker_shards):
            if not execMultiTimes(
                    self, "mkdir -p %surltracker%03d-of-%03d" %
                (namespace_prefix, i, num_tracker_shards)):
                return false

        return true
Ejemplo n.º 2
0
def get_top_dir(rootdir):
  # Normalize the caller-supplied root first, then append the standard
  # enterprise data mount point and normalize the combined path.
  clean_root = os.path.normpath(rootdir)
  return E.normpath(clean_root + '/export/hda3')
Ejemplo n.º 3
0
def _TopLevelDirsToRemove(cfg,
                          dir_root,
                          gfs_aliases=None,
                          datadir=None,
                          unrecoverable_dirs=None):
    """ top level dirs/files under dir_root

  Trying to find out all the dirs/files under dir_root to be removed, given
  unrecoverable_dirs.
  Note:
    fileutil has different behavior for gfs, bigfile, and local files.
    Sometimes, the "fileuitl ls" will give errors without the "-a" flag.
    However, with the "-a" flag, some unwanted directories are also returned.
    So the idea here is to first try without the "-a" flag. If it fails,
    try the "-a" flag. For every dir it returns, make sure it is under
    dir_root.
  Parameters:
    cfg: entconfig.EntConfig(entHome, 1)
    dir_root: '/gfs/ent/'
    gfs_aliases: 'ent=master.ent.gfs.ent4-6-0-G-27.ls.google.com:3830'
                  -- for gfs only
    datadir: '/export/hda3/4.6.0.G.27/data/enterprise-data'
             -- for bigfile only
    unrecoverable_dirs: None (treated as [])
                        -- introduced for unittests.
  Returns:
    ['/bigfile/pr_stats_shard000', '/bigfile/pr_test_00_00', ...]
  """

    # The previous mutable default ([]) was extended in place below, so
    # entries leaked across calls. Work on a private copy instead.
    if unrecoverable_dirs is None:
        unrecoverable_dirs = []
    else:
        unrecoverable_dirs = list(unrecoverable_dirs)
    # get all the top level GFS dirs
    if gfs_aliases is not None:
        unrecoverable_dirs.extend(_UnrecoverableGFSDirs(cfg))
    if dir_root not in unrecoverable_dirs:
        unrecoverable_dirs.append(dir_root)
    # first try without the "-a" option. If it does not work, try the "-a" option
    cmd_args = _ComposeFileutilArgs(gfs_aliases, datadir)
    fileutil_args = ['', '-a']
    timeout = 20 * 60
    top_level_dirs_to_remove = []
    for fileutil_arg in fileutil_args:
        # unfortunately, fileutil in 4.6.4 works differently on gfs and bigfile.
        # it has to use "/gfs/ent/" and "/bigfile/*".
        if gfs_aliases is not None:
            cmd = ('fileutil %s ls %s %s' % (cmd_args, fileutil_arg, dir_root))
        else:
            cmd = ('fileutil %s ls %s %s*' %
                   (cmd_args, fileutil_arg, dir_root))
        out = []
        # Pass the (doubling) timeout through; it was previously computed
        # but never used, leaving the command pinned at a fixed timeout.
        status = _ExecuteCommand(cmd,
                                 timeout=timeout,
                                 out=out,
                                 error_filter=_NoMatchOk)
        if status == 0 and len(out) >= 1:
            top_level_dirs = out[0].splitlines()
            # don't remove unrecoverable dirs. Also noise may exist in the result.
            for entry in top_level_dirs:
                if not entry.endswith('..'):
                    entry = E.normpath(entry)
                    if (entry.startswith(dir_root)
                            and _IsRecoverable(entry, unrecoverable_dirs)):
                        top_level_dirs_to_remove.append(entry)
            if len(top_level_dirs_to_remove) > 0:
                break
            else:
                # Nothing found; retry with "-a" and a doubled timeout.
                timeout *= 2
    logging.info('Reset Index: Top Level Dirs to Remove:')
    logging.info('      %s' % top_level_dirs_to_remove)
    logging.flush()
    return top_level_dirs_to_remove
Ejemplo n.º 4
0
def _TopLevelDirsToRemove(cfg, dir_root, gfs_aliases=None, datadir=None,
                          unrecoverable_dirs=None):
  """ top level dirs/files under dir_root

  Trying to find out all the dirs/files under dir_root to be removed, given
  unrecoverable_dirs.
  Note:
    fileutil has different behavior for gfs, bigfile, and local files.
    Sometimes, the "fileuitl ls" will give errors without the "-a" flag.
    However, with the "-a" flag, some unwanted directories are also returned.
    So the idea here is to first try without the "-a" flag. If it fails,
    try the "-a" flag. For every dir it returns, make sure it is under
    dir_root.
  Parameters:
    cfg: entconfig.EntConfig(entHome, 1)
    dir_root: '/gfs/ent/'
    gfs_aliases: 'ent=master.ent.gfs.ent4-6-0-G-27.ls.google.com:3830'
                  -- for gfs only
    datadir: '/export/hda3/4.6.0.G.27/data/enterprise-data'
             -- for bigfile only
    unrecoverable_dirs: None (treated as [])
                        -- introduced for unittests.
  Returns:
    ['/bigfile/pr_stats_shard000', '/bigfile/pr_test_00_00', ...]
  """

  # The previous mutable default ([]) was extended in place below, so
  # entries leaked across calls. Work on a private copy instead.
  if unrecoverable_dirs is None:
    unrecoverable_dirs = []
  else:
    unrecoverable_dirs = list(unrecoverable_dirs)
  # get all the top level GFS dirs
  if gfs_aliases is not None:
    unrecoverable_dirs.extend(_UnrecoverableGFSDirs(cfg))
  if dir_root not in unrecoverable_dirs:
    unrecoverable_dirs.append(dir_root)
  # first try without the "-a" option. If it does not work, try the "-a" option
  cmd_args = _ComposeFileutilArgs(gfs_aliases, datadir)
  fileutil_args = ['', '-a']
  timeout = 20*60
  top_level_dirs_to_remove = []
  for fileutil_arg in fileutil_args:
    # unfortunately, fileutil in 4.6.4 works differently on gfs and bigfile.
    # it has to use "/gfs/ent/" and "/bigfile/*".
    if gfs_aliases is not None:
      cmd = ('fileutil %s ls %s %s' % (cmd_args, fileutil_arg, dir_root))
    else:
      cmd = ('fileutil %s ls %s %s*' % (cmd_args, fileutil_arg, dir_root))
    out = []
    # Pass the (doubling) timeout through; it was previously computed but
    # never used, leaving the command pinned at a fixed timeout.
    status = _ExecuteCommand(cmd, timeout=timeout, out=out,
                             error_filter=_NoMatchOk)
    if status == 0 and len(out) >= 1:
      top_level_dirs = out[0].splitlines()
      # don't remove unrecoverable dirs. Also noise may exist in the result.
      for entry in top_level_dirs:
        if not entry.endswith('..'):
          entry = E.normpath(entry)
          if (entry.startswith(dir_root) and
              _IsRecoverable(entry, unrecoverable_dirs)):
            top_level_dirs_to_remove.append(entry)
      if len(top_level_dirs_to_remove) > 0:
        break
      else:
        # Nothing found; retry with "-a" and a doubled timeout.
        timeout *= 2
  logging.info('Reset Index: Top Level Dirs to Remove:')
  logging.info('      %s' % top_level_dirs_to_remove)
  logging.flush()
  return top_level_dirs_to_remove
Ejemplo n.º 5
0
  def createGFSChunkSubDirs(self, reset_index=0):
    """
    Creates GFS subdirectories for base indexer and anchor processing
    Args:
      reset_index: 1, if this is for resetting index. Some dirs may exist
                   0, otherwise.
    Return:
      true  - successful
      false - otherwise
    """
    ##
    def execMultiTimes(self, to_exec, max_tries=10):
      """ exec a gfs command by trying multiple times

      Logs every failed attempt and waits 10 seconds between attempts
      (but not after the final failure).
      """
      try_num = 0
      while try_num < max_tries:
        err, out = E.run_fileutil_command(self.globalParams, to_exec)
        if E.ERR_OK == err:
          return true
        # Log the failure right away, then back off before retrying.
        # Skip the pointless 10s sleep once no tries remain.
        logging.error("Error executing %s" % to_exec)
        try_num = try_num + 1
        if try_num < max_tries:
          time.sleep(10)
      return false
    ##

    def createGFSDir(self, dir_name, reset_index):
      """ create a GFS Dir.

      if this is for reset index, the dir may already
      exist. So just try it once and ignore any error.
      Args:
        dir_name: '/gfs/ent/feeds'
        reset_index: 1 - during reset index
                     0 - otherwise
      Return:
        true  - successful
        false - otherwise
      """
      if reset_index:
        # Dir may already exist; one attempt, errors ignored.
        execMultiTimes(self, "mkdir -p %s" % dir_name, max_tries=1)
        return true
      else:
        return execMultiTimes(self, "mkdir -p %s" % dir_name)

    num_shards = self.globalParams.GetEntNumShards()
    namespace_prefix = self.getGlobalParam('NAMESPACE_PREFIX')
    pr_namespace_prefix = self.getGlobalParam('OUTPUT_NAMESPACE_PREFIX')['pr_main']

    # This also waits for the gfs master to come up.
    if not execMultiTimes(self, "mkdir -p %s" % E.normpath(
      "%s/tmp" % namespace_prefix), 30):
      return false

    # Create the pr_main subdir for pr_main to write to.
    if not execMultiTimes(self, "mkdir -p %s" % E.normpath(
      "%s/pr_main" % pr_namespace_prefix), 30):
      return false

    # Create FEEDS_DIR (feeds), a directory where uploaded feeds will get stored
    if not createGFSDir(self, self.getGlobalParam('FEEDS_DIR'), reset_index):
      return false

    # Create FEED_STATUS_DIR (feedstatus), a directory to store feed status
    if not createGFSDir(self, self.getGlobalParam('FEED_STATUS_DIR'),
                        reset_index):
      return false

    # Create SSO_LOG_DIR, a directory to store sso logging output files
    if not createGFSDir(self, self.getGlobalParam('SSO_LOG_DIR'),
                        reset_index):
      return false

    # Create LOG_REPORT_DIR (log_report), a directory to
    # store raw and summary reports
    if not createGFSDir(self, self.getGlobalParam('LOG_REPORT_DIR'),
                        reset_index):
      return false

    # Create CRAWLQUEUE_DIR (crawlqueue), a directory to store crawlqueue data
    if not createGFSDir(self, self.getGlobalParam('CRAWLQUEUE_DIR'),
                        reset_index):
      return false

    # Create SYSLOG_CHECKPOINTS_DIR (syslog_checkpoints)
    # a directory to store syslog checkpoint files
    if not createGFSDir(self, self.getGlobalParam('SYSLOG_CHECKPOINTS_DIR'),
                        reset_index):
      return false

    # Create directories for base_indexer, global_anchor and
    # global-link, one per index shard.
    dirs = ['base-indexer', 'global-anchor', 'global-link', 'urlhistory-processor']
    for i in range(num_shards):
      for dir in dirs:
        if not execMultiTimes(self, "mkdir -p %s%s%03d-of-%03d" % (
          namespace_prefix, dir, i, num_shards)):
          return false

      # Inside the base-indexer directories we create directories for
      # global anchor command  logs
      for j in range(num_shards):
        if not execMultiTimes(
          self,
          "mkdir -p %sbase-indexer%03d-of-%03d/global-anchor%03d-of-%03d" % (
          namespace_prefix, i, num_shards, j, num_shards)):
          return false

    num_tracker_shards = self.globalParams.GetNumShards('urltracker_server')
    for i in range(num_tracker_shards):
      if not execMultiTimes(self,
                            "mkdir -p %surltracker%03d-of-%03d" % (
        namespace_prefix, i, num_tracker_shards)):
        return false

    return true