Example #1
def check_path(path, verbose=False, plugin=True, xof=True):
    """
    If plugin is True, we want to log and store, which tcc_report does by
    default so we leave those flags alone.

    If plugin is False, we're interactive and we want to write any report to
    stdout. However, we only make a report if 1) verbose is True, or 2) the
    counts don't match.
    """
    cosinfo = get_cos_info()
    nsobj = path_nsobject(path)
    try:
        bfl = get_bitfile_set(int(nsobj), 1)
    except U.HpssicError as e:
        if plugin:
            CrawlConfig.log(e.value)
            return
        elif xof:
            raise SystemExit(e.value)
        else:
            raise U.HpssicError(e.value)

    bf = U.pop0(bfl)
    sc_count = int(bf['SC_COUNT'])
    cos_count = int(cosinfo[bf['BFATTR_COS_ID']])

    if plugin and sc_count != cos_count:
        tcc_report(bf, path=path)
    elif not plugin and (verbose or sc_count != cos_count):
        print(tcc_report(bf, path=path, log=False, store=False))
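
A hypothetical pair of calls showing the two modes the docstring describes
(the path is illustrative and assumes a working hpssic environment):

check_path('/hpss/some/file')                # plugin mode: log and store
check_path('/hpss/some/file', verbose=True,  # interactive mode: print the
           plugin=False, xof=False)          # report; raise HpssicError on
                                             # lookup failure instead of
                                             # exiting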
Example #2
    def verify(self, h):
        """
        Attempt to verify the current file.
        """
        CrawlConfig.log("hsi(%d) attempting to verify %s" %
                        (h.pid(), self.path))
        rsp = h.hashverify(self.path)

        if "TIMEOUT" in rsp or "ERROR" in rsp:
            rval = "skipped"
            self.set('fails', self.fails + 1)
            CrawlConfig.log(
                "hashverify transfer incomplete on %s -- skipping" % self.path)
            h.quit()
        elif "%s: (md5) OK" % self.path in rsp:
            rval = "matched"
            CrawlConfig.log("hashverify matched on %s" % self.path)
        elif "no valid checksum found" in rsp:
            if self.addable(self.cos):
                rval = self.add_to_sample(h)
            else:
                self.set('checksum', 0)
                rval = "skipped"
                CrawlConfig.log("hashverify skipped %s" % self.path)
        else:
            rval = Alert.Alert("Checksum mismatch: %s" % rsp)
            CrawlConfig.log("hashverify generated 'Checksum mismatch' " +
                            "alert on %s" % self.path)
        return rval
Example #3
def tcc_report(bitfile, cosinfo=None, path=None, log=True, store=True):
    """
    The bitfile appears not to have the right number of copies. We're going
    to write its information out to a report for manual followup.
    """
    if cosinfo is None:
        cosinfo = get_cos_info()
    fmt = "%7s %8s %8s %s"
    hdr = fmt % ("COS", "Ccopies", "Fcopies", "Filepath")

    # Compute the bitfile's path
    if path is None:
        bfp = get_bitfile_path(bitfile['BFID'])
    else:
        bfp = path
    rpt = fmt % (bitfile['BFATTR_COS_ID'],
                 str(cosinfo[bitfile['BFATTR_COS_ID']]),
                 str(bitfile['SC_COUNT']),
                 bfp)
    if log:
        CrawlConfig.log(rpt)
    if store:
        try:
            tcc_report._f.write(rpt + "\n")
            tcc_report._f.flush()
        except AttributeError:
            cfg = CrawlConfig.get_config()
            rptfname = cfg.get(sectname(), 'report_file')
            tcc_report._f = open(rptfname, 'a')
            tcc_report._f.write(hdr + "\n")
            tcc_report._f.write(rpt + "\n")
            tcc_report._f.flush()
    return rpt
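
tcc_report keeps its report file open across calls by stashing the handle on
the function object itself and letting the first AttributeError trigger lazy
initialization. A minimal, standalone sketch of the same idiom (the filename
is illustrative):

def report(line, filename="report.txt"):
    try:
        report._f.write(line + "\n")
    except AttributeError:
        # first call: no cached handle yet, so open one and retry
        report._f = open(filename, "a")
        report._f.write(line + "\n")
    report._f.flush()

report("first line")    # opens the file and caches the handle
report("second line")   # reuses report._f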
Example #4
def simplug(plugin, args):
    """
    Common plugin simulator. May be used by the interactive tools to simulate
    running the associated plugin.
    """
    p = optparse.OptionParser()
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-i', '--iterations',
                 action='store', default=1, dest='iterations', type='int',
                 help='how many iterations to run')
    (o, a) = p.parse_args(args)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    CrawlConfig.log("starting %s simplug, just got config" % plugin)
    sys.path.append(cfg.get('crawler', 'plugin-dir'))
    modname = cfg.get(plugin, 'module')
    try:
        P = __import__(modname)
    except ImportError:
        H = __import__('hpssic.plugins.' + modname)
        P = getattr(H.plugins, modname)
    P.main(cfg)
    if 1 < o.iterations:
        for count in range(o.iterations-1):
            stime = cfg.get_time(plugin, 'frequency')
            time.sleep(stime)
            P.main(cfg)
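
simplug falls back from a top-level import to the package-qualified module
and then digs the leaf module out of the package with getattr. A sketch of
the same fallback using importlib.import_module, which returns the leaf
module directly:

import importlib

def load_plugin(modname, package='hpssic.plugins'):
    try:
        return importlib.import_module(modname)
    except ImportError:
        # not importable on its own; try the package-qualified name
        return importlib.import_module(package + '.' + modname)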
Example #5
def crl_cfgdump(argv):
    """cfgdump - load a config file and dump its contents

    usage: crawl cfgdump -c <filename> [--to stdout|log] [--logpath <path>]
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-t', '--to',
                 action='store', default='', dest='target',
                 help='specify where to send the output')
    p.add_option('-l', '--logpath',
                 action='store', default='', dest='logpath',
                 help='specify where to send the output')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    if o.target == '':
        o.target = 'stdout'

    cfg = CrawlConfig.get_config(o.config)
    dumpstr = cfg.dump()

    if o.target == 'stdout':
        print(dumpstr)
    elif o.target == 'log':
        log = CrawlConfig.log(logpath=o.logpath, cfg=cfg)
        for line in dumpstr.split("\n"):
            CrawlConfig.log(line)
Example #6
def crl_log(argv):
    """log - write a message to the indicated log file

    usage: crawl log --log <filename> <message>
    """
    p = optparse.OptionParser()
    p.add_option('-d',
                 '--debug',
                 action='store_true',
                 default=False,
                 dest='debug',
                 help='run the debugger')
    p.add_option('-l',
                 '--log',
                 action='store',
                 default=None,
                 dest='logfile',
                 help='specify the log file')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config()
    CrawlConfig.log(" ".join(a), logpath=o.logfile, cfg=cfg)
Example #7
def record_checked_ids(cfg, low, high, correct, error):
    """
    Save checked NSOBJECT ids in the HPSSIC database.

    If we check a range and get no hits (i.e., no NSOBJECT ids exist in the
    range), we'll store

       (<time>, <low-id>, <high-id>, 0, 0)

    If we get a hit with the right copy count, we store it by itself as

       (<time>, <hit-id>, <hit-id>, 1, 0)

    If we get a hit with the wrong copy count, we store it by itself as

       (<time>, <hit-id>, <hit-id>, 0, 1)
    """
    tabname = cfg.get(sectname(), 'table_name')

    result = dbschem.make_table(tabname)
    ts = int(time.time())
    CrawlConfig.log("recording checked ids %d to %d at %d" % (low, high, ts))
    db = CrawlDBI.DBI(dbtype="crawler")
    db.insert(table=tabname,
              fields=['check_time', 'low_nsobj_id', 'high_nsobj_id',
                      'correct', 'error'],
              data=[(ts, low, high, correct, error)])
    db.close()
Example #8
def crl_start(argv):
    """start - if the crawler is not already running as a daemon, start it

    usage: crawl start

    default config file: crawl.cfg, or
                         $CRAWL_CONF, or
                         -c <filename> on command line
    default log file:    /var/log/crawl.log, or
                         $CRAWL_LOG, or
                         -l <filename> on command line
    """
    p = optparse.OptionParser()
    p.add_option('-c', '--cfg',
                 action='store', default='', dest='config',
                 help='config file name')
    p.add_option('-d', '--debug',
                 action='store_true', default=False, dest='debug',
                 help='run the debugger')
    p.add_option('-l', '--log',
                 action='store', default='', dest='logfile',
                 help='specify the log file')
    p.add_option('-C', '--context',
                 action='store', default='', dest='context',
                 help="context of crawler ('TEST' or 'PROD')")
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config(o.config)

    #
    # Initialize the configuration
    #
    if o.context != '':
        cfg.set('crawler', 'context', o.context)
    try:
        exitpath = cfg.get('crawler', 'exitpath')
    except CrawlConfig.NoOptionError as e:
        print("No exit path is specified in the configuration")
        sys.exit(1)

    vstr = "HPSS Integrity Crawler version %s" % version.__version__
    log = CrawlConfig.log(vstr, logpath=o.logfile, cfg=cfg)
    pfpath = make_pidfile(os.getpid(),
                          cfg.get('crawler', 'context'),
                          exitpath,
                          just_check=True)
    crawler = CrawlDaemon(pfpath,
                          stdout="crawler.stdout",
                          stderr="crawler.stderr",
                          logger=log,
                          workdir='.')
    CrawlConfig.log('crl_start: calling crawler.start()')
    crawler.start()
Example #9
    def load_recheck_list(cls, how_many):
        """
        Look to see whether any of the already checksummed items in the
        database have a last check time over the threshold for rechecking. If
        so, we'll shove some of them to the front of the list based on the
        configuration.
        """
        cfg = CrawlConfig.add_config()
        r_fraction = float(cfg.get_d('cv', 'recheck_fraction', '0.0'))
        r_age = cfg.get_time('cv', 'recheck_age', 365 * 24 * 3600)
        threshold = int(time.time() - r_age)
        CrawlConfig.log("threshold = %s (%d)", U.ymdhms(threshold), threshold)
        if r_fraction == 0.0:
            return []

        limit = round(r_fraction * how_many)

        db = CrawlDBI.DBI(dbtype='crawler')
        kw = {'table': 'checkables',
              'fields': ['rowid', 'path', 'type', 'cos', 'cart', 'ttypes',
                         'checksum', 'last_check', 'fails', 'reported'],
              'where': 'checksum <> 0 and last_check < %d' % threshold,
              'orderby': 'last_check',
              'limit': limit}

        rows = db.select(**kw)
        db.close()

        rval = []
        for row in rows:
            tmp = list(row)
            new = Checkable(rowid=tmp.pop(0),
                            path=tmp.pop(0),
                            type=tmp.pop(0),
                            cos=tmp.pop(0),
                            cart=tmp.pop(0),
                            ttypes=tmp.pop(0),
                            checksum=tmp.pop(0),
                            last_check=tmp.pop(0),
                            fails=tmp.pop(0),
                            reported=tmp.pop(0),
                            in_db=True,
                            dirty=False)
            rval.append(new)
        return rval
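
The chain of tmp.pop(0) calls depends on the constructor's keyword order
matching the select's field order. An equivalent construction, assuming
Checkable accepts the selected field names as keywords (which the code above
implies):

fields = ['rowid', 'path', 'type', 'cos', 'cart', 'ttypes',
          'checksum', 'last_check', 'fails', 'reported']
rval = [Checkable(in_db=True, dirty=False, **dict(zip(fields, row)))
        for row in rows]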
Example #10
    def populate_cart(self, h):
        """
        Fill in the cart field
        """
        rsp = h.lsP(self.path)
        tmp = Checkable.fdparse(rsp.split("\n")[1])
        try:
            self.cart = tmp.cart
        except AttributeError:
            self.cart = ''
            CrawlConfig.log("%s <- Checkable.fdparse('%s')" %
                            (tmp, rsp.split("\n")[1]))
Example #11
    def fire(self):
        """
        Run the plugin.
        """
        if self.firable:
            CrawlConfig.log("%s: firing" % self.name)
            # sys.modules[self.modname].main(self.cfg)
            errors = self.plugin.main(self.cfg)
            self.last_fired = time.time()
            crawl_sublib.record_history(self.name, self.last_fired, errors)
        elif self.cfg.getboolean('crawler', 'verbose'):
            CrawlConfig.log("%s: not firable" % self.name)
            self.last_fired = time.time()
Example #12
    def add_to_sample(self, hsi, already_hashed=False):
        """
        Add the current Checkable to the sample. If already_hashed is True,
        this is a file for which a checksum has already been computed. We just
        need to record that fact by setting its checksum member to 1 and
        updating the sample count.

        If already_hashed is False, we need to carry out the following steps:

         1) run hashcreate on the file
         2) set checksum to non-zero to record that we have a checksum
         3) update the sample count in the Dimension object
        """
        if not already_hashed:
            CrawlConfig.log("starting hashcreate on %s", self.path)
            rsp = hsi.hashcreate(self.path)
            if "TIMEOUT" in rsp or "ERROR" in rsp:
                CrawlConfig.log("hashcreate transfer failed on %s", self.path)
                hsi.quit()
                self.set('fails', self.fails + 1)
                return "skipped"
            elif "Access denied" in rsp:
                CrawlConfig.log("hashcreate failed with 'access denied' on %s",
                                self.path)
                hsi.quit()
                return "access denied"
            else:
                CrawlConfig.log("completed hashcreate on %s", self.path)

        if self.checksum == 0:
            for dn in self.dim:
                cat = getattr(self, dn)
                self.dim[dn].addone(cat)
            self.set('checksum', 1)
            return "checksummed"
Example #13
def crl_fire(argv):
    """fire - run a plugin

    usage: crawl fire --cfg cfgname --logpath logfname --plugin plugname
    """
    p = optparse.OptionParser()
    p.add_option('-c',
                 '--cfg',
                 action='store',
                 default='',
                 dest='config',
                 help='config file name')
    p.add_option('-d',
                 '--debug',
                 action='store_true',
                 default=False,
                 dest='debug',
                 help='run the debugger')
    p.add_option('-l',
                 '--logpath',
                 action='store',
                 default='',
                 dest='logpath',
                 help='specify where to send the output')
    p.add_option('-p',
                 '--plugin',
                 action='store',
                 default='',
                 dest='plugname',
                 help='which plugin to fire')
    (o, a) = p.parse_args(argv)

    if o.debug:
        pdb.set_trace()

    cfg = CrawlConfig.get_config(o.config)
    CrawlConfig.log(logpath=o.logpath, cfg=cfg)

    if o.plugname == '':
        print("'-p <plugin-name>' is required")
    elif not cfg.has_section(o.plugname):
        print("No plugin named '%s' found in configuration" % o.plugname)
    else:
        plugdir = cfg.get('crawler', 'plugin-dir')
        sys.path.append(plugdir)
        __import__(o.plugname)
        CrawlConfig.log('firing %s', o.plugname)
        sys.modules[o.plugname].main(cfg)
Example #14
def highest_nsobject_id():
    """
    Cache and return the largest NSOBJECT id in the DB2 database. The variables
    highest_nsobject_id._max_obj_id and highest_nsobject_id._when are local to
    this function but do not lose their values between invocations.
    """
    if (not hasattr(highest_nsobject_id, '_max_obj_id')
            or not hasattr(highest_nsobject_id, '_when')
            or 60 < time.time() - highest_nsobject_id._when):
        highest_nsobject_id._max_obj_id = max_nsobj_id()
        highest_nsobject_id._when = time.time()
        CrawlConfig.log("max object id = %d at %s" %
                        (highest_nsobject_id._max_obj_id,
                         util.ymdhms(highest_nsobject_id._when)))

    rval = highest_nsobject_id._max_obj_id
    return rval
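
highest_nsobject_id caches its result as attributes on the function object
and refreshes it once the cached value is more than 60 seconds old. The same
idiom in a standalone, generic form:

import time

def cached_value(compute, ttl=60):
    now = time.time()
    if (not hasattr(cached_value, '_when')
            or ttl < now - cached_value._when):
        # cache is missing or stale: recompute and remember when
        cached_value._value = compute()
        cached_value._when = now
    return cached_value._value

print(cached_value(lambda: "expensive result"))  # computed
print(cached_value(lambda: "expensive result"))  # served from cache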
Example #15
def get_last_rpt_time(db):
    """
    Retrieve the last report time from the report table. If the table does not
    exist before make_table ('Created' in result), the table is empty so we
    just return 0 to indicate no last report time.
    """
    result = dbschem.make_table("report")
    if "Created" in result:
        rval = 0
    else:
        rows = db.select(table='report', fields=['max(report_time)'])
        rval = rows[0][0]
        if rval is None:
            rval = 0

    CrawlConfig.log("time of last report: %d" % rval)
    return rval
Example #16
    def hashcreate(self, pathnames):
        """
        Argument pathnames should reference one or more files. It may be a
        string containing one or more space separated file paths, or a list of
        one or more file paths. If it has type unicode, it will be encoded to
        'ascii' before being treated as a string.
        """
        if type(pathnames) == str:
            pathlist = pathnames.split()
        elif type(pathnames) == list:
            pathlist = pathnames
        elif type(pathnames) == unicode:
            pathlist = pathnames.encode('ascii', 'ignore').split()
        else:
            raise HSIerror("%s: Invalid argument (%s: '%s')" %
                           (util.my_name(), type(pathnames), pathnames))
        rval = ""
        for path in pathlist:
            if self.reset_atime:
                prev_time = self.access_time(path)

            if self.hash_algorithm is None:
                cmd = "hashcreate %s" % path
            else:
                cmd = "hashcreate -H %s %s" % (self.hash_algorithm, path)
            self.xobj.sendline(cmd)
            which = self.xobj.expect([self.prompt, pexpect.TIMEOUT] +
                                     self.hsierrs)
            while which == 1 and 1 < len(self.xobj.before):
                CrawlConfig.log("got a timeout, continuing because before " +
                                "is not empty and does not contain an error")
                rval += self.xobj.before
                which = self.xobj.expect([self.prompt, pexpect.TIMEOUT] +
                                         self.hsierrs)
            rval += self.xobj.before
            if 1 == which:
                rval += " TIMEOUT"
            elif 0 != which:
                rval += " ERROR"

            if self.reset_atime:
                self.touch(path, when=prev_time)

        return rval
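
The expect loop above treats a TIMEOUT as fatal only when no new output has
arrived; otherwise it accumulates the partial output from 'before' and waits
again. A reduced, standalone sketch of that pattern (assumes pexpect is
installed; 'cat' stands in for the hsi session):

import pexpect

child = pexpect.spawn('cat', encoding='utf-8')
child.sendline('hello')
patterns = ['hello', pexpect.TIMEOUT, pexpect.EOF]
rval = ''
which = child.expect(patterns, timeout=2)
while which == 1 and child.before:
    rval += child.before            # output is still arriving; keep waiting
    which = child.expect(patterns, timeout=2)
rval += child.before or ''
if which == 1:
    rval += ' TIMEOUT'              # a real stall: give up and mark it
child.close(force=True)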
Example #17
    def __init__(self, name=None, cfg=None):
        """
        Configuration data is read and copied into the object by method
        init_cfg_data(), called by both the constructor and reload().
        init_cfg_data() reverses the order of cfg and name in its argument list
        from the constructor so name can have a default and reload() doesn't
        have to pass it.

        last_fired is initialized by the constructor but not by reload(). So if
        the plugin is updated by a reconfigure, it won't lose its last fire
        time but will stay on the same schedule.
        """
        assert(name is not None)
        assert(cfg is not None)
        self.cfg = cfg
        l = CrawlConfig.log(cfg=cfg, close=True)
        CrawlConfig.log("%s: Initializing plugin data" % name)
        self.init_cfg_data(name, cfg)
        self.last_fired = time.time() - self.frequency - 1
        super(CrawlPlugin, self).__init__()
Example #18
    def addable(self):
        """
        Determine which Dimensions want this item added. Note that we want
        this routine to be general across dimensions, so we don't want it to
        assume anything about the dimension it's checking (like that it's
        named 'cos', for example). That's why calls to this pass in cos
        rather than looking at the value in the object.
        """
        for dn in self.dim:
            cval = getattr(self, dn)
            if self.dim[dn].vote(cval) is False:
                CrawlConfig.log("%s votes against %s -- skipping" %
                                (dn, self.path))
                return False
        randval = random.random()
        if self.probability < randval:
            CrawlConfig.log("random votes against %s -- skipping (%g < %g)" %
                            (self.path, self.probability, randval))
            return False
        return True
Example #19
def mpra_fetch_recent(type):
    """
    Retrieve and return the most recent record reported so we don't report the
    same record repeatedly
    """
    db = CrawlDBI.DBI(dbtype="crawler")
    if not db.table_exists(table='mpra'):
        CrawlConfig.log("Fetch from not existent mpra table -- return 0")
        return 0

    rows = db.select(table='mpra',
                     fields=['scan_time', 'end_time'],
                     where='type = ?',
                     data=(type,))
    last_end_time = -1
    max_scan_time = 0
    for r in rows:
        if max_scan_time < r[0]:
            max_scan_time = r[0]
            last_end_time = r[1]

    if last_end_time < 0:
        CrawlConfig.log("No '%s' value in mpra -- returning 0" % type)
        return 0
    else:
        CrawlConfig.log("Fetch '%s' from mpra table -- return %d" %
                        (type, last_end_time))
        return last_end_time
Example #20
def maybe_update_hsi():
    """
    If the hsi wrapper script has changed, grab and edit a fresh copy
    """
    l = util.which_all('hsi')
    trg = l[0]
    tc = util.contents(trg).split("\n")
    tv = util.grep('^BINARYVERSION=', tc)

    s = [x for x in l if 'sources/hpss' in x]
    src = s[0]
    sc = util.contents(src).split("\n")
    sv = util.grep('^BINARYVERSION=', sc)

    if tv[0] != sv[0]:
        z = util.grep("${EXECUTABLE}", sc, regex=False, index=True)
        sc[z[0]] = "exec " + sc[z[0]]
        try:
            f = open(trg, 'w')
            f.writelines("\n".join(sc) + "\n")
            f.close()
        except IOError as e:
            CrawlConfig.log(MSG.hsi_wrap_ood)
Example #21
    def quit(self):
        """
        All done here. Let's bail.
        """
        try:
            pid = self.xobj.pid
            self.xobj.sendline("quit")
            self.xobj.expect([pexpect.EOF, pexpect.TIMEOUT])
            self.xobj.close()
            CrawlConfig.log("Closing hsi process %d" % pid)
        except OSError as e:
            tbstr = tb.format_exc()
            CrawlConfig.log("Ignoring OSError '%s'" % str(e))
            for line in tbstr.split("\n"):
                CrawlConfig.log(line)
Example #22
    def dispatch(self):
        """
        Figure out where we're supposed to send this alert and send it.
        Possible destinations are the log file, one or more e-mail addresses,
        and/or a shell program.

        It's also possible for a 'use' option to show up in the alerts
        section. In this case, we're being redirected to another section.
        'use' can also point back to the current alerts section; there's no
        reason to ever do this, but it could happen, so we want to handle it
        in a reasonable way.

        That's why we sort the config options in the while statement below --
        to make 'use' get handled last, so any other options in the section
        will get handled. Once we process 'use', anything not yet processed in
        the current section is ignored.
        """
        if self.cfg is not None:
            cfg = self.cfg
        else:
            cfg = CrawlConfig.add_config()
        if self.caller != '':
            section = cfg.get(self.caller, 'alerts')
        else:
            section = 'alerts'

        done = False
        while not done:
            for opt in sorted(cfg.options(section)):
                if opt == 'log':
                    # write to log
                    fmt = cfg.get(section, 'log')
                    CrawlConfig.log(fmt, self.msg)
                    done = True

                elif opt == 'shell':
                    # run the program
                    cmd = cfg.get(section, 'shell')
                    if '%s' in cmd:
                        cmdline = cmd % (self.msg)
                    else:
                        cmdline = cmd
                    os.system(cmdline)
                    CrawlConfig.log("ran: '%s'" % (cmdline))
                    done = True

                elif opt == 'email':
                    CrawlMail.send(cfg=cfg,
                                   to="%s.email" % section,
                                   subj="HPSS Integrity Crawler ALERT",
                                   msg=self.msg)
                    done = True

                elif opt == 'use':
                    # delegate to another section
                    done = True
                    new_section = cfg.get(section, 'use')

                    # if it's the same section, ignore the 'use', but we don't
                    # want to break the rule that all options after a 'use' are
                    # ignored. So we set done to True to terminate the while
                    # loop and break unconditionally at the end of this clause
                    # to get out of the for loop
                    if new_section != section:
                        section = new_section
                        done = False
                    break
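
The sorted() call in the for statement is what guarantees 'use' is processed
after 'email', 'log', and 'shell'. A stripped-down, runnable sketch of just
the redirection logic, with a plain dict standing in for the config:

sections = {'alerts': {'log': '%s', 'use': 'alt_alerts'},
            'alt_alerts': {'email': 'admin@example.com',
                           'use': 'alt_alerts'}}    # self-reference: stop

section = 'alerts'
done = False
while not done:
    for opt in sorted(sections[section]):
        if opt == 'use':
            done = True
            if sections[section]['use'] != section:
                section = sections[section]['use']
                done = False
            break
        print("handling '%s' in [%s]" % (opt, section))
        done = True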
Example #23
    def run(self):
        """
        This routine runs in the background as a daemon. Here's where
        we fire off plug-ins as appropriate.
        """
        cfgname = ''
        self.cfg = CrawlConfig.get_config(cfgname)
        self.pidfile = "%s/%d" % (self.piddir, os.getpid())
        exit_file = self.cfg.get('crawler', 'exitpath')
        ctx = self.cfg.get('crawler', 'context')
        clean_defunct_pidfiles(ctx)
        make_pidfile(os.getpid(), ctx, exit_file)
        atexit.register(self.delpid)

        keep_going = True
        plugin_d = {}
        while keep_going:
            try:
                pluglstr = self.cfg.get('crawler', 'plugins')
                pluglist = [x.strip() for x in pluglstr.split(',')]
                for s in pluglist:
                    self.dlog('crawl: CONFIG: [%s]' % s)
                    for o in self.cfg.options(s):
                        self.dlog('crawl: CONFIG: %s: %s' %
                                  (o, self.cfg.get(s, o)))
                    if s == 'crawler':
                        continue
                    elif s in plugin_d.keys():
                        CrawlConfig.log("reloading plugin %s" % s)
                        plugin_d[s].reload(self.cfg)
                    else:
                        CrawlConfig.log("initial load of plugin %s" % s)
                        plugin_d[s] = CrawlPlugin.CrawlPlugin(name=s,
                                                              cfg=self.cfg)

                # remove any plugins that are not in the new configuration
                for p in plugin_d.keys():
                    if p not in self.cfg.sections():
                        CrawlConfig.log("unloading obsolete plugin %s" % p)
                        del plugin_d[p]

                heartbeat = self.cfg.get_time('crawler', 'heartbeat', 10)
                while keep_going:
                    #
                    # Fire any plugins that are due
                    #
                    if not self.cfg.quiet_time(time.time()):
                        hb_msg = "crawl: heartbeat..."
                        if self.fire_plugins(plugin_d):
                            keep_going = False
                    else:
                        hb_msg = "crawl: heartbeat... [quiet]"

                    # CrawlConfig.log("issue the heartbeat")
                    #
                    # Issue the heartbeat if it's time
                    #
                    if 0 == (int(time.time()) % heartbeat):
                        # self.dlog(hb_msg)
                        CrawlConfig.log(hb_msg)

                    # CrawlConfig.log("check for config changes")
                    #
                    # If the config file has changed, reload it by resetting
                    # the cached config object and breaking out of the inner
                    # loop.
                    #
                    if self.cfg.changed():
                        cfgname = self.cfg.get('crawler', 'filename')
                        self.cfg = CrawlConfig.get_config(reset=True)
                        break

                    # CrawlConfig.log("check for exit signal")
                    #
                    # Check for the exit signal
                    #
                    if util.conditional_rm(exit_file):
                        self.dlog('crawl: shutting down')
                        keep_going = False

                    # CrawlConfig.log("sleep")
                    #
                    # We cycle once per second so we can detect if the user
                    # asks us to stop or if the config file changes and needs
                    # to be reloaded
                    #
                    time.sleep(1.0)

            except:
                # if we get an exception, write the traceback to the log file
                tbstr = tb.format_exc()
                for line in tbstr.split('\n'):
                    self.dlog("crawl: '%s'" % line)
                keep_going = False
Example #24
def send(to='', subj='', msg='', sender='', cfg=None):
    """
    Send e-mail as indicated

    sender precedence: argument, then cfg, then default value; if sender is
    not a str, raise an exception
    """
    if type(to) != str:
        raise util.HpssicError(MSG.invalid_recip_list)
    if sender is not None and type(sender) != str:
        raise util.HpssicError(MSG.invalid_sender_S % str(sender))
    if type(msg) != str:
        raise util.HpssicError(MSG.invalid_msg_body)
    if subj is not None and type(subj) != str:
        raise util.HpssicError(MSG.invalid_subject_S % str(subj))

    # Prepare a message object based on *msg*
    if msg:
        payload = email.mime.text.MIMEText(msg)
    else:
        payload = email.mime.text.MIMEText(MSG.empty_message)

    # Set the recipient address(es) based on *to*
    default_recip = '*****@*****.**'
    if to == '':
        if cfg is None:
            raise util.HpssicError(MSG.no_recip_list)
        else:
            (section, option) = ('crawler', 'notify-e-mail')
            addrs = cfg.get(section, option)
    elif ',' in to or '@' in to:
        addrs = to
    elif '.' in to:
        if cfg is None:
            addrs = default_recip
        else:
            (section, option) = to.split('.')
            addrs = cfg.get_d(section, option, default_recip)

    addrlist = [x.strip() for x in addrs.split(',')]
    payload['To'] = addrs

    # Set the subject based on *subj*
    if subj:
        payload['Subject'] = subj
    else:
        payload['Subject'] = MSG.default_mail_subject

    # Set the from address
    default_sender = 'hpssic@%s' % util.hostname(long=True)
    if sender is None or sender == '':
        if cfg is not None:
            sender = cfg.get_d('crawler', 'from_address', default_sender)
        else:
            sender = default_sender
    elif '@' not in sender:
        raise util.HpssicError(MSG.invalid_sender_S % str(sender))

    payload['From'] = sender

    # Send the message
    s = smtplib.SMTP('localhost')
    s.sendmail(sender, addrlist, payload.as_string())
    s.quit()

    # Log it
    CrawlConfig.log("sent mail to %s", addrs)
Example #25
    def check(self):
        """
        For a directory:
         - get a list of its contents if possible,
         - create a Checkable object for each item and persist it to the
           database
         - return the list of Checkables found in the directory
        For a file:
         - if it already has a hash, add it to the sample if not already
           and verify it
         - if it does not have a hash, decide whether to add it or not

        The value of probability [0.0 .. 1.0] indicates the likelihood with
        which we should check files.

        potential outcomes            return
         read a directory             list of Checkable objects
         file checksum fail           Alert
         invalid Checkable type       raise StandardError
         access denied                "access denied"
         verified file checksum       "matched"
         checksum a file              "checksummed"
         skipped a file               "skipped"
         hpss unavailable             "unavailable"

        Here we examine a population member, count it as a member of the
        population, decide whether to add it to the sample, and if so, count it
        as a sample member.

        First, we have to make all the decisions and update the object
        accordingly.

        Then, we persist the object to the database.
        """
        # fire up hsi
        rval = []
        cfg = CrawlConfig.get_config()
        # hsi_timeout = int(cfg.get_d('crawler', 'hsi_timeout', 300))
        try:
            # h = hpss.HSI(timeout=hsi_timeout, verbose=True)
            h = hpss.HSI(verbose=True)
            CrawlConfig.log("started hsi with pid %d" % h.pid())
        except hpss.HSIerror as e:
            # record why hsi could not be started before reporting failure
            CrawlConfig.log("hsi unavailable: %s" % str(e))
            return "unavailable"

        if self.type == 'd':
            rsp = h.lsP(self.path)
            if "Access denied" in rsp:
                rval = "access denied"
            else:
                for line in rsp.split("\n"):
                    new = Checkable.fdparse(line)
                    if new is not None:
                        rval.append(new)
                        new.load()
                        new.persist()
                        # returning list of items found in the directory
        elif self.type == 'f':
            if self.cart is None:
                self.populate_cart(h)
            if self.checksum == 0:
                if self.has_hash(h):
                    self.add_to_sample(h, already_hashed=True)
                    rval = self.verify(h)
                    # returning "matched", "checksummed", "skipped", or Alert()
                elif self.addable():
                    rval = self.add_to_sample(h)
                    # returning "access denied" or "checksummed"
                else:
                    rval = "skipped"
            else:
                rval = self.verify(h)
                # returning "matched", "checksummed", "skipped", or Alert()
        else:
            raise StandardError("Invalid Checkable type: %s" % self.type)

        if (3 < self.fails) and (0 == self.reported):
            self.fail_report(h.before())
            rval = "skipped"

        h.quit()

        self.set('last_check', time.time())
        CrawlConfig.log(
            "Persisting checkable '%s' with %s = %f, %s = %d" %
            (self.path, 'last_check', self.last_check, 'fails', self.fails))
        self.persist()
        return rval
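
Because check() can hand back a list of Checkables, a status string, or an
Alert, callers have to dispatch on the result type. Below is a minimal,
self-contained sketch of such a dispatch, following the outcome table in the
docstring above; the Alert stand-in class, the dispatch() helper, and the
tallying are illustrative assumptions, not part of the crawler.

import collections

class Alert(object):
    """Stand-in for Alert.Alert, just for this sketch."""
    def __init__(self, msg):
        self.msg = msg

def dispatch(result, queue, tally):
    if isinstance(result, list):
        queue.extend(result)        # directory: newly discovered Checkables
    elif isinstance(result, Alert):
        tally['alert'] += 1         # file checksum mismatch
    elif result == "unavailable":
        tally['retry'] += 1         # hpss is down; try again later
    else:
        tally[result] += 1          # "matched", "checksummed", "skipped", ...

queue = []
tally = collections.defaultdict(int)
for result in [['/a/b', '/a/c'], "matched", Alert("mismatch"), "skipped"]:
    dispatch(result, queue, tally)
print(queue)                        # ['/a/b', '/a/c']
print(dict(tally))                  # e.g. {'alert': 1, 'matched': 1, 'skipped': 1}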
Ejemplo n.º 43
0
    def get_list(cls, how_many=-1, prob=0.1, rootlist=None):
        """
        Return the current list of Checkables from the database.
        """
        # default rootlist here rather than in the signature to avoid the
        # shared-mutable-default-argument pitfall
        rootlist = rootlist or []
        if how_many < 0:
            cfg = CrawlConfig.add_config()
            how_many = int(cfg.get_d('cv', 'operations', '30'))

        rval = Checkable.load_priority_list()
        if how_many <= len(rval):
            return rval

        rval.extend(Checkable.load_recheck_list(how_many))
        if how_many <= len(rval):
            return rval

        db = CrawlDBI.DBI(dbtype='crawler')
        kw = {'table': 'checkables',
              'fields': ['rowid', 'path', 'type', 'cos', 'cart', 'ttypes',
                         'checksum', 'last_check', 'fails', 'reported'],
              'orderby': 'last_check'}
        if 0 < how_many:
            kw['limit'] = how_many

        rows = db.select(**kw)

        # check whether any roots from rootlist are missing and if so, add them
        # to the table
        reselect = False
        pathlist = [x[1] for x in rows]
        for root in rootlist:
            if root not in pathlist:
                nr = Checkable(path=root, type='d')
                nr.load()
                nr.persist()
                reselect = True

        if reselect:
            rows = db.select(**kw)

        for row in rows:
            tmp = list(row)
            new = Checkable(rowid=tmp.pop(0),
                            path=tmp.pop(0),
                            type=tmp.pop(0),
                            cos=tmp.pop(0),
                            cart=tmp.pop(0),
                            ttypes=tmp.pop(0),
                            checksum=tmp.pop(0),
                            last_check=tmp.pop(0),
                            fails=tmp.pop(0),
                            reported=tmp.pop(0),
                            probability=prob,
                            in_db=True,
                            dirty=False)
            if new not in rval:
                rval.append(new)
            if how_many <= len(rval):
                break

        db.close()
        CrawlConfig.log("returning %d items" % len(rval))
        return rval
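
For reference, the kw dict above amounts to an ordered, optionally limited
SELECT over the checkables table. Here is a self-contained sketch of the
equivalent query against an in-memory SQLite database -- the table and
column names come from the code, but the SQLite backend and the sample row
are assumptions; CrawlDBI may generate different SQL.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""CREATE TABLE checkables
                (path TEXT, type TEXT, cos TEXT, cart TEXT, ttypes TEXT,
                 checksum INT, last_check REAL, fails INT, reported INT)""")
conn.execute("INSERT INTO checkables VALUES "
             "('/home/x', 'd', '', '', '', 0, 0.0, 0, 0)")

# oldest-checked entries first, at most how_many (here 30) of them
rows = conn.execute("""SELECT rowid, path, type, cos, cart, ttypes, checksum,
                              last_check, fails, reported
                       FROM checkables
                       ORDER BY last_check
                       LIMIT 30""").fetchall()
print(rows)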