def main():
    global args
    logging.debug("Running main() of file: %r." % __file__)

    (inconn, incursor) = differ_library.db_conn_from_config_path(args.config)

    #Fetch work queue
    if args.tails_only:
        #This query is as in check_tarball_is_sequence_end.py, r574.
        query = """
SELECT
  storage.*
FROM
  diskprint.slice AS slice,
  diskprint.storage AS storage
WHERE
  storage.slicehash = slice.slicehash AND
  (slice.osetid, slice.appetid, slice.sliceid) NOT IN (
    SELECT
      s1.osetid,
      s1.appetid,
      s1.slicepredecessorid
    FROM
      diskprint.slice AS s1,
      diskprint.slice AS s2
    WHERE
      s1.osetid = s2.osetid AND
      s1.appetid = s2.appetid AND
      s1.slicepredecessorid = s2.sliceid
  )
;
        """
    else:
        query = """
SELECT
  pq.*
FROM
  diskprint.processqueue AS pq,
  diskprint.slice AS s
WHERE
  s.slicehash = pq.slicehash
ORDER BY
  sliceid DESC
;
        """
    #Log the query before executing it, so it is recorded even if execution fails
    logging.info("Diskprint table query: %s" % query)
    incursor.execute(query)
    inrows = [row for row in incursor]
    for inrow in inrows:
        inrow_location = inrow["location"]
        logging.debug("Process queue location: %s" % inrow_location)
        print(inrow_location)

    if len(inrows) == 0:
        logging.info("No diskprints to process.")

    #Cleanup
    inconn.close()
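
#differ_library.db_conn_from_config_path is used throughout these scripts but
#not shown.  A minimal sketch of what it presumably provides, assuming
#psycopg2 and an INI-style config file (the [db] section and option names
#here are hypothetical); the cursor must yield rows addressable by column
#name, as in inrow["location"] above:
import configparser

import psycopg2
import psycopg2.extras

def db_conn_from_config_path(config_path):
    """Return a (connection, cursor) pair whose rows behave like dicts."""
    config = configparser.ConfigParser()
    config.read(config_path)
    conn = psycopg2.connect(
      host=config.get("db", "host"),
      dbname=config.get("db", "database"),
      user=config.get("db", "user"),
      password=config.get("db", "password")
    )
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    return (conn, cursor)
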
def main():
    filemetadata_out = open("filemetadata.sql", "w")
    md5_out = open("md5.sql", "w")

    (conn, cursor) = differ_library.db_conn_from_config_path(args.config)

    #Get slice hash
    cursor.execute("SELECT slicehash FROM diskprint.storage WHERE location = %s", (args.slice_path,))
    inrows = [row for row in cursor]
    if len(inrows) != 1:
        logging.error("Could not find diskprint from tarball path: %r." % args.slice_path)
        sys.exit(1)
    slicehash = inrows[0]["slicehash"]

    def process_fi(fi):
        """
        Produce SQL records for every allocated file.
        (This is an inline function so the value of 'slicehash' is in scope.)
        """
        #Only allocated, regular files
        if not fi.allocated():
            return
        if fi.name_type() != "r":
            return

        #Build SQL templates (values are %-substituted without escaping; see the sql_quote sketch after this function)
        md5_insert_template = "insert into diskprint.MD5 values ('%(keyhash)s','%(keyhash_md5)s');\n"
        filemetadata_insert_template = "insert into diskprint.filemetadata (keyhash, slicehash, path, filename, extension, bytes, mtime, ctime) values ('%(keyhash)s','%(slicehash)s','%(path)s','%(filename)s','%(extension)s',%(bytes)d,'%(mtime)s','%(ctime)s');\n"

        #Build SQL values as substitution dictionary
        d = dict()
        d["keyhash"] = fi.sha1()
        d["keyhash_md5"] = fi.md5()
        d["slicehash"] = slicehash
        d["path"] = fi.filename()
        d["filename"] = os.path.basename(fi.filename())
        d["extension"] = os.path.splitext(fi.filename())[1]
        d["bytes"] = fi.filesize()
        d["mtime"] = fi.mtime()
        d["ctime"] = fi.crtime() #TODO What does this table actually mean by ctime?  Change, or create?

        #Output
        filemetadata_out.write(filemetadata_insert_template % d)
        md5_out.write(md5_insert_template % d)

    #Walk the DFXML, producing SQL for each file object
    dfxml.read_dfxml(xmlfile=open(args.fiwalk_xml, "rb"), callback=process_fi)

    #Flush and close the generated SQL scripts
    filemetadata_out.close()
    md5_out.close()
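
#The templates in process_fi substitute values without escaping, so a path
#containing a single quote would break the generated SQL.  A minimal guard,
#assuming the values only need standard SQL string-literal quoting (the
#helper name sql_quote is hypothetical):
def sql_quote(value):
    """Escape a value for use inside a single-quoted SQL string literal."""
    return str(value).replace("'", "''")

#Usage: quote each string field before substituting it into a template, e.g.
#d["path"] = sql_quote(fi.filename()).
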
def main():
    global args

    (inconn, incursor) = differ_library.db_conn_from_config_path(args.config)

    incursor.execute("""
SELECT
  COUNT(*) AS tally
FROM
  diskprint.storage
WHERE
  location = %s
;
    """, (args.slice_path,))
    inrows = [row for row in incursor]
    #COUNT(*) always returns exactly one row; the tally itself tells whether the tarball is known
    if inrows[0]["tally"] != 1:
        raise Exception("Could not find tarball path in diskprint.storage table: %r." % args.slice_path)

    incursor.execute("""
SELECT
  storage.*
FROM
  diskprint.slice AS slice,
  diskprint.storage AS storage
WHERE
  storage.location = %s AND
  storage.slicehash = slice.slicehash AND
  (slice.osetid, slice.appetid, slice.sliceid) NOT IN (
    SELECT
      s1.osetid,
      s1.appetid,
      s1.slicepredecessorid
    FROM
      diskprint.slice AS s1,
      diskprint.slice AS s2
    WHERE
      s1.osetid = s2.osetid AND
      s1.appetid = s2.appetid AND
      s1.slicepredecessorid = s2.sliceid
  )
;
    """, (args.slice_path,))
    inrows = [row for row in incursor]
    if len(inrows) != 1:
        logging.error("Did not retrieve a single vetted tarball path; got %r instead.  Concluding this is not a sequence end." % len(inrows))
        sys.exit(1)
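
#The NOT IN subquery above is an anti-join: a slice is a sequence end exactly
#when no other slice names it as its predecessor.  The same selection modeled
#over in-memory tuples, with hypothetical data:
def sequence_ends(slices):
    """slices: list of (osetid, appetid, sliceid, slicepredecessorid) tuples."""
    #Set of slices that some other slice names as its predecessor
    predecessors = set()
    for (osetid, appetid, sliceid, predid) in slices:
        if predid is not None:
            predecessors.add((osetid, appetid, predid))
    return [s for s in slices if (s[0], s[1], s[2]) not in predecessors]

#Example: slice 2 follows slice 1, so only slice 2 ends the sequence.
assert sequence_ends([("os1", "app1", 1, None), ("os1", "app1", 2, 1)]) == [("os1", "app1", 2, 1)]
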
def main():
    global args
    (outconn, outcursor) = differ_library.db_conn_from_config_path(args.config)

    inconn = sqlite3.connect(args.inputsqlite)
    inconn.row_factory = sqlite3.Row
    incursor = inconn.cursor()

    #Get translation dictionary for cell actions
    outcursor.execute("SELECT * FROM diskprint.cell;")
    action_text_to_id = dict()
    for outrow in outcursor:
        action_text_to_id[outrow["actiontype"]] = outrow["actionid"]
    logging.debug("Cell action translation dictionary:\n\t%r" % action_text_to_id)
    
    #Populate the hive table and get the new IDs assigned by the auto-incrementor
    #The insert-then-commit operation should guarantee Postgres is doling out unique IDs.
    in_to_out_hiveid = dict()
    for inrow in incursor.execute("SELECT * FROM hive;"):
        indict = dict()
        outdict = dict()
        for k in inrow.keys():
            indict[k] = inrow[k]
        for k in ["hivepath", "appetid", "osetid"]:
            outdict[k] = inrow[k]
        #Inline function, as this query may need to run a second time after an insert
        def fetch(od):
            outcursor.execute("""
              SELECT
                *
              FROM
                diskprint.hive
              WHERE
                hivepath = %s AND appetid = %s AND osetid = %s
              ;
            """, (od["hivepath"], od["appetid"], od["osetid"]))
            return [row for row in outcursor]
        #checkrows should ultimately have just one record in it from the database, from which we get the translated ID of the hive sequence (hiveid identifies sequences)
        checkrows = fetch(outdict)
        if len(checkrows) > 1:
            logging.error("Retrieved multiple records for a hive that should be unique; something about the database state is incorrect.  Expected at most 1 record, got %d." % len(checkrows))
            if len(in_to_out_hiveid) > 0:
                logging.info("You may want to issue this query to undo changes made by this script, after looking at the table:\n\tDELETE FROM diskprint.hive WHERE hiveid IN (%s);" % ",".join(map(str, in_to_out_hiveid.values())))
            sys.exit(1)
        elif len(checkrows) == 0:
            insert_db_postgres(outcursor, "hive", outdict)
            outconn.commit()
            checkrows = fetch(outdict)
        in_to_out_hiveid[indict["hiveid"]] = checkrows[0]["hiveid"]

    logging.debug("Hive ID translation dictionary:\n\t%r" % in_to_out_hiveid)
    hive_ids_str = ",".join(map(str, in_to_out_hiveid.values()))
    logging.info("If at any point after this the script fails, you should investigate and probably delete Postgres records with these rollback queries:\n\tDELETE FROM diskprint.hive WHERE hiveid IN (%s);\n\tDELETE FROM diskprint.regdelta WHERE hiveid IN (%s);" % (hive_ids_str, hive_ids_str))

    #Check for previous export of the mutation records (hive_ids_str contains only database-issued integers, so direct interpolation is safe here)
    outcursor.execute("""SELECT COUNT(*) AS tally FROM diskprint.regdelta WHERE hiveid IN (%s);""" % hive_ids_str)
    tallyoutrows = [row for row in outcursor]
    if len(tallyoutrows) != 1:
        logging.error("Not sure how, but a SELECT COUNT from Postgres didn't return 1 row; it returned %d.  Quitting." % len(tallyoutrows))
        sys.exit(1)
    tally_postgres = tallyoutrows[0]["tally"]
    incursor.execute("""SELECT COUNT(*) AS tally FROM regdelta;""")
    tallyinrows = [row for row in incursor]
    if len(tallyinrows) != 1:
        logging.error("Not sure how, but a SELECT COUNT from SQLite didn't return 1 row; it returned %d.  Quitting." % len(tallyinrows))
        sys.exit(1)
    tally_sqlite = tallyinrows[0]["tally"]
    if tally_postgres > 0:
        logging.error("Some records for the hives in this sequential analysis are already in Postgres.  Here is how the count compares to the SQLite that was about to be imported:\n\tPostgres\t%d\n\tSQLite\t%d" % (tally_postgres,tally_sqlite))
        logging.info("Given you invoked this script, you probably want the export to run to completion.  Clear away the records by issuing the DELETE queries above in the Postgres server, and then you can re-run this script and it should complete.") 
        exit(1)

    #Export the mutation records
    bailout = False
    for inrow in incursor.execute("SELECT * FROM regdelta;"):
        #Build output dictionary (we're about to tweak it)
        outdict = dict()
        for k in inrow.keys():
            outdict[k] = inrow[k]
        #Translate the hiveid into the Postgres-assigned ID (appetid, osetid, and sliceid were already copied verbatim above)
        outdict["hiveid"] = in_to_out_hiveid.get(inrow["hiveid"])
        if outdict["hiveid"] is None:
            logging.error("Failed to translate a hiveid: %r was not found." % inrow["hiveid"])
            logging.info("Hive ID translation dictionary:\n\t%r" % in_to_out_hiveid)
            bailout = True
        outdict["iskeybefore"] = inrow["iskeybefore"] == 1
        outdict["iskeyafter"] = inrow["iskeyafter"] == 1
        outdict["cellaction"] = action_text_to_id.get(inrow["cellaction"])
        if outdict["cellaction"] is None:
            logging.error("Failed to translate a cell action: %r not in table 'cell'." % inrow["cellaction"])
            bailout = True
        #Fail out if something didn't translate
        if bailout:
            logging.warning("You may want to purge the regdelta records from Postgres (this script was in the process of inserting them).  See the above DELETE queries.")
            sys.exit(1)

        #Ship it!  (insert_db_postgres is not shown in these excerpts; a sketch follows this function.)
        insert_db_postgres(outcursor, "regdelta", outdict)
    outconn.commit()
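
#insert_db_postgres is called above but not defined in these excerpts.  A
#minimal sketch of what it presumably does, assuming psycopg2 and that every
#key of the dictionary is a column of the target table in the diskprint
#schema:
from psycopg2 import sql

def insert_db_postgres(cursor, table, d):
    """Execute a parameterized INSERT built from a column dictionary."""
    columns = sorted(d.keys())
    query = sql.SQL("INSERT INTO diskprint.{} ({}) VALUES ({})").format(
      sql.Identifier(table),
      sql.SQL(", ").join(map(sql.Identifier, columns)),
      sql.SQL(", ").join(sql.Placeholder() * len(columns))
    )
    cursor.execute(query, [d[c] for c in columns])
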
def main():
    global args

    if not os.path.isfile(args.slice_path):
        raise Exception("Invoked on non-existent path '%s'." % args.slice_path)

    loglvl = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(
      format='%(asctime)s %(levelname)s: %(message)s',
      datefmt='%Y-%m-%dT%H:%M:%SZ',
      level=loglvl
    )

    (conn, cursor) = differ_library.db_conn_from_config_path(args.config)

    cursor.execute("""
        SELECT
          slice.sliceid,
          slice.osetid,
          slice.appetid,
          storage.location
        FROM
          diskprint.storage AS storage, diskprint.slice AS slice
        WHERE
          slice.slicehash = storage.slicehash AND
          location = %s;
    """, (args.slice_path,))
    returned_rows = [row for row in cursor]
    if len(returned_rows) != 1:
        raise Exception("Unexpected data: expected exactly one sliceid to match '%s', got %d." % (args.slice_path, len(returned_rows)))
    slice_id = returned_rows[0]["sliceid"]
    osetid = returned_rows[0]["osetid"]
    appetid = returned_rows[0]["appetid"]

    #Build chain of slice_id's
    ids_reversed = [slice_id]
    earliest_id = slice_id
    while True:
        cursor.execute("""
            SELECT
              slicepredecessorid
            FROM
              diskprint.slice AS slice
            WHERE
              osetid = %s AND
              appetid = %s AND
              sliceid = %s;
        """, (osetid, appetid, earliest_id))
        returned_rows = [row for row in cursor]
        if len(returned_rows) == 0:
            raise Exception("Missing data: sliceid %d is named as a predecessor but is not present in the slice table." % earliest_id)
        elif len(returned_rows) > 1:
            raise Exception("Ambiguous history: A disk slice can't have two immediately preceding slices.\n\tsliceid: %r." % earliest_id)

        earliest_id = returned_rows[0]["slicepredecessorid"]
        if earliest_id is None:
            #Done with this sequence.
            break
        if earliest_id in ids_reversed:
            raise Exception("Encountered preceding-slice loop: sliceid %d already in sliceid list." % earliest_id)
        ids_reversed.append(earliest_id)
    ids_reversed.reverse()
    ids = ids_reversed

    #Look up paths of slice_id's
    #Fail out here if for some reason there are none:
    # "SQL doesn't allow an empty list in the IN operator, so your code should guard against empty tuples" <http://initd.org/psycopg/docs/usage.html#python-types-adaptation>.
    if len(ids) == 0:
        raise Exception("Somehow the ids list emptied; cannot proceed.")
    cursor.execute("""
        SELECT
          slice.sliceid, storage.location
        FROM
          diskprint.storage AS storage, diskprint.slice AS slice
        WHERE
          slice.slicehash = storage.slicehash AND
          slice.osetid = %s AND
          slice.appetid = %s AND
          slice.sliceid IN %s;
    """, (osetid, appetid, tuple(ids)))

    #Build map of slice id to file system path
    id_to_path = dict()
    for row in cursor:
        id_to_path[int(row["sliceid"])] = row["location"]

    #Build list of file system paths
    paths = []
    for sliceid in ids:
        try:
            paths.append(id_to_path[sliceid])
        except KeyError:
            sys.stderr.write("Debug: Was looking up sliceid=%d\n" % sliceid)
            raise
        
    #Output paths
    for path in paths:
        print(path)
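
#The predecessor-chain walk above can be exercised in isolation.  A small
#illustration over a hypothetical in-memory map of sliceid ->
#slicepredecessorid, where None marks the start of a sequence:
def chain_from(sliceid, predecessor_of):
    """Return sliceids ordered from the start of the sequence to the given slice."""
    ids_reversed = [sliceid]
    earliest = sliceid
    while True:
        earliest = predecessor_of[earliest]  #A KeyError here models the missing-data case
        if earliest is None:
            break
        if earliest in ids_reversed:
            raise Exception("Encountered preceding-slice loop: sliceid %d already in sliceid list." % earliest)
        ids_reversed.append(earliest)
    return list(reversed(ids_reversed))

#Example: slice 3 succeeds slice 2, which succeeds slice 1.
assert chain_from(3, {1: None, 2: 1, 3: 2}) == [1, 2, 3]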