def main():
    conn = sqlite3.connect(args.out_db)
    conn.row_factory = sqlite3.Row
    rcursor = conn.cursor()
    wcursor = conn.cursor()

    _logger.debug("Attaching Registry state database %r." % args.rss_db)
    wcursor.execute("ATTACH DATABASE '%s' AS rss;" % args.rss_db)

    #Try to reduce seek times
    wcursor.execute("PRAGMA cache_size = 786432;") #3/4 GiB
    wcursor.execute("PRAGMA rss.cache_size = 1048576;") #1 GiB

    _logger.debug("Populating all-parent-path table...")
    wcursor.execute(SQL_CREATE_TEMP_CELL_PARENT)
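    #Join each cell row to its owning hive's filename (implicit inner join on hive_id).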
    rcursor.execute("""\
SELECT
  filename,
  cellname,
  basename,
  type
FROM
  rss.cell_analysis AS c,
  rss.hive_analysis AS h
WHERE
  c.hive_id = h.hive_id
;""")
    for row in rcursor:
        cellpath = row["cellname"]
        if args.normalize:
            cellpath = normalizer.normalize_path(row["filename"], cellpath)

        if row["type"] == "root":
            #Root cell has no parent.
            parentpath = None
            basename = cellpath[1:] #Trim leading backslash
        else:
            basename = row["basename"]
            if basename is None:
                raise ValueError("Existence assumption violated: Null basename for cellpath %r, hive %r." % (row["cellname"], row["filename"]))
            #Trim trailing backslash as well
            parentpath = cellpath[0:-len(basename) - 1]
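            #Example: cellpath "\Software\Microsoft" with basename
            #"Microsoft" yields parentpath "\Software".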

        wcursor.execute("INSERT INTO temp_cell_parent(cellpath, basename, parentpath) VALUES (?,?,?);", (cellpath, basename, parentpath))
    _logger.debug("Done populating all-parent-path table.")

    _logger.debug("Sorting and counting cell parent metadata from temporary table...")
    wcursor.execute(SQL_CREATE_CELL_PARENT)

    _logger.debug("Done sorting and counting cell parent metadata from temporary table.")

    _logger.debug("Creating all-parent-path index...")
    wcursor.execute(SQL_CREATE_CELL_PARENT_INDEX)
    _logger.debug("Done creating all-parent-path index.")

    conn.commit()
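
    #The generators below yield one-tuples, the parameter shape expected by
    #Cursor.executemany.  Note: _generate_cellpaths assumes a cursor named
    #"nodecursor" is already open on a single-node results database; that
    #cursor is not defined in this function.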
    def _generate_cellpaths():
        nodecursor.execute("""\
SELECT
  filename,
  cellname
FROM
  hive_analysis AS h,
  cell_analysis AS c
WHERE
  h.hive_id = c.hive_id
;""")
        for noderow in nodecursor:
            if args.normalize:
                cellpath = normalizer.normalize_path(noderow["filename"], noderow["cellname"])
            else:
                cellpath = noderow["cellname"]
            yield (cellpath,)
    def _generate_paths():
        rcursor.execute("""\
SELECT
  filename,
  cellname
FROM
  rss.hive_analysis AS ha,
  rss.cell_analysis AS ca
WHERE
  ha.hive_id = ca.hive_id
;""")
        for row in rcursor:
            if args.normalize:
                query_path = normalizer.normalize_path(row["filename"], row["cellname"])
            else:
                query_path = row["cellname"]
            yield (query_path,)
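
    #A minimal, hypothetical consumption sketch for these generators (the
    #temp table name "query_path" is illustrative, not part of this script):
    #    wcursor.execute("CREATE TEMP TABLE query_path (path TEXT);")
    #    wcursor.executemany("INSERT INTO query_path(path) VALUES (?);", _generate_paths())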
def main():
    path_xp32_0 =  args.dwf_node_results_dir + "/234-1-234-1-150/format_registry_single_state.sh/registry_single_state.db"
    path_xp32_1 =  args.dwf_node_results_dir + "/11331-2-11331-2-90/format_registry_single_state.sh/registry_single_state.db"
    path_vista32 = args.dwf_node_results_dir + "/8504-1-8504-1-90/format_registry_single_state.sh/registry_single_state.db"
    path_vista64 = args.dwf_node_results_dir + "/8504-2-8504-2-90/format_registry_single_state.sh/registry_single_state.db"
    path_732 =     args.dwf_node_results_dir + "/9480-1-9480-1-150/format_registry_single_state.sh/registry_single_state.db"
    path_764 =     args.dwf_node_results_dir + "/9480-2-9480-2-150/format_registry_single_state.sh/registry_single_state.db"
    path_832 =     args.dwf_node_results_dir + "/14694-1-14694-1-60/format_registry_single_state.sh/registry_single_state.db"
    path_864 =     args.dwf_node_results_dir + "/14694-2-14694-2-50/format_registry_single_state.sh/registry_single_state.db"

    _logger.debug("Inspecting path: %r." % path_xp32_0)
    assert os.path.exists(path_xp32_0)
    assert os.path.exists(path_xp32_1)
    _logger.debug("Inspecting path: %r." % path_vista32)
    assert os.path.exists(path_vista32)
    assert os.path.exists(path_vista64)
    assert os.path.exists(path_732)
    assert os.path.exists(path_764)
    assert os.path.exists(path_832)
    assert os.path.exists(path_864)

    conns = collections.OrderedDict()
    conns["XP (1)"] = sqlite3.connect(path_xp32_0)
    conns["XP (2)"] = sqlite3.connect(path_xp32_1)
    conns["Vista-32"] = sqlite3.connect(path_vista32)
    conns["Vista-64"] = sqlite3.connect(path_vista64)
    conns["7-32"] = sqlite3.connect(path_732)
    conns["7-64"] = sqlite3.connect(path_764)
    conns["8-32"] = sqlite3.connect(path_832)
    conns["8-64"] = sqlite3.connect(path_864)

    cursors = dict()
    cellsets = dict()

    root_cell_metadata = []
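    #(Root-cell rows are gathered below for a normalizing aid report; this
    #function does not persist them.)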

    for osname in conns:
        logging.debug("Ingesting " + repr(osname) + ".")
        #conns[osname].isolation_level = "EXCLUSIVE"
        conns[osname].row_factory = sqlite3.Row
        cursors[osname] = conns[osname].cursor()
        cellsets[osname] = set()
        cursors[osname].execute("SELECT ca.cellname, ca.type, ha.filename FROM cell_analysis AS ca, hive_analysis AS ha WHERE ca.hive_id = ha.hive_id;")
        prefixes_nagged = set()
        for row in cursors[osname]:
            #Record the root cell paths for a normalizing aid report
            if row["type"] == "root":
                root_cell_metadata.append((osname, row["filename"], row["cellname"]))

            #Record the cell paths for overlap measurements
            if args.normalize:
                cellpath = normalizer.normalize_path(row["filename"], row["cellname"])

                #Record failures to normalize.  (Heuristic: an unchanged path
                #is assumed to mean its prefix was unrecognized.)
                if cellpath == row["cellname"]:
                    prefix = normalizer.extract_prefix(cellpath)
                    if prefix not in prefixes_nagged:
                        prefixes_nagged.add(prefix)

                        _logger.warning("This prefix failed to normalize: %r." % prefix)
                        _logger.info("The image being ingested is: %r." % osname)
                        _logger.info("It came from the hive at this path: %r." % row["filename"])
            else:
                cellpath = row["cellname"]
            cellsets[osname].add(cellpath)
        logging.debug("Ingested %d cell paths." % len(cellsets[osname]))

    win7_overlaps = set()
    thetable = dict()
    for osnamea in conns:
        for osnameb in conns:
            if osnamea == osnameb:
                thetally = len(cellsets[osnamea])
            else:
                the_overlaps = set.intersection(cellsets[osnamea], cellsets[osnameb])
                if set([osnamea,osnameb]) == set(["7-32","7-64"]):
                    win7_overlaps.update(the_overlaps)
                thetally = len(the_overlaps)
            thetable[(osnamea, osnameb)] = thetally
            thetable[(osnameb, osnamea)] = thetally

    _logger.debug("The table: %r." % thetable)

    osnames = [osname for osname in conns]

    with open(args.out_pickle, "wb") as out_fh:
        pickler = pickle.Pickler(out_fh)
        #Note: sqlite3 connection objects are not picklable, so only the
        #picklable results are persisted.
        out_dict = {
          "osnames": osnames,
          "thetable": thetable
        }
        pickler.dump(out_dict)
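
#To reload the persisted overlap table later (sketch; "out.pickle" is an
#illustrative path, not one this script writes by default):
#    with open("out.pickle", "rb") as in_fh:
#        out_dict = pickle.Unpickler(in_fh).load()
#    thetable = out_dict["thetable"]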
def main():
    global args

    if args.cell_parent_pickle is not None:
        _logger.debug("Loading cell parent map...")
        with open(args.cell_parent_pickle, "rb") as cp_fh:
            unpickler = pickle.Unpickler(cp_fh)
            n_grammer.parent_map = unpickler.load()
        _logger.debug("Done loading cell parent map.")

    meta_conn = sqlite3.connect(":memory:")
    meta_conn.row_factory = sqlite3.Row
    meta_cursor = meta_conn.cursor()

    meta_cursor.execute("ATTACH DATABASE '%s' AS namedsequence;" % args.namedsequence_db)
    meta_cursor.execute("ATTACH DATABASE '%s' AS slice;" % args.slice_db)

    #Try to reduce seek times
    #cursor.execute("PRAGMA cache_size = 2097152;") #2 GiB

    engine = TFIDFEngine.BasicTFIDFEngine()

    #Load stop list (note that path normalizing has already occurred before load time)
    (path_stop_list, term_stop_list, term_threshold) = stop_list_structs(args.stop_list_db, args.stop_list_n_gram_strategy)
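    #As used below: path_stop_list and term_stop_list are membership-tested
    #(by the "raw_filter" and "n_gram_blacklist" strategies, respectively);
    #term_threshold maps a term to the count subtracted from its tally under
    #the "n_gram_threshold" strategy.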

    #Key: Document name.
    #Value: List of term lists (lists because terms can repeat in a single change set).
    sequence_term_lists = collections.defaultdict(list)

    #Assemble the lists of documents that should exist.
    #This will require the namedsequence and slice tables (slice table for slice types).
    #(NOTE: The query excludes beginning-of-sequence nodes, because their predecessors are NULL and thus won't match in a join on equality.  This is intentional behavior.)
    meta_cursor.execute("""\
SELECT
  ns.*,
  s.slicetype
FROM
  namedsequence.namedsequence AS ns,
  slice.slice AS s
WHERE
  ns.osetid = s.osetid AND
  ns.appetid = s.appetid AND
  ns.sliceid = s.sliceid AND
  ns.sequencelabel LIKE ?
;""", (args.prefix + "-%",))
    for meta_row in meta_cursor:
        node_id = "%(osetid)s-%(appetid)s-%(sliceid)d" % meta_row
        predecessor_node_id = "%(predecessor_osetid)s-%(predecessor_appetid)s-%(predecessor_sliceid)d" % meta_row

        if args.by_app:
            doc_name = "%(appetid)s/%(slicetype)s" % meta_row
        elif args.by_osapp:
            doc_name = "%(osetid)s/%(appetid)s/%(slicetype)s" % meta_row
        else:
            raise NotImplementedError("Only --by-app and --by-osapp are implemented.")

        new_cell_db_path = os.path.join(args.dwf_results_root, "by_edge", predecessor_node_id, node_id, "make_registry_diff_db.sh", "registry_new_cellnames.db")
        _logger.debug("Ingesting new-cell database %r." % new_cell_db_path)
        with sqlite3.connect(new_cell_db_path) as new_cell_db_conn:
            new_cell_db_conn.row_factory = sqlite3.Row
            new_cell_db_cursor = new_cell_db_conn.cursor()
            new_cell_db_cursor.execute("""\
SELECT
  filename,
  cellname
FROM
  hive_analysis AS h,
  cell_analysis AS c
WHERE
  h.hive_id = c.hive_id
;""")
            rows_processed = 0
            num_terms_added = 0
            current_term_list = []
            for row in new_cell_db_cursor:
                if args.normalize:
                    cellpath = normalizer.normalize_path(row["filename"], row["cellname"])
                else:
                    cellpath = row["cellname"]
                #Filter according to n-gram and stop list interaction.
                if args.stop_list_n_gram_strategy == "raw_filter":
                    if cellpath in path_stop_list:
                        continue

                derived_terms = []
                if args.n_gram_length is None:
                    derived_terms.append(cellpath)
                else:
                    for n_gram in n_grammer.n_grams(cellpath, int(args.n_gram_length), args.last_n):
                        derived_terms.append(n_gram)

                for derived_term in derived_terms:
                    if args.stop_list_n_gram_strategy == "n_gram_blacklist":
                        if derived_term in term_stop_list:
                            continue
                    current_term_list.append(derived_term)
                    num_terms_added += 1
                rows_processed += 1
            sequence_term_lists[doc_name].append(current_term_list)
            _logger.debug("    Added %d terms to sequence term list, derived from %d cell paths." % (num_terms_added, rows_procced))

    meta_cursor.execute("DETACH DATABASE namedsequence;")
    meta_cursor.execute("DETACH DATABASE slice;")

    doc_names = sorted(sequence_term_lists.keys())

    #Assemble documents from change sets
    num_doc_names = len(doc_names)
    docs_to_ingest = [] #List of tuples: (Doc length, doc name, doc term list)
    for (doc_name_no, doc_name) in enumerate(doc_names):
        #Build glomming, union and/or intersection of documents.
        #(NOTE type looseness:  doc_summation is a list, the others are sets.)
        doc_union = set([])
        doc_intersection = None
        doc_summation = []
        previous_doc_len = None # Running state record for current document, not a reference to the prior document

        _logger.debug("Combining %d sequence term lists for document %r." % (len(sequence_term_lists[doc_name]), doc_name))
        for term_list in sequence_term_lists[doc_name]:
            if args.summation or args.sumint:
                doc_summation.extend(term_list)

            if args.union or args.inconsistent:
                tmpset = set(term_list)
                doc_union = doc_union.union(tmpset)

            #Accumulating pairwise is the cleanest way to write a fold
            #(i.e. reduce) over many-set intersection.
            if args.intersection or args.inconsistent or args.sumint:
                tmpset = set(term_list)
                if doc_intersection is None:
                    doc_intersection = tmpset
                else:
                    doc_intersection = doc_intersection.intersection(tmpset)
                if previous_doc_len is not None:
                    _logger.debug("    Remaining terms after intersection, doc_name %r: %d, down from %r." % (doc_name, len(doc_intersection), previous_doc_len))
                previous_doc_len = len(doc_intersection)

        #Quick hack: Type safety in case of empty term lists.
        if doc_intersection is None:
            doc_intersection = set([])

        #Pick document according to arguments' request.
        #The type of doc is list.
        if args.summation:
            doc = doc_summation
        elif args.union:
            doc = [term for term in doc_union]
        elif args.intersection:
            doc = [term for term in doc_intersection]
        elif args.inconsistent:
            doc = [term for term in (doc_union - doc_intersection)]
        elif args.sumint:
            doc = [term for term in doc_summation if term in doc_intersection]
        else:
            raise ValueError("Combinator parameter missing combination logic.")

        #Filter out stoplist terms according to strategy
        if args.stop_list_n_gram_strategy == "n_gram_threshold":
            doc_vector = collections.defaultdict(int)
            for term in doc:
                doc_vector[term] += 1
            for term in term_threshold.keys():
                if doc_vector.get(term) is None:
                    continue
                doc_vector[term] -= term_threshold[term]
            #Reset doc list
            doc = []
            for term in doc_vector:
                if doc_vector[term] > 0:
                    for x in range(doc_vector[term]):
                        doc.append(term)
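
            #Example: a term tallied 5 times against a threshold of 3
            #survives with multiplicity 2; a tally at or below its threshold
            #drops the term entirely.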

        _logger.debug("Combined document %r (%d terms)..." % (doc_name, len(doc)))

        docs_to_ingest.append((len(doc), doc_name, doc))

    #Ingest documents into TFIDF object
    #(It is substantially slower to ingest the bigger documents first.)
    doc_ingest_count = 0
    for (doc_no, (doc_len, doc_name, doc)) in enumerate(sorted(docs_to_ingest)):
        _logger.debug("Ingesting document %r to TFIDF data object (%d terms)..." % (doc_name, len(doc)))
        engine.ingest_document(doc_name, doc)
        doc_ingest_count += 1
        _logger.debug("Done ingesting document %d of %d." % (doc_no+1, num_doc_names))

    _logger.debug("Ingested %d documents." % doc_ingest_count)
    #This is not an error, it is just exceptional but tolerable behavior.
    #if doc_ingest_count == 0:
    #    raise Exception("This model failed to ingest any documents.")

    engine.save(args.pickle)