def main():
    conn = sqlite3.connect(args.out_db)
    conn.row_factory = sqlite3.Row
    rcursor = conn.cursor()
    wcursor = conn.cursor()

    _logger.debug("Attaching Registry state database %r." % args.rss_db)
    wcursor.execute("ATTACH DATABASE '%s' AS rss;" % args.rss_db)

    #Try to reduce seek times
    wcursor.execute("PRAGMA cache_size = 786432;") #3/4 GiB
    wcursor.execute("PRAGMA rss.cache_size = 1048576;") #1 GiB

    _logger.debug("Populating all-parent-path table...")
    wcursor.execute(SQL_CREATE_TEMP_CELL_PARENT)
    rcursor.execute("""\
SELECT
  filename,
  cellname,
  basename,
  type
FROM
  rss.cell_analysis AS c,
  rss.hive_analysis AS h
WHERE
  c.hive_id = h.hive_id
;""")
    for row in rcursor:
        cellpath = row["cellname"]
        if args.normalize:
            cellpath = normalizer.normalize_path(row["filename"], cellpath)
        if row["type"] == "root":
            #Root cell has no parent.
            parentpath = None
            basename = cellpath[1:] #Trim leading backslash
        else:
            basename = row["basename"]
            if basename is None:
                raise ValueError("Existence assumption violated: Null basename for cellpath %r, hive %r." % (row["cellname"], row["filename"]))
            #Trim trailing backslash as well
            parentpath = cellpath[0 : -len(basename) - 1]
        wcursor.execute("INSERT INTO temp_cell_parent(cellpath, basename, parentpath) VALUES (?,?,?);", (cellpath, basename, parentpath))
    _logger.debug("Done populating all-parent-path table.")

    _logger.debug("Sorting and counting cell parent metadata from temporary table...")
    wcursor.execute(SQL_CREATE_CELL_PARENT)
    _logger.debug("Done sorting and counting cell parent metadata from temporary table.")

    _logger.debug("Creating all-parent-path index...")
    wcursor.execute(SQL_CREATE_CELL_PARENT_INDEX)
    _logger.debug("Done creating all-parent-path index.")

    conn.commit()
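#The SQL_CREATE_* constants used by main() above are module-level definitions that
#are not included in this excerpt.  The block below is a minimal sketch of
#definitions consistent with how main() uses them: temp_cell_parent receives
#(cellpath, basename, parentpath) rows, cell_parent is derived from it by sorting
#and counting, and the index supports parent-path lookups.  The aggregation
#columns and the "tally" column name are assumptions, not the original schema.
SQL_CREATE_TEMP_CELL_PARENT = """\
CREATE TABLE temp_cell_parent (
  cellpath TEXT,
  basename TEXT,
  parentpath TEXT
);"""

SQL_CREATE_CELL_PARENT = """\
CREATE TABLE cell_parent AS
  SELECT
    parentpath,
    basename,
    cellpath,
    COUNT(*) AS tally
  FROM
    temp_cell_parent
  GROUP BY
    parentpath,
    basename,
    cellpath
  ORDER BY
    parentpath,
    basename
;"""

SQL_CREATE_CELL_PARENT_INDEX = """\
CREATE INDEX idx_cell_parent_parentpath
  ON cell_parent (parentpath);"""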
def _generate_cellpaths():
    nodecursor.execute("""\
SELECT
  filename,
  cellname
FROM
  hive_analysis AS h,
  cell_analysis AS c
WHERE
  h.hive_id = c.hive_id
;""")
    for noderow in nodecursor:
        if args.normalize:
            cellpath = normalizer.normalize_path(noderow["filename"], noderow["cellname"])
        else:
            cellpath = noderow["cellname"]
        yield (cellpath,)
def _generate_paths():
    rcursor.execute("""\
SELECT
  filename,
  cellname
FROM
  rss.hive_analysis AS ha,
  rss.cell_analysis AS ca
WHERE
  ha.hive_id = ca.hive_id
;""")
    for row in rcursor:
        if args.normalize:
            query_path = normalizer.normalize_path(row["filename"], row["cellname"])
        else:
            query_path = row["cellname"]
        yield (query_path,)
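#Hypothetical usage sketch (not from the original scripts): both generators yield
#1-tuples, which is the row shape sqlite3's Cursor.executemany() expects, so they
#can feed a bulk INSERT directly.  The table and column names below ("cell_paths",
#"cellpath") are illustrative assumptions.
def _load_cellpaths_example(wcursor):
    wcursor.execute("CREATE TABLE IF NOT EXISTS cell_paths (cellpath TEXT);")
    wcursor.executemany("INSERT INTO cell_paths (cellpath) VALUES (?);", _generate_cellpaths())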
def main():
    path_xp32_0 = args.dwf_node_results_dir + "/234-1-234-1-150/format_registry_single_state.sh/registry_single_state.db"
    path_xp32_1 = args.dwf_node_results_dir + "/11331-2-11331-2-90/format_registry_single_state.sh/registry_single_state.db"
    path_vista32 = args.dwf_node_results_dir + "/8504-1-8504-1-90/format_registry_single_state.sh/registry_single_state.db"
    path_vista64 = args.dwf_node_results_dir + "/8504-2-8504-2-90/format_registry_single_state.sh/registry_single_state.db"
    path_732 = args.dwf_node_results_dir + "/9480-1-9480-1-150/format_registry_single_state.sh/registry_single_state.db"
    path_764 = args.dwf_node_results_dir + "/9480-2-9480-2-150/format_registry_single_state.sh/registry_single_state.db"
    path_832 = args.dwf_node_results_dir + "/14694-1-14694-1-60/format_registry_single_state.sh/registry_single_state.db"
    path_864 = args.dwf_node_results_dir + "/14694-2-14694-2-50/format_registry_single_state.sh/registry_single_state.db"

    _logger.debug("Inspecting path: %r." % path_xp32_0)
    assert os.path.exists(path_xp32_0)
    assert os.path.exists(path_xp32_1)
    _logger.debug("Inspecting path: %r." % path_vista32)
    assert os.path.exists(path_vista32)
    assert os.path.exists(path_vista64)
    assert os.path.exists(path_732)
    assert os.path.exists(path_764)
    assert os.path.exists(path_832)
    assert os.path.exists(path_864)

    conns = collections.OrderedDict()
    conns["XP (1)"] = sqlite3.connect(path_xp32_0)
    conns["XP (2)"] = sqlite3.connect(path_xp32_1)
    conns["Vista-32"] = sqlite3.connect(path_vista32)
    conns["Vista-64"] = sqlite3.connect(path_vista64)
    conns["7-32"] = sqlite3.connect(path_732)
    conns["7-64"] = sqlite3.connect(path_764)
    conns["8-32"] = sqlite3.connect(path_832)
    conns["8-64"] = sqlite3.connect(path_864)

    cursors = dict()
    cellsets = dict()
    root_cell_metadata = []

    for osname in conns:
        logging.debug("Ingesting " + repr(osname) + ".")
        #conns[osname].isolation_level = "EXCLUSIVE"
        conns[osname].row_factory = sqlite3.Row
        cursors[osname] = conns[osname].cursor()
        cellsets[osname] = set()
        cursors[osname].execute("SELECT ca.cellname, ca.type, ha.filename FROM cell_analysis AS ca, hive_analysis AS ha WHERE ca.hive_id = ha.hive_id;")
        prefixes_nagged = set()
        for row in cursors[osname]:
            #Record the root cell paths for a normalizing aid report
            if row["type"] == "root":
                root_cell_metadata.append((osname, row["filename"], row["cellname"]))
            #Record the cell paths for overlap measurements
            if args.normalize:
                cellpath = normalizer.normalize_path(row["filename"], row["cellname"])
                #Record failures to normalize
                if cellpath == row["cellname"]:
                    prefix = normalizer.extract_prefix(cellpath)
                    if prefix not in prefixes_nagged:
                        prefixes_nagged.add(prefix)
                        logging.warning("This prefix failed to normalize: %r." % prefix)
                        logging.info("The image being ingested is: %r." % osname)
                        logging.info("It came from the hive at this path: %r." % row["filename"])
            else:
                cellpath = row["cellname"]
            cellsets[osname].add(cellpath)
        logging.debug("Ingested %d cell paths." % len(cellsets[osname]))

    win7_overlaps = set()
    thetable = dict()
    for osnamea in conns:
        for osnameb in conns:
            if osnamea == osnameb:
                thetally = len(cellsets[osnamea])
            else:
                the_overlaps = set.intersection(cellsets[osnamea], cellsets[osnameb])
                if set([osnamea, osnameb]) == set(["7-32", "7-64"]):
                    win7_overlaps.update(the_overlaps)
                thetally = len(the_overlaps)
            thetable[(osnamea, osnameb)] = thetally
            thetable[(osnameb, osnamea)] = thetally
    _logger.debug("The table: %r." % thetable)

    osnames = [osname for osname in conns]

    with open(args.out_pickle, "wb") as out_fh:
        pickler = pickle.Pickler(out_fh)
        #Open sqlite3 Connection objects cannot be pickled, so only the picklable
        #summary data (image names and the pairwise overlap table) are written out.
        out_dict = {
          "osnames": osnames,
          "thetable": thetable
        }
        pickler.dump(out_dict)
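#Hypothetical companion sketch (not part of the original script): the pickle
#written by main() above can be reloaded and rendered as a symmetric matrix of
#pairwise cell-path overlap counts, since "thetable" maps (osnamea, osnameb)
#pairs to tallies and the diagonal entries are the per-image cell-path counts.
#The default file name "overlap.pickle" is an assumption.
import pickle

def print_overlap_table(pickle_path="overlap.pickle"):
    with open(pickle_path, "rb") as in_fh:
        in_dict = pickle.Unpickler(in_fh).load()
    osnames = in_dict["osnames"]
    thetable = in_dict["thetable"]
    #Header row, then one row of tallies per image.
    print("\t" + "\t".join(osnames))
    for osnamea in osnames:
        cells = [str(thetable[(osnamea, osnameb)]) for osnameb in osnames]
        print(osnamea + "\t" + "\t".join(cells))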
def main():
    global args

    if not args.cell_parent_pickle is None:
        _logger.debug("Loading cell parent map...")
        with open(args.cell_parent_pickle, "rb") as cp_fh:
            unpickler = pickle.Unpickler(cp_fh)
            n_grammer.parent_map = unpickler.load()
        _logger.debug("Done loading cell parent map.")

    meta_conn = sqlite3.connect(":memory:")
    meta_conn.row_factory = sqlite3.Row
    meta_cursor = meta_conn.cursor()

    meta_cursor.execute("ATTACH DATABASE '%s' AS namedsequence;" % args.namedsequence_db)
    meta_cursor.execute("ATTACH DATABASE '%s' AS slice;" % args.slice_db)

    #Try to reduce seek times
    #cursor.execute("PRAGMA cache_size = 2097152;") #2 GiB

    engine = TFIDFEngine.BasicTFIDFEngine()

    #Load stop list (note that path normalizing has already occurred before load time)
    (path_stop_list, term_stop_list, term_threshold) = stop_list_structs(args.stop_list_db, args.stop_list_n_gram_strategy)

    #Key: Document name.
    #Value: List of term lists (lists because terms can repeat in a single change set).
    sequence_term_lists = collections.defaultdict(list)

    #Assemble the lists of documents that should exist.
    #This will require the namedsequence and slice tables (slice table for slice types).
    #(NOTE: The query excludes beginning-of-sequence nodes, because their predecessors are NULL and thus won't match in a join on equality.  This is intentional behavior.)
    meta_cursor.execute("""\
SELECT
  ns.*,
  s.slicetype
FROM
  namedsequence.namedsequence AS ns,
  slice.slice AS s
WHERE
  ns.osetid = s.osetid AND
  ns.appetid = s.appetid AND
  ns.sliceid = s.sliceid AND
  ns.sequencelabel LIKE '""" + args.prefix + """-%'
;""")
    for meta_row in meta_cursor:
        node_id = "%(osetid)s-%(appetid)s-%(sliceid)d" % meta_row
        predecessor_node_id = "%(predecessor_osetid)s-%(predecessor_appetid)s-%(predecessor_sliceid)d" % meta_row
        if args.by_app:
            doc_name = "%(appetid)s/%(slicetype)s" % meta_row
        elif args.by_osapp:
            doc_name = "%(osetid)s/%(appetid)s/%(slicetype)s" % meta_row
        else:
            raise NotImplementedError("Only --by-app and --by-osapp are implemented.")

        new_cell_db_path = os.path.join(args.dwf_results_root, "by_edge", predecessor_node_id, node_id, "make_registry_diff_db.sh", "registry_new_cellnames.db")
        _logger.debug("Ingesting new-cell database %r." % new_cell_db_path)
        with sqlite3.connect(new_cell_db_path) as new_cell_db_conn:
            new_cell_db_conn.row_factory = sqlite3.Row
            new_cell_db_cursor = new_cell_db_conn.cursor()
            new_cell_db_cursor.execute("""\
SELECT
  filename,
  cellname
FROM
  hive_analysis AS h,
  cell_analysis AS c
WHERE
  h.hive_id = c.hive_id
;""")
            rows_procced = 0
            num_terms_added = 0
            current_term_list = []
            for row in new_cell_db_cursor:
                if args.normalize:
                    cellpath = normalizer.normalize_path(row["filename"], row["cellname"])
                else:
                    cellpath = row["cellname"]

                #Filter according to n-gram and stop list interaction.
                if args.stop_list_n_gram_strategy == "raw_filter":
                    if cellpath in path_stop_list:
                        continue

                derived_terms = []
                if args.n_gram_length is None:
                    derived_terms.append(cellpath)
                else:
                    for n_gram in n_grammer.n_grams(cellpath, int(args.n_gram_length), args.last_n):
                        derived_terms.append(n_gram)

                for derived_term in derived_terms:
                    if args.stop_list_n_gram_strategy == "n_gram_blacklist":
                        if derived_term in term_stop_list:
                            continue
                    current_term_list.append(derived_term)
                    num_terms_added += 1
                rows_procced += 1
            sequence_term_lists[doc_name].append(current_term_list)
            _logger.debug(" Added %d terms to sequence term list, derived from %d cell paths." % (num_terms_added, rows_procced))

    meta_cursor.execute("DETACH DATABASE namedsequence;")
    meta_cursor.execute("DETACH DATABASE slice;")

    #This seems to be the cleanest way to write a fold (e.g. reduce) call for many-set intersection.
    doc_names = sorted(sequence_term_lists.keys())

    #Assemble documents from change sets
    num_doc_names = len(doc_names)
    docs_to_ingest = [] #List of tuples: (Doc length, doc name, doc term list)
    for (doc_name_no, doc_name) in enumerate(doc_names):
        #Build glomming, union and/or intersection of documents.
        #(NOTE type looseness: doc_summation is a list, the others are sets.)
        doc_union = set([])
        doc_intersection = None
        doc_summation = []
        previous_doc_len = None #Running state record for the current document, not a reference to the prior document
        _logger.debug("Combining %d sequence term lists for document %r." % (len(sequence_term_lists[doc_name]), doc_name))
        for term_list in sequence_term_lists[doc_name]:
            if args.summation or args.sumint:
                doc_summation.extend(term_list)
            if args.union or args.inconsistent:
                tmpset = set(term_list)
                doc_union = doc_union.union(tmpset)
            if args.intersection or args.inconsistent or args.sumint:
                tmpset = set(term_list)
                if doc_intersection is None:
                    doc_intersection = tmpset
                else:
                    doc_intersection = doc_intersection.intersection(tmpset)
                    if not previous_doc_len is None:
                        _logger.debug(" Remaining terms after intersection, doc_name %r: %d, down from %r." % (doc_name, len(doc_intersection), previous_doc_len))
                previous_doc_len = len(doc_intersection)

        #Quick hack: Type safety in case of empty term lists.
        if doc_intersection is None:
            doc_intersection = set([])

        #Pick document according to arguments' request.
        #The type of doc is list.
        if args.summation:
            doc = doc_summation
        elif args.union:
            doc = [term for term in doc_union]
        elif args.intersection:
            doc = [term for term in doc_intersection]
        elif args.inconsistent:
            doc = [term for term in (doc_union - doc_intersection)]
        elif args.sumint:
            doc = [term for term in doc_summation if term in doc_intersection]
        else:
            raise ValueError("Combinator parameter missing combination logic.")

        #Filter out stoplist terms according to strategy
        if args.stop_list_n_gram_strategy == "n_gram_threshold":
            doc_vector = collections.defaultdict(int)
            for term in doc:
                doc_vector[term] += 1
            for term in term_threshold.keys():
                if doc_vector.get(term) is None:
                    continue
                doc_vector[term] -= term_threshold[term]
            #Reset doc list
            doc = []
            for term in doc_vector:
                if doc_vector[term] > 0:
                    for x in range(doc_vector[term]):
                        doc.append(term)

        _logger.debug("Combined document %r (%d terms)..." % (doc_name, len(doc)))
        docs_to_ingest.append((len(doc), doc_name, doc))

    #Ingest documents into TFIDF object
    #(It is substantially slower to ingest the bigger documents first.)
    doc_ingest_count = 0
    for (doc_no, (doc_len, doc_name, doc)) in enumerate(sorted(docs_to_ingest)):
        _logger.debug("Ingesting document %r to TFIDF data object (%d terms)..." % (doc_name, len(doc)))
        engine.ingest_document(doc_name, doc)
        doc_ingest_count += 1
        _logger.debug("Done ingesting document %d of %d." % (doc_no + 1, num_doc_names))
    _logger.debug("Ingested %d documents." % doc_ingest_count)

    #This is not an error, it is just exceptional but tolerable behavior.
    #if doc_ingest_count == 0:
    #    raise Exception("This model failed to ingest any documents.")

    engine.save(args.pickle)
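#The n_grammer module used above is project-internal and its implementation is
#not included in this excerpt.  The function below is a rough, hypothetical
#sketch of the shape of n_grammer.n_grams(cellpath, n, last_n): it assumes cell
#paths are backslash-delimited Registry key paths, that each n-gram is a run of
#n consecutive path components re-joined with backslashes, and that last_n
#restricts output to the trailing run.  The real module also consults a
#parent_map attribute (loaded from args.cell_parent_pickle above), which this
#sketch ignores.
def n_grams_sketch(cellpath, n, last_n=False):
    components = cellpath.strip("\\").split("\\")
    if len(components) < n:
        return
    start_indices = list(range(len(components) - n + 1))
    if last_n:
        #Keep only the final n-gram of the path.
        start_indices = start_indices[-1:]
    for i in start_indices:
        yield "\\".join(components[i:i + n])

#Example: list(n_grams_sketch("\\Software\\Microsoft\\Windows", 2)) would yield
#["Software\\Microsoft", "Microsoft\\Windows"].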