def construct_subhash_vectors(fname, dup_map):
    """Collect the set of checksums per file from an md5deep sub-file listing,
    substituting numeric ids (fno, hno) for the text values.

    Parameters:
        fname: path to the md5deep output file; each line is parsed with
            parse_md5deep_subfile_entry() into a (checksum, filename) pair.
            Lines for the same file are assumed to be contiguous.
        dup_map: mapping of known-duplicate file names to skip (consulted
            by construct_vector()).

    Returns:
        A list of [fno, [hno, ...]] vectors, one per file that survives
        the filters in construct_vector().

    Side effects:
        Resets the module-level FnameMap and ChecksumMap id tables.
    """
    result = []
    FnameMap.reset()  # initialize mapping tables
    ChecksumMap.reset()
    last_name = ""
    hash_set = []
    # `with` guarantees the file is closed even if a parse error is raised
    # (the original open()/close() pair leaked the handle on exceptions).
    with open(fname) as fd:
        for text in fd:
            (val, name) = parse_md5deep_subfile_entry(text)
            if name != last_name:
                # A new file has started: flush the accumulated group.
                vec = construct_vector(last_name, hash_set, dup_map)
                if vec:
                    result.append(vec)
                last_name = name
                hash_set = []
            hash_set.append(val)
    # Flush the final group.  NOTE: uses last_name rather than the loop
    # variable `name` so an empty input file no longer raises NameError;
    # after any iteration last_name == name, so behavior is unchanged.
    vec = construct_vector(last_name, hash_set, dup_map)
    if vec:
        result.append(vec)
    return result
def build_graph_from_vectors(vector_set, show_subgraph=False):
    """Create the top-level bipartite graph from a set of vectors.

    Parameters:
        vector_set: iterable of [fno, hset] pairs as produced by
            construct_subhash_vectors() — a file id and its checksum ids.
        show_subgraph: accepted for interface compatibility; not used here.

    Returns:
        An nx.Graph with file nodes (bipartite=0) connected to checksum
        nodes (bipartite=1); both node kinds are stored in encoded form.
    """
    B = nx.Graph()
    for fno, hset in vector_set:
        file_node = FnameMap.encode(fno)  # hoisted: invariant per inner loop
        B.add_node(file_node, bipartite=0)
        for hno in hset:
            hash_node = ChecksumMap.encode(hno)
            # Fixed: the original tested `hno not in B` (the raw id), but
            # nodes are stored encoded, so the check never matched and the
            # code relied on add_node() being idempotent.
            if hash_node not in B:
                B.add_node(hash_node, bipartite=1)
            B.add_edge(file_node, hash_node)
    return B
def construct_vector(name, hash_set, dup_map):
    """Build a [file-id, [checksum-id, ...]] vector for one file.

    Returns False (so callers can test truthiness) when the entry should
    be skipped: no file name, a known duplicate, or fewer than two
    checksums (empty or singleton groups carry no clustering signal).
    """
    skip = (
        name == ""          # no file
        or name in dup_map  # duplicate
        or len(hash_set) < 2  # empty or singleton
    )
    if skip:
        return False
    checksum_ids = [ChecksumMap.get_id(hval) for hval in hash_set]
    return [FnameMap.get_id(name), checksum_ids]
def resolve_file_names(files):
    """Translate a sequence of encoded file ids back into file names.

    Returns a list of names, one per entry in `files`, looked up through
    the FnameMap id table.
    """
    return [FnameMap.get_name_using_encoded_id(encoded) for encoded in files]