Ejemplo n.º 1
0
def construct_subhash_vectors(fname, dup_map):
    """Collect the set of checksums per file, substituting numeric ids
    (fno, hno) for the text values.

    Reads md5deep sub-file entries from *fname* — one (val, name) record
    per line, records for the same file appearing on consecutive lines —
    and returns a list of [fno, [hno, ...]] vectors as produced by
    construct_vector() (entries that construct_vector skips are dropped).
    """
    result = []
    FnameMap.reset()        # initialize mapping tables
    ChecksumMap.reset()

    last_name = ""
    hash_set = []
    # context manager guarantees the file is closed even on an exception
    # part-way through parsing (the original leaked the handle then)
    with open(fname) as fd:
        for text in fd:
            (val, name) = parse_md5deep_subfile_entry(text)

            if name != last_name:
                # a new file starts: flush the checksums accumulated
                # for the previous file
                vec = construct_vector(last_name, hash_set, dup_map)
                if vec:
                    result.append(vec)
                last_name = name
                hash_set = []

            hash_set.append(val)

    # flush the trailing group; using last_name (not the loop variable
    # `name`) avoids a NameError when the input file is empty, and for an
    # empty file last_name == "" which construct_vector skips anyway
    vec = construct_vector(last_name, hash_set, dup_map)
    if vec:
        result.append(vec)
    return result
Ejemplo n.º 2
0
def build_graph_from_vectors(vector_set, show_subgraph=False):
    """Create the top-level bipartite graph from a set of vectors.

    File-name nodes get bipartite=0 and checksum nodes bipartite=1;
    an edge links a file to each checksum it contains.

    NOTE(review): show_subgraph is currently unused — kept for
    interface compatibility.
    """
    B = nx.Graph()
    for fno, hset in vector_set:
        fnode = FnameMap.encode(fno)
        B.add_node(fnode, bipartite=0)
        for hno in hset:
            # bug fix: the graph stores ENCODED ids as node keys, so the
            # membership test must use the encoded id — the original
            # tested the raw hno, making the guard ineffective
            hnode = ChecksumMap.encode(hno)
            if hnode not in B:
                B.add_node(hnode, bipartite=1)
            B.add_edge(fnode, hnode)
    return B
Ejemplo n.º 3
0
def construct_vector(name, hash_set, dup_map):
    """Build a [file-id, [checksum-id, ...]] vector for one file.

    Returns False (signalling "skip this entry") when the name is empty,
    the name is a known duplicate, or fewer than two checksums were
    collected; otherwise returns the encoded vector.
    """
    skip = (name == ""              # no file seen yet
            or name in dup_map      # duplicate file
            or len(hash_set) < 2)   # empty or singleton checksum set
    if skip:
        return False

    hnos = [ChecksumMap.get_id(hval) for hval in hash_set]
    return [FnameMap.get_id(name), hnos]
Ejemplo n.º 4
0
def prune_vectors(vector_set, min_blocks):
    """Keep only vectors that retain at least *min_blocks* shared checksums.

    A checksum counts as "shared" when ChecksumMap reports it occurring
    more than once. Unshared checksums are dropped from each vector, and
    any vector left with fewer than min_blocks checksums is discarded.
    (The original docstring claimed "at least 1 shared checksum", but the
    threshold has always been min_blocks.)
    """
    result = []
    for fno, hset in vector_set:
        shared = [hno for hno in hset if ChecksumMap.get_count(hno) > 1]
        if len(shared) >= min_blocks:
            result.append([fno, shared])
    return result
Ejemplo n.º 5
0
def find_conflicting_checksums(csums, graph):
    """Find block checksums that map to the same file region.

    Returns a 3-tuple:
      compatible  -- checksums whose region is claimed by no other checksum
      conflicting -- flat list of checksums sharing a region with another
      ranges      -- {region: [checksums]} for the contested regions only

    NOTE(review): *graph* is currently unused — kept for interface
    compatibility.
    """
    # local import: the file-level import block is not visible from here
    from itertools import chain

    # group checksums by the file region they claim
    range_sets = {}
    for hno in csums:
        range_val = ChecksumMap.get_range_using_encoded_id(hno)
        range_sets.setdefault(range_val, []).append(hno)

    compatible = [hnos[0] for hnos in range_sets.values()
                  if len(hnos) == 1]
    # chain.from_iterable flattens the multi-entry buckets in one linear
    # pass, replacing the quadratic sum(list_of_lists, []) idiom
    conflicting = list(chain.from_iterable(
        hnos for hnos in range_sets.values() if len(hnos) > 1))
    ranges = {rng: hnos for rng, hnos in range_sets.items()
              if len(hnos) > 1}
    return compatible, conflicting, ranges
Ejemplo n.º 6
0
def resolve_csums(csums):
    """Translate each encoded checksum id back to its original hash value."""
    resolved = []
    for encoded_id in csums:
        resolved.append(ChecksumMap.get_hval_using_encoded_id(encoded_id))
    return resolved