Example #1
0
def construct_subhash_vectors(fname, dup_map):
    """Collect the checksums per file, substituting numeric ids (fno, hno)
    for text values.

    Each line of ``fname`` is parsed with ``parse_md5deep_subfile_entry``
    into a ``(checksum, file name)`` pair.  Consecutive lines that carry the
    same file name form one group; every group is handed to
    ``construct_vector`` and kept only when that call returns a truthy value.

    Both mapping tables (``FnameMap`` and ``ChecksumMap``) are reset before
    the file is read.

    Args:
        fname: path of the listing to read.
        dup_map: container of file names to skip; passed through unchanged
            to ``construct_vector``.

    Returns:
        A list of the vectors accepted by ``construct_vector``, in the order
        their groups appear in the file.  An empty input file yields ``[]``.
    """
    result = []
    FnameMap.reset()        # initialize mapping tables
    ChecksumMap.reset()

    last_name = ""
    hash_set = []
    # The context manager closes the file even if parsing raises.
    with open(fname) as fd:
        for text in fd:
            (val, name) = parse_md5deep_subfile_entry(text)

            if name != last_name:
                # The file name changed: flush the group collected so far.
                vec = construct_vector(last_name, hash_set, dup_map)
                if vec:
                    result.append(vec)
                last_name = name
                hash_set = []

            hash_set.append(val)

    # Flush the final group.  ``last_name`` always equals the most recently
    # parsed name, and stays "" when the file had no lines at all, so this
    # is safe for empty input (construct_vector rejects the "" name).
    vec = construct_vector(last_name, hash_set, dup_map)
    if vec:
        result.append(vec)
    return result
Example #2
0
def build_graph_from_vectors(vector_set, show_subgraph=False):
    """Create the top-level graph from a set of vectors.

    Every ``(fno, hset)`` pair in ``vector_set`` contributes one node for
    the file (attribute ``bipartite=0``), one node per checksum in ``hset``
    (attribute ``bipartite=1``) and an edge between the file node and each
    of its checksum nodes.  Node keys are the values returned by
    ``FnameMap.encode`` and ``ChecksumMap.encode``.

    Args:
        vector_set: iterable of ``(fno, hset)`` pairs, where ``hset`` is an
            iterable of checksum ids.
        show_subgraph: accepted for interface compatibility; not used by
            this function.

    Returns:
        The populated ``nx.Graph``.
    """
    B = nx.Graph()
    for fno, hset in vector_set:
        file_node = FnameMap.encode(fno)
        B.add_node(file_node, bipartite=0)
        for hno in hset:
            hash_node = ChecksumMap.encode(hno)
            # Test membership with the encoded key: that is the key the node
            # is stored under.  Testing the raw id could skip add_node and
            # let add_edge create the node without its bipartite attribute.
            if hash_node not in B:
                B.add_node(hash_node, bipartite=1)
            B.add_edge(file_node, hash_node)
    return B
Example #3
0
def construct_vector(name, hash_set, dup_map):
    """Build the ``[file id, [checksum ids]]`` vector for one file.

    Returns ``False`` instead of a vector when the entry is to be skipped:
    the name is empty (no file), the name is present in ``dup_map``
    (duplicate), or ``hash_set`` holds fewer than two values (empty or
    singleton).  Otherwise the ids are obtained from ``FnameMap.get_id``
    and ``ChecksumMap.get_id``.
    """
    # Guards are evaluated in order: no file, duplicate, too few checksums.
    if name == "" or name in dup_map or len(hash_set) < 2:
        return False

    file_id = FnameMap.get_id(name)
    checksum_ids = [ChecksumMap.get_id(checksum) for checksum in hash_set]
    return [file_id, checksum_ids]
Example #4
0
def resolve_file_names(files):
    """Return a list with ``FnameMap.get_name_using_encoded_id`` applied to
    each element of ``files``, in the same order."""
    names = []
    for encoded_id in files:
        names.append(FnameMap.get_name_using_encoded_id(encoded_id))
    return names