def scrutinize(filelist, options):
    """Print, for each file, the other files sharing many of its fingerprints.

    Every file in ``filelist`` is read with ``normalizeFileLines`` and pushed
    through ``tokenize`` -> ``kgrams`` -> ``winnowing``. Each resulting
    fingerprint is recorded both per file and per ``fprint.hash``.

    For every file that produced fingerprints, the function then counts how
    often each file name is registered under one of that file's fingerprint
    hashes. Any *other* file whose count is strictly greater than 10% of the
    file's fingerprint count is printed beneath a header line of the form
    ``<file> : <fingerprint count>``.

    Args:
        filelist: Iterable of file names handed to ``normalizeFileLines``.
        options: Object providing ``language``, ``comments``, ``endlines``,
            ``whitespace``, ``text``, ``size`` and ``window``. ``size`` and
            ``window`` are converted with ``int()`` before use.

    Returns:
        None. Results are written to standard output.
    """
    files_by_hash = collections.defaultdict(list)
    fprints_by_file = collections.defaultdict(list)

    # Pass 1: fingerprint every file and index the results both ways.
    for path in filelist:
        data = normalizeFileLines(path)
        tokens = tokenize(options.language, data, options.comments,
                          options.endlines, options.whitespace, options.text)
        grams = kgrams(tokens, int(options.size))
        for fp in winnowing(grams, int(options.window)):
            fprints_by_file[path].append(fp)
            files_by_hash[fp.hash].append(path)

    # Pass 2: tally hash collisions per file and report the significant ones.
    for document, fprints in fprints_by_file.items():
        tally = collections.Counter(
            other for fp in fprints for other in files_by_hash[fp.hash]
        )
        threshold = len(fprints) * 0.1
        hits = [
            (name, count)
            for name, count in sorted(tally.items())
            if count > threshold and name != document
        ]
        if hits:
            print(document, ":", len(fprints))
            for name, count in hits:
                print(' ', name, count)
def scrutinize(filelist, options):
    """Cross-compare the fingerprints of several files and print overlaps.

    Each file named in ``filelist`` is normalised with ``normalizeFileLines``,
    tokenised with ``tokenize``, split by ``kgrams`` and reduced by
    ``winnowing``. The fingerprints are stored per file, and each file name is
    stored under the ``hash`` attribute of every fingerprint it produced.

    Afterwards, for each fingerprinted file, the number of registrations of
    every file name under that file's fingerprint hashes is counted. Files
    other than the one being inspected are listed when their count exceeds
    ``0.1 * len(fingerprints of the inspected file)``.

    Args:
        filelist: Iterable of file names to analyse.
        options: Settings object; the attributes ``language``, ``comments``,
            ``endlines``, ``whitespace``, ``text``, ``size`` and ``window``
            are read. ``size`` and ``window`` go through ``int()``.

    Returns:
        None. The report is printed to standard output.
    """
    hash_index = collections.defaultdict(list)
    per_file = collections.defaultdict(list)

    for source_name in filelist:
        contents = normalizeFileLines(source_name)
        stream = winnowing(
            kgrams(
                tokenize(
                    options.language,
                    contents,
                    options.comments,
                    options.endlines,
                    options.whitespace,
                    options.text,
                ),
                int(options.size),
            ),
            int(options.window),
        )
        for mark in stream:
            per_file[source_name].append(mark)
            hash_index[mark.hash].append(source_name)

    for document, marks in per_file.items():
        shared = collections.defaultdict(int)
        for mark in marks:
            for owner in hash_index[mark.hash]:
                shared[owner] += 1

        # Keep only other files above the 10% cut-off, in sorted name order.
        cutoff = len(marks) * 0.1
        reportable = [
            pair
            for pair in sorted(shared.items())
            if pair[1] > cutoff and pair[0] != document
        ]
        if not reportable:
            continue

        print(document, ":", len(marks))
        for owner, total in reportable:
            print(' ', owner, total)
def examine(filename, options):
    """Return the fingerprints of a single file as a list.

    The file is read with ``normalizeFileLines`` and its contents are passed
    through ``tokenize``, ``kgrams`` and ``winnowing``. Everything
    ``winnowing`` yields is collected in order.

    Args:
        filename: Name of the file handed to ``normalizeFileLines``.
        options: Object providing ``language``, ``comments``, ``endlines``,
            ``whitespace``, ``text``, ``size`` and ``window``. ``size`` and
            ``window`` are converted with ``int()`` before use.

    Returns:
        A new list holding every item produced by ``winnowing``.
    """
    data = normalizeFileLines(filename)
    tokens = tokenize(options.language, data, options.comments,
                      options.endlines, options.whitespace, options.text)
    grams = kgrams(tokens, int(options.size))
    return list(winnowing(grams, int(options.window)))
def examine(filename, options):
    """Compute and return the list of fingerprints for ``filename``.

    Args:
        filename: File name passed to ``normalizeFileLines``.
        options: Settings object; ``language``, ``comments``, ``endlines``,
            ``whitespace`` and ``text`` are forwarded to ``tokenize``,
            ``int(options.size)`` to ``kgrams`` and ``int(options.window)``
            to ``winnowing``.

    Returns:
        A list with the items yielded by ``winnowing``, in iteration order.
    """
    contents = normalizeFileLines(filename)
    collected = []
    # Drain the whole winnowing pipeline into the result list in one step.
    collected.extend(
        winnowing(
            kgrams(
                tokenize(
                    options.language,
                    contents,
                    options.comments,
                    options.endlines,
                    options.whitespace,
                    options.text,
                ),
                int(options.size),
            ),
            int(options.window),
        )
    )
    return collected