Example #1
import collections

def scrutinize(filelist, options):
    # Map each fingerprint hash to the files that contain it, and each file
    # to the list of fingerprints drawn from it.
    fingerprints = collections.defaultdict(list)
    documents = collections.defaultdict(list)

    for filename in filelist:
        data = normalizeFileLines(filename)
        #print(filename, file=sys.stderr)
        for fprint in winnowing(
                kgrams(
                    tokenize(options.language, data, options.comments,
                             options.endlines, options.whitespace,
                             options.text),
                    int(options.size)),
                int(options.window)):
            documents[filename].append(fprint)
            fingerprints[fprint.hash].append(filename)
    for document, fprints in documents.items():
        matches = collections.defaultdict(int)
        for fprint in fprints:
            for matching in fingerprints[fprint.hash]:
                matches[matching] += 1
        tmp = []
        for key, val in sorted(matches.items()):
            # Report other files that share more than 10% of this document's fingerprints.
            if val > len(fprints) * 0.1 and key != document:
                tmp.append((key, val))
        if tmp:
            print(document, ":", len(fprints))
            for key, val in tmp:
                print('   ', key, val)
Example #2
import collections

def scrutinize(filelist, options):
    fingerprints = collections.defaultdict(list)
    documents = collections.defaultdict(list)

    for filename in filelist:
        data = normalizeFileLines(filename)
        #print(filename, file=sys.stderr)
        for fprint in winnowing(
                kgrams(
                    tokenize(options.language, data, options.comments,
                             options.endlines, options.whitespace,
                             options.text), int(options.size)),
                int(options.window)):
            documents[filename].append(fprint)
            fingerprints[fprint.hash].append(filename)
    for document, fprints in documents.items():
        matches = collections.defaultdict(int)
        for fprint in fprints:
            for matching in fingerprints[fprint.hash]:
                matches[matching] += 1
        tmp = []
        for key, val in sorted(matches.items()):
            if val > len(fprints) * 0.1 and key != document:
                tmp.append((key, val))
        if tmp:
            print(document, ":", len(fprints))
            for key, val in tmp:
                print('   ', key, val)
Example #3
def examine(filename, options):
    fingerprints = list()

    data = normalizeFileLines(filename)
    for fprint in winnowing(
            kgrams(
                tokenize(options.language, data, options.comments,
                         options.endlines, options.whitespace, options.text),
                int(options.size)), int(options.window)):
        # Add all the fingerprints to a list and return it.
        fingerprints.append(fprint)
    return fingerprints
Example #4
def examine(filename, options):
    fingerprints = list()

    data = normalizeFileLines(filename)
    for fprint in winnowing(
            kgrams(
                tokenize(options.language, data, options.comments,
                         options.endlines, options.whitespace, options.text),
                int(options.size)),
            int(options.window)):
        # Add all the fingerprints to a list and return it.
        fingerprints.append(fprint)
    return fingerprints
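
The helpers these examples rely on (normalizeFileLines, tokenize, kgrams, winnowing) come from the surrounding project and are not shown above. The snippet below is a minimal sketch of what kgrams and winnowing presumably do, following the standard winnowing scheme (hash every k-gram, then keep the smallest hash in each sliding window of hashes). Only the call signatures and the .hash attribute are taken from the examples; the Fingerprint type, its position field, and the use of SHA-1 are assumptions made here for illustration.

import collections
import hashlib

# Hypothetical stand-in for the fingerprint objects the examples iterate over;
# only the .hash attribute is implied by the code above.
Fingerprint = collections.namedtuple('Fingerprint', ['hash', 'position'])

def kgrams(tokens, k):
    # Yield (position, k-gram) pairs over a token stream (assumed to be strings).
    tokens = list(tokens)
    for i in range(max(len(tokens) - k + 1, 0)):
        yield i, ''.join(tokens[i:i + k])

def winnowing(grams, window):
    # Hash each k-gram, then keep the minimum hash from every window of hashes.
    hashes = [(int(hashlib.sha1(g.encode()).hexdigest(), 16) & 0xFFFFFFFF, pos)
              for pos, g in grams]
    if not hashes:
        return []
    selected = set()
    for start in range(max(len(hashes) - window + 1, 1)):
        # min() compares (hash, position) tuples, so ties fall to the earlier position.
        selected.add(min(hashes[start:start + window]))
    return [Fingerprint(hash=h, position=p)
            for h, p in sorted(selected, key=lambda t: t[1])]

With helpers of this shape, examine() returns one fingerprint list per file, and scrutinize() flags pairs of files whose fingerprint hashes overlap heavily.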