Example #1
def scrutinize(filelist, options):
    fingerprints = collections.defaultdict(list)
    documents = collections.defaultdict(list)

    for filename in filelist:
        data = normalizeFileLines(filename)
        #print(filename, file=sys.stderr)
        for fprint in winnowing(kgrams(tokenize(options.language, data,
                                                      options.comments,
                                                      options.endlines,
                                                      options.whitespace,
                                                      options.text),
                                    int(options.size)),
                                    int(options.window)):
            documents[filename].append(fprint)
            fingerprints[fprint.hash].append(filename)
    # For every document, count how many of its fingerprints also occur in
    # each other file, then report the files that share more than 10% of them.
    for document, fprints in documents.items():
        matches = collections.defaultdict(int)
        for fprint in fprints:
            for matching in fingerprints[fprint.hash]:
                matches[matching] += 1
        tmp = []
        for key, val in sorted(matches.items()):
            if val > len(fprints) * 0.1 and key != document:
                tmp.append((key, val))
        if tmp:
            print(document, ":", len(fprints))
            for key, val in tmp:
                print('   ', key, val)
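
These examples all rely on helpers that are defined elsewhere in each project and are not shown here: normalizeFileLines, tokenize, kgrams and winnowing, plus module-level imports of collections and sys. Purely as orientation, the following is a minimal sketch of what kgrams and winnowing commonly look like in the standard winnowing fingerprinting scheme (Schleimer, Wilkerson, Aiken); the Fingerprint record, the hash choice and the exact signatures are assumptions, not the code these projects actually use.

import collections
import hashlib

# Hypothetical fingerprint record; the real fprint objects only need to
# expose a .hash attribute, as used by scrutinize() above.
Fingerprint = collections.namedtuple("Fingerprint", ["position", "hash"])

def kgrams(tokens, k):
    # Yield every contiguous run of k tokens together with its start index.
    tokens = list(tokens)
    for i in range(len(tokens) - k + 1):
        yield i, "".join(str(t) for t in tokens[i:i + k])

def winnowing(grams, window):
    # Standard winnowing: hash each k-gram, slide a window of `window` hashes
    # over the sequence, keep the minimum hash of each window, and emit every
    # selected (position, hash) pair only once.
    hashes = [(pos, int(hashlib.sha1(g.encode("utf-8")).hexdigest()[:8], 16))
              for pos, g in grams]
    if not hashes:
        return
    last = None
    for start in range(max(len(hashes) - window + 1, 1)):
        pos, h = min(hashes[start:start + window], key=lambda ph: ph[1])
        if (pos, h) != last:
            last = (pos, h)
            yield Fingerprint(position=pos, hash=h)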
Example #2
def scrutinize(filelist, options):
    fingerprints = collections.defaultdict(list)
    documents = collections.defaultdict(list)

    for filename in filelist:
        data = normalizeFileLines(filename)
        #print(filename, file=sys.stderr)
        for fprint in winnowing(
                kgrams(
                    tokenize(options.language, data, options.comments,
                             options.endlines, options.whitespace,
                             options.text), int(options.size)),
                int(options.window)):
            documents[filename].append(fprint)
            fingerprints[fprint.hash].append(filename)
    for document, fprints in documents.items():
        matches = collections.defaultdict(int)
        for fprint in fprints:
            for matching in fingerprints[fprint.hash]:
                matches[matching] += 1
        tmp = []
        for key, val in sorted(matches.items()):
            if val > len(fprints) * 0.1 and key != document:
                tmp.append((key, val))
        if tmp:
            print(document, ":", len(fprints))
            for key, val in tmp:
                print('   ', key, val)
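
normalizeFileLines is likewise only referenced, never shown. A plausible reading, given that main() below passes raw bytes from the input file to tokenize(), is that it reads the file as bytes and normalizes line endings; the projects' real helper may do more, so treat this sketch as an assumption.

def normalizeFileLines(filename):
    # Assumed behaviour: read the file as bytes and normalize line endings so
    # the tokenizer sees consistent input.
    with open(filename, 'rb') as fin:
        return fin.read().replace(b'\r\n', b'\n').replace(b'\r', b'\n')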
Example #3
def main(argv):
    import os
    import sys  # needed for sys.exit() below
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("-l",
                      "--language",
                      dest="language",
                      default="c",
                      help="tokenize using lexer for language",
                      metavar="LANG")
    parser.add_option("-s",
                      "--size",
                      dest="size",
                      default=5,
                      help="size of each kgram",
                      metavar="N")
    parser.add_option("-c",
                      action="store_true",
                      dest="comments",
                      default=False,
                      help="consider comments when tokenizing")
    parser.add_option("-e",
                      action="store_true",
                      dest="endlines",
                      default=False,
                      help="consider endlines when tokenizing")
    parser.add_option("-w",
                      action="store_true",
                      dest="whitespace",
                      default=False,
                      help="consider whitespace when tokenizing")
    parser.add_option("-W",
                      "--window",
                      dest="window",
                      default=5,
                      help="size of the winnowing window",
                      metavar="W")
    parser.add_option("-t",
                      action="store_true",
                      dest="text",
                      default=False,
                      help="consider text when tokenizing")
    (options, args) = parser.parse_args(argv)

    if len(args) != 1:
        #        print("Please specify exactly one input file.", file=sys.stderr)
        sys.exit(os.EX_USAGE)

    with open(args[0], 'rb') as fin:
        data = fin.read()
    for fprint in winnowing(
            kgrams(
                tokenize(options.language, data, options.comments,
                         options.endlines, options.whitespace, options.text),
                int(options.size)), int(options.window)):
        print(fprint)
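
main() hands argv straight to optparse's parse_args(), which treats every element as an argument, so the caller presumably strips the program name first; otherwise the len(args) != 1 check could not pass for a single input file. A plausible entry point, assuming the standard pattern, would be:

import sys

if __name__ == "__main__":
    # Assumed invocation: pass only the actual arguments, not sys.argv[0].
    main(sys.argv[1:])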
Example #4
def examine(filename, options):
    fingerprints = list()

    data = normalizeFileLines(filename)
    for fprint in winnowing(
            kgrams(
                tokenize(options.language, data, options.comments,
                         options.endlines, options.whitespace, options.text),
                int(options.size)), int(options.window)):
        # Add all the fingerprints to a list and return it.
        fingerprints.append(fprint)
    return fingerprints
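
examine() only returns the raw fingerprint list, so callers still compare fingerprints between files themselves. Purely as an illustration (this helper is not part of the project), a pairwise check mirroring the 10% overlap threshold used in scrutinize() could look like this:

def similar(file_a, file_b, options, threshold=0.1):
    # Hypothetical helper: True if more than `threshold` of file_b's
    # fingerprints also occur in file_a, matching the cut-off in scrutinize().
    hashes_a = {fp.hash for fp in examine(file_a, options)}
    prints_b = examine(file_b, options)
    if not prints_b:
        return False
    shared = sum(1 for fp in prints_b if fp.hash in hashes_a)
    return shared > len(prints_b) * threshold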
Example #5
def examine(filename, options):
    fingerprints = list()

    data = normalizeFileLines(filename)
    for fprint in winnowing(kgrams(tokenize(options.language, data,
                                            options.comments,
                                            options.endlines,
                                            options.whitespace,
                                            options.text),
                                   int(options.size)),
                            int(options.window)):
        # Add all the fingerprints to a list and return it.
        fingerprints.append(fprint)
    return fingerprints
Example #6
def main(argv):
    import os
    import sys  # needed for sys.exit() below
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option(
        "-l", "--language", dest="language", default="c", help="tokenize using lexer for language", metavar="LANG"
    )
    parser.add_option("-s", "--size", dest="size", default=5, help="size of each kgram", metavar="N")
    parser.add_option(
        "-c", action="store_true", dest="comments", default=False, help="consider comments when tokenizing"
    )
    parser.add_option(
        "-e", action="store_true", dest="endlines", default=False, help="consider endlines when tokenizing"
    )
    parser.add_option(
        "-w", action="store_true", dest="whitespace", default=False, help="consider whitespace when tokenizing"
    )
    parser.add_option("-W", "--window", dest="window", default=5, help="size of the winnowing window", metavar="W")
    parser.add_option("-t", action="store_true", dest="text", default=False, help="consider text when tokenizing")
    (options, args) = parser.parse_args(argv)

    if len(args) != 1:
        #        print("Please specify exactly one input file.", file=sys.stderr)
        sys.exit(os.EX_USAGE)

    with open(args[0], "rb") as fin:
        data = fin.read()
    for fprint in winnowing(
        kgrams(
            tokenize(options.language, data, options.comments, options.endlines, options.whitespace, options.text),
            int(options.size),
        ),
        int(options.window),
    ):
        print(fprint)