Example #1
import logging
import traceback

# Project-level helpers (manifest, ast_suppliers, Parameters, Report,
# CloneSummary, Snippet, clone_detection_algorithm, save_report) are assumed
# to be imported from the surrounding package, which this snippet omits.
def scan(language, file_manifest, source_file_names):

  # Determine the files to scan. If no files are given, use a default manifest.
  if len(source_file_names) == 0 and file_manifest is None:
    file_manifest = manifest.default_manifest(language)

  source_file_names = set(source_file_names)
  if file_manifest is not None:
    source_file_names.update(set(manifest.contents(file_manifest)))

  supplier = ast_suppliers.abstract_syntax_tree_suppliers[language]

  # TODO: Configuration files!
  parameters = Parameters()
  parameters.distance_threshold = supplier.distance_threshold
  parameters.size_threshold = supplier.size_threshold

  source_files = []

  report = Report(parameters)

  def parse_file(file_name):
    try:
      logging.info('Parsing %s...', file_name)
      source_file = supplier(file_name, parameters)
      source_file.getTree().propagateCoveredLineNumbers()
      source_file.getTree().propagateHeight()
      source_files.append(source_file)
      report.addFileName(file_name)
      logging.info('done')
    except Exception:
      logging.warning('Can\'t parse "%s"\n: %s',
                      file_name, traceback.format_exc())

  for file_name in source_file_names:
    parse_file(file_name)

  duplicates = clone_detection_algorithm.findDuplicateCode(source_files, report)
  n = 1
  for duplicate in duplicates:
    distance = duplicate.calcDistance()
    summary = CloneSummary(
      "Clone #" + str(n),
      # TODO: This is a mess! Most of this info should be assembled on the fly and in member functions.
      [Snippet(
         duplicate[i].getSourceFile()._file_name,
         duplicate[i].getCoveredLineNumbers(),
         '\n'.join(duplicate[i].getSourceLines()))
       for i in [0, 1]],
      distance)
    report.addClone(summary)
    n += 1
  report.sortByCloneSize()

  save_report(".orphanblack", report)
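
For reference, a minimal driver for this scan() might look like the sketch below; the file names are placeholders, and it assumes scan() is importable from the module shown above.

# Hypothetical usage: scan two Python files directly...
scan('python', None, ['a.py', 'b.py'])

# ...or scan whatever an explicit manifest file lists:
scan('python', 'manifest.txt', [])
# Either way, the report is written to ".orphanblack" via save_report().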
Example #2
import os
import sys
import traceback
from optparse import OptionParser

# Project-level modules (arguments, ast_suppliers, html_report,
# clone_detection_algorithm) are assumed to be imported from the
# clonedigger package, which this snippet omits.
def main():
    cmdline = OptionParser(usage="""To run Clone Digger type:
python clonedigger.py [OPTION]... [SOURCE FILE OR DIRECTORY]...

The typical usage is:
python clonedigger.py source_file_1 source_file_2 ...
  or
python clonedigger.py path_to_source_tree
Don't forget to remove automatically generated sources, tests and third party libraries from the source tree.

Notice:
The semantics of the threshold options are discussed in the paper "Duplicate code detection using anti-unification", which can be downloaded from http://clonedigger.sourceforge.net. All arguments are optional. Supported options are:
""")
    cmdline.add_option('-l',
                       '--language',
                       dest='language',
                       type='choice',
                       choices=['python', 'java', 'lua', 'javascript', 'js'],
                       help='the programming language')
    cmdline.add_option('--no-recursion',
                       dest='no_recursion',
                       action='store_true',
                       help='do not traverse directories recursively')
    cmdline.add_option(
        '-o',
        '--output',
        dest='output',
        help='the name of the output file ("output.html" by default)')
    cmdline.add_option('--clustering-threshold',
                       type='int',
                       dest='clustering_threshold',
                       help='read the paper for semantics')
    cmdline.add_option(
        '--distance-threshold',
        type='int',
        dest='distance_threshold',
        help=
        'the maximum number of differences between a pair of sequences in a clone pair (5 by default). Larger values lead to more false positives'
    )
    cmdline.add_option(
        '--hashing-depth',
        type='int',
        dest='hashing_depth',
        help=
        'the default value is 1; read the paper for semantics. Computation can be sped up by increasing this value (but some clones can be missed)'
    )
    cmdline.add_option(
        '--size-threshold',
        type='int',
        dest='size_threshold',
        help=
        'the minimum clone size; the clone size, in turn, equals the number of lines of code in its largest fragment'
    )
    cmdline.add_option(
        '--clusterize-using-dcup',
        action='store_true',
        dest='clusterize_using_dcup',
        help=
        'mark each statement with its D-cup value instead of the most similar pattern. This option together with --hashing-depth=0 makes it possible to catch all considered clones (but it is slow and applicable only to small programs)'
    )
    cmdline.add_option('--dont-print-time',
                       action='store_false',
                       dest='print_time',
                       help='do not print time')
    cmdline.add_option('-f',
                       '--force',
                       action='store_true',
                       dest='force',
                       help='')
    cmdline.add_option(
        '--force-diff',
        action='store_true',
        dest='use_diff',
        help='force highlighting of differences based on the diff algorithm')
    cmdline.add_option(
        '--fast',
        action='store_true',
        dest='clusterize_using_hash',
        help=
        'find only clones that differ in variable and function names and constants'
    )
    cmdline.add_option('--ignore-dir',
                       action='append',
                       dest='ignore_dirs',
                       help='exclude directories from parsing')
    cmdline.add_option('--eclipse-output',
                       dest='eclipse_output',
                       help='for internal usage only')
    cmdline.add_option(
        '--cpd-output',
        action='store_true',
        dest='cpd_output',
        help="output in PMD's CPD XML format. If no output file is defined, output.xml is generated")
    cmdline.add_option('--report-unifiers',
                       action='store_true',
                       dest='report_unifiers',
                       help='')
    cmdline.add_option(
        '--func-prefixes',
        action='store',
        dest='f_prefixes',
        help=
        'skip functions/methods with these prefixes (provide a CSV string as argument)'
    )
    cmdline.add_option(
        '--file-list',
        dest='file_list',
        help=
        'a file that contains a list of file names that must be processed by Clone Digger'
    )

    cmdline.set_defaults(language='python',
                         ignore_dirs=[],
                         f_prefixes=None,
                         **arguments.__dict__)

    (options, source_file_names) = cmdline.parse_args()
    if options.f_prefixes is not None:
        func_prefixes = tuple(
            [x.strip() for x in options.f_prefixes.split(',')])
    else:
        func_prefixes = ()
    source_files = []

    supplier = ast_suppliers.abstract_syntax_tree_suppliers[options.language]
    if options.language != 'python':
        options.use_diff = True

    if options.cpd_output:
        if options.output is None:
            options.output = 'output.xml'
        report = html_report.CPDXMLReport()
    else:
        report = html_report.HTMLReport()

    if options.output is None:
        options.output = 'output.html'

    output_file_name = options.output

    for option in cmdline.option_list:
        if option.dest == 'file_list' and options.file_list is not None:
            with open(options.file_list) as list_file:
                source_file_names.extend(list_file.read().split())
            continue
        elif option.dest is None:
            continue
        setattr(arguments, option.dest, getattr(options, option.dest))

    if options.distance_threshold is None:
        arguments.distance_threshold = supplier.distance_threshold
    if options.size_threshold is None:
        arguments.size_threshold = supplier.size_threshold

    report.startTimer('Construction of AST')

    def parse_file(file_name, func_prefixes):
        try:
            print 'Parsing ', file_name, '...',
            sys.stdout.flush()
            if options.language == 'python':
                source_file = supplier(file_name, func_prefixes)
            else:
                # TODO implement func_prefixes for java also
                source_file = supplier(file_name)
            source_file.getTree().propagateCoveredLineNumbers()
            source_file.getTree().propagateHeight()
            source_files.append(source_file)
            report.addFileName(file_name)
            print 'done'
        except Exception:
            s = 'Error: can\'t parse "%s"\n: ' % (
                file_name, ) + traceback.format_exc()
            report.addErrorInformation(s)
            print s

    def walk(dirname):
        for dirpath, dirs, files in os.walk(dirname):
            # Prune ignored directories in place so os.walk skips them
            if options.ignore_dirs:
                dirs[:] = [d for d in dirs if d not in options.ignore_dirs]
            # Skip all non-parseable files
            files[:] = [
                f for f in files
                if os.path.splitext(f)[1][1:] == supplier.extension
            ]
            yield (dirpath, dirs, files)

    for file_name in source_file_names:
        if os.path.isdir(file_name):
            if arguments.no_recursion:
                dirpath = file_name
                files = [
                    os.path.join(file_name, f) for f in os.listdir(file_name)
                    if os.path.splitext(f)[1][1:] == supplier.extension
                ]
                for f in files:
                    parse_file(f, func_prefixes)
            else:
                for dirpath, dirnames, filenames in walk(file_name):
                    for f in filenames:
                        parse_file(os.path.join(dirpath, f), func_prefixes)
        else:
            parse_file(file_name, func_prefixes)

    report.stopTimer()
    duplicates = clone_detection_algorithm.findDuplicateCode(
        source_files, report)
    for duplicate in duplicates:
        report.addClone(duplicate)
    report.sortByCloneSize()
    try:
        report.writeReport(output_file_name)
    except:
        print "catched error, removing output file"
        if os.path.exists(output_file_name):
            os.remove(output_file_name)
        raise
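
Since main() takes no parameters and reads everything from the command line, one way to exercise it is to fake sys.argv before calling it; the paths and option values below are placeholders, not part of the original example.

# Hypothetical invocation: run Clone Digger over a source tree and
# write an HTML report. sys.argv is faked so main() can be called directly.
import sys

sys.argv = ['clonedigger.py', '--language', 'python',
            '--ignore-dir', 'tests', '-o', 'report.html', 'path/to/project']
main()  # with --cpd-output, the default output file becomes output.xml instead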