import os
import sys
from filecmp import cmp as filecmp
from os.path import isdir, isfile, islink, join as joinpath, normpath
from shutil import copy

def syncdir(srcdir, destdir):
    # Mirror srcdir into an existing destdir, copying only files that are
    # missing or whose contents differ.
    srcdir = normpath(srcdir)
    destdir = normpath(destdir)
    if not isdir(destdir):
        sys.exit('destination directory "%s" does not exist' % destdir)

    for root, dirs, files in os.walk(srcdir):
        root = normpath(root)
        # Strip the srcdir prefix so root is relative to the source tree.
        prefix = os.path.commonprefix([root, srcdir])
        root = root[len(prefix):]
        if root.startswith('/'):
            root = root[1:]

        # Prune hidden directories and SCCS metadata from the walk.
        for rem in [d for d in dirs if d.startswith('.') or d == 'SCCS']:
            dirs.remove(rem)

        for entry in dirs:
            newdir = joinpath(destdir, root, entry)
            if not isdir(newdir):
                os.mkdir(newdir)
                print 'mkdir', newdir

        # Append '/.' to symlinked directories so os.walk descends into them.
        for i, d in enumerate(dirs):
            if islink(joinpath(srcdir, root, d)):
                dirs[i] = joinpath(d, '.')

        for entry in files:
            dest = normpath(joinpath(destdir, root, entry))
            src = normpath(joinpath(srcdir, root, entry))
            if not isfile(dest) or not filecmp(src, dest):
                print 'copy %s %s' % (src, dest)
                copy(src, dest)
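# --- Usage sketch (not in the original source; paths are hypothetical) ---
# syncdir() exits unless the destination directory already exists, so a
# caller typically creates it first:
#
#     if not isdir('/tmp/tree-mirror'):
#         os.mkdir('/tmp/tree-mirror')
#     syncdir('some-src-tree', '/tmp/tree-mirror')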
from filecmp import cmp as filecmp
from os.path import abspath

def _binarycmp(filelist, onerror):
    # Compare exactly two files byte-by-byte (shallow=False forces a full
    # content comparison rather than a stat-based one).
    file0, file1 = filelist
    try:
        if filecmp(file0.path, file1.path, shallow=False):
            dupdict = {True: filelist}
        else:
            dupdict = {}
        errlist = []
    except (IOError, OSError) as exc:
        # Report the unreadable file and return both entries as errors.
        if onerror is not None:
            onerror(exc, abspath(exc.filename))
        dupdict = {}
        errlist = filelist
    return dupdict, errlist
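# --- Usage sketch (not in the original source) ---
# The entries are assumed to expose a .path attribute, as the original
# callers appear to; the filenames and the report() callback are hypothetical:
#
#     from collections import namedtuple
#     Entry = namedtuple('Entry', 'path')
#
#     def report(exc, path):
#         print 'cannot read %s: %s' % (path, exc)
#
#     dupdict, errlist = _binarycmp([Entry('a.bin'), Entry('b.bin')], report)
#     if True in dupdict:
#         print 'files are byte-identical'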
import os
from os.path import exists, isdir, isfile, join as joinpath, normpath

def copyfiles(srcdir, dstdir):
    # Copy the srcdir tree into dstdir, creating dstdir and any
    # subdirectories as needed and skipping files that already match.
    from filecmp import cmp as filecmp
    from shutil import copyfile

    srcdir = normpath(srcdir)
    dstdir = normpath(dstdir)

    if not isdir(dstdir):
        os.mkdir(dstdir)

    for root, dirs, files in os.walk(srcdir):
        root = normpath(root)
        prefix = os.path.commonprefix([root, srcdir])
        root = root[len(prefix):]
        if root.startswith('/'):
            root = root[1:]

        for entry in dirs:
            newdir = joinpath(dstdir, root, entry)
            if not isdir(newdir):
                os.mkdir(newdir)

        for entry in files:
            dest = normpath(joinpath(dstdir, root, entry))
            src = normpath(joinpath(srcdir, root, entry))
            if not isfile(dest) or not filecmp(src, dest):
                copyfile(src, dest)

    # some of the spec benchmarks expect to be run from one directory up.
    # just create some symlinks that solve the problem
    inlink = joinpath(dstdir, 'input')
    outlink = joinpath(dstdir, 'output')
    if not exists(inlink):
        os.symlink('.', inlink)
    if not exists(outlink):
        os.symlink('.', outlink)
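# --- Usage sketch (not in the original source; paths are hypothetical) ---
# Stage a benchmark's data directory into a fresh run directory; the
# 'input'/'output' symlinks let SPEC benchmarks that expect to run one
# directory up find their files:
#
#     copyfiles('spec/gcc/data', 'run/gcc')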
def main():
    opt_h = OptionHelper()

    if opt_h.cascade():
        input_type = opt_h.input_type()
        if opt_h.output() is None:
            opt_h.set_output('out')
        opt_h.check_output()

        cascade_dir = '%s/cascade00' % opt_h.output()
        makedirs(cascade_dir)
        results_fh = open('%s/results' % opt_h.output(), 'w')
        cascade_train_out = '%s/train-out' % cascade_dir
        cascade_test_out = '%s/test-out' % cascade_dir

        basic_cmd = opt_h.basic_cmd()
        output_file_type = ' -outputType UNDERSCORE4CCL'

        log('running initial chunking')
        run_cmd(basic_cmd \
                + opt_h.starter_train() \
                + opt_h.starter_train_out() \
                + output_file_type \
                + ' -output ' + cascade_train_out, \
                verbose=opt_h.verbose())
        run_cmd(basic_cmd \
                + opt_h.starter_train() \
                + opt_h.starter_test() \
                + opt_h.filter_flag() \
                + output_file_type \
                + ' -output ' + cascade_test_out, \
                verbose=opt_h.verbose())

        cascade_iter = 1
        new_cascade_train_out_fname = get_output_fname(cascade_train_out)
        cascade_expand_last = None

        while True:
            # convert test output to trees
            cascade_test_out_fname = get_output_fname(cascade_test_out)
            cascade_expand = []
            log('building corpus record from ' + cascade_test_out)
            for s_ind, sentence in enumerate(open(cascade_test_out_fname)):
                i = 0
                sentence_str = []
                for chunk in sentence.split():
                    chunk = chunk.split('_')
                    chunk_str = []
                    for word in chunk:
                        if word.startswith('=') and len(word) > 1:
                            chunk_str.append(cascade_expand_last[s_ind][i])
                        else:
                            chunk_str.append(word)
                        i += 1
                    if len(chunk) == 1:
                        sentence_str.append(chunk_str[0])
                    else:
                        sentence_str.append('(' + (' '.join(chunk_str)) + ')')
                cascade_expand.append(sentence_str)

            cascade_test_eval_fname = cascade_dir + '/test-eval'
            eval_fh = open(cascade_test_eval_fname, 'w')
            for sent in cascade_expand:
                print >>eval_fh, '(' + (' '.join(sent)).replace(' ;', '') + ')'
            eval_fh.close()

            # evaluate test output as trees
            run_cmd(opt_h.eval_cmd() \
                    + opt_h.starter_test() \
                    + ' -cclpOutput ' + cascade_test_eval_fname \
                    + opt_h.filter_flag(), fh=results_fh, \
                    verbose=opt_h.verbose())

            cascade_expand_last = cascade_expand

            log('running cascade level ' + str(cascade_iter))

            # build term frequency map from last train output
            cascade_train_out_fname = new_cascade_train_out_fname
            phrasal_terms = PhrasalTerms(cascade_train_out_fname)

            # create next-run train
            next_run_train_fname = cascade_dir + '/next-train'
            phrasal_terms.write_new_dataset(cascade_train_out_fname, \
                                            next_run_train_fname)

            # run chunker, output re-chunked train
            new_cascade_dir = '%s/cascade%02d' % (opt_h.output(), cascade_iter)
            makedirs(new_cascade_dir)
            cascade_train_out = '%s/train-out' % new_cascade_dir
            run_cmd(basic_cmd \
                    + ' -train ' + next_run_train_fname \
                    + ' -trainFileType SPL ' \
                    + ' -test ' + next_run_train_fname \
                    + ' -testFileType SPL ' \
                    + output_file_type \
                    + ' -output ' + cascade_train_out, verbose=opt_h.verbose())

            # if re-chunked train is the same as orig, break
            new_cascade_train_out_fname = get_output_fname(cascade_train_out)
            if filecmp(cascade_train_out_fname, new_cascade_train_out_fname):
                break

            # create next-run test
            cascade_test_out = '%s/test-out' % new_cascade_dir
            next_run_test_fname = cascade_dir + '/next-test'
            phrasal_terms.write_new_dataset(cascade_test_out_fname, \
                                            next_run_test_fname)

            # run the chunker, output re-chunked test
            run_cmd(basic_cmd \
                    + ' -train ' + next_run_train_fname \
                    + ' -trainFileType SPL ' \
                    + ' -test ' + next_run_test_fname \
                    + ' -testFileType SPL ' \
                    + output_file_type \
                    + ' -output ' + cascade_test_out, verbose=opt_h.verbose())

            cascade_dir = new_cascade_dir
            cascade_iter += 1

        results_fh.close()

    else:
        cmd = opt_h.basic_cmd()

        output_flag = ''
        if opt_h.stdout():
            output_flag = ' -output -'
        elif opt_h.output() is not None:
            opt_h.check_output()
            output_flag = ' -output ' + opt_h.output()

        cmd += ' -outputType ' + opt_h.output_type()
        cmd += output_flag
        cmd += opt_h.starter_train()
        cmd += opt_h.starter_test()
        cmd += opt_h.filter_flag()
        cmd += ' -E PRCL -e CLUMP,NPS,TREEBANKPREC'
        run_cmd(cmd, verbose=opt_h.verbose())
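# --- Illustration (not in the original source) ---
# A standalone sketch of the placeholder-expansion loop inside main(): in
# UNDERSCORE4CCL output, a token beginning with '=' stands for a chunk that
# was collapsed at the previous cascade level, and the running index i
# advances over every token so positions line up with that level's chunks.
# The data below is made up.

def _expand_demo():
    prev = ['(the cat)', 'sat', '(on the mat)']   # hypothetical prior level
    sentence = '=A_sat =B'                        # hypothetical chunker output
    i = 0
    out = []
    for chunk in sentence.split():
        parts = []
        for word in chunk.split('_'):
            if word.startswith('=') and len(word) > 1:
                parts.append(prev[i])
            else:
                parts.append(word)
            i += 1
        if len(parts) == 1:
            out.append(parts[0])
        else:
            out.append('(' + ' '.join(parts) + ')')
    print ' '.join(out)   # -> ((the cat) sat) (on the mat)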