def main():
    """Command-line entry point: minify a Python file and re-encode it as a
    self-executing PythonF_ck script.

    Reads the input file, minifies it with pyminifier, encodes the result via
    ``histogram``/``init_numbers``/``make_text``, appends an ``exec(x)``
    trailer, and writes the stub to the output path.
    """
    parser = argparse.ArgumentParser('Change python code to PythonF_ck')
    parser.add_argument('file', help='File to process')
    # nargs='?' makes the positional optional so the declared default
    # ('output0.py') can actually take effect; argparse silently ignores a
    # default on a *required* positional, which is what the original did.
    parser.add_argument('output', nargs='?', default='output0.py',
                        help='File to save output')

    # Minimal namespace carrying the `tabs` option pyminifier expects.
    class Opt:
        tabs = True

    args = parser.parse_args(namespace=Opt)
    if not os.path.isfile(args.file):
        print(f'Input file not found: {args.file}', file=sys.stderr)
        return
    # NOTE: the original also complained when the *output* file did not
    # exist, which is backwards (it is created below) and did not return,
    # so the message was misleading noise.  That check is removed.

    # Context managers close the handles the original code leaked.
    with open(args.file) as src:
        source = src.read()
    tokens = token_utils.listified_tokenizer(source)
    source = minification.minify(tokens, args)

    hist = histogram(source)
    result, numbers = init_numbers(hist)
    result += make_text(source, numbers)
    result += 'exec(x)'
    with open(args.output, 'w+') as out:
        out.write(result)
    print('Done.')
def py_minify(code):
    """Return *code* (a string of Python source) run through pyminifier.

    see: https://liftoff.github.io/pyminifier/_modules/pyminifier/minification.html#minify
    """
    token_stream = py_tokenizer.listified_tokenizer(code)
    return py_minifier.minify(token_stream, PyminifierOptions())
def minify_script(patches=None, keep_report=True, show_diff=False):
    """minifies createstubs.py

    Args:
        patches ([PathLike], optional): List of paths to patches to apply.
            Defaults to None.
        keep_report (bool, optional): Keeps single report line in createstubs.
            Defaults to True.
        show_diff (bool, optional): Print diff from edits. Defaults to False.

    Returns:
        str: minified source text
    """
    # Logging calls are commented out of the minified script to save space.
    edits = [
        ("comment", "print"),
        ("comment", "import logging"),
        ("comment", "self._log ="),
        ("comment", "self._log.debug"),
        ("comment", "self._log.warning"),
        ("comment", "self._log.info"),
        ("comment", "self._log.error"),
    ]
    if keep_report:
        # Turn the two key log.info calls into prints so a single report
        # line survives the minification.
        report_edit = ('rprint', ('self._log.info("Stub module: {:<20} to file:'
                                  ' {:<55} mem:{:>5}".'
                                  'format(module_name, file_name, m1))'))
        clean_edit = (
            'rprint',
            'self._log.info("Clean/remove files in folder: {}".format(path))')
        # Same effect as two successive insert() calls at positions 0 and 1.
        edits[0:0] = [report_edit, clean_edit]

    min_opts = Values({'tabs': False})

    with SCRIPT.open('r') as stream:
        content = stream.read()

    # Apply any user-supplied patches before editing/minifying.
    for patch_path in (patches or []):
        content = apply_patch(content, Path(patch_path).read_text())

    content = edit_lines(content, edits, show_diff=show_diff)
    return minification.minify(token_utils.listified_tokenizer(content),
                               min_opts)
def minify_code(file_path):
    """Minify the Python source file at *file_path* and return the text.

    Args:
        file_path: path of the source file to read and minify.

    Returns:
        str: the minified source, re-tokenized and untokenized once more to
        normalize the output.
    """
    with open(file_path, 'r') as src:
        original = src.read()
    # First pass: tokenize the raw source and minify it.
    minified = minification.minify(
        token_utils.listified_tokenizer(original), PyminiferOptions())
    # Second pass: round-trip the minified text through the tokenizer to
    # produce the final result.
    return token_utils.untokenize(token_utils.listified_tokenizer(minified))
# NOTE(review): reconstructed from whitespace-flattened source — the exact
# block structure around the if/else and the trailing template string is
# inferred; verify against the original file.

# Optional "--minify" as the second CLI argument toggles pyminifier output.
minify = False
if len(sys.argv) >= 3 and sys.argv[2] == "--minify":
    minify = True

# Derive the output file names from the input path (sys.argv[1]).
output_base = os.path.splitext(sys.argv[1])[0]
output_scad = output_base + ".scad"

with open(sys.argv[1]) as f:
    text = f.read()

if minify:
    # Lazy import: pyminifier is only needed on the --minify path.
    from pyminifier import minification, token_utils
    tokens = token_utils.listified_tokenizer(text)
    # pyminifier expects an options object; a namedtuple with a single
    # `tabs` attribute is the minimal stand-in.
    Options = namedtuple("Options", ["tabs"])
    text = minification.minify(tokens, Options(tabs=False))
    text = text.rstrip()
    with open(output_base + ".min.py", "w") as f:
        f.write(text)
else:
    # We'll be nice and cut off trailing whitespace before the count (Because a lot of editors will add a newline)
    text = text.rstrip()

# Byte count of the (possibly minified) source, measured in UTF-8.
size = len(text.encode("utf-8"))

# Template wrapping the user code with the solid/math/random preamble;
# continues past this chunk (closing quotes not visible here).
code = """
import math
import random
from solid import *
from solid.utils import *
{}
def main(args):
    """Convert a Meta Kaggle scripts CSV into a JSON-lines output file.

    Pipeline: read rows from the CSV, drop scripts that are too short or
    look like R code, strip the competition name out of each script body,
    remove (near-)duplicate submissions per competition, optionally
    minify/obfuscate each remaining script with pyminifier, and write one
    JSON object per line to the output file.

    Args:
        args: parsed command-line namespace providing input_file,
            output_file, field_order_file, min_script_len,
            duplicate_threshold, minimize and obfuscate.
    """
    # Open input and output files
    csv_file = open(args.input_file, 'r')
    json_file = open(args.output_file, 'w')

    # Check if user wants to customize csv field order
    if not args.field_order_file:
        field_names = ["ScriptProjectId", "ScriptVersionId", "AuthorUserId",
                       "UserDisplayName", "CompetitionId", "CompetitionName",
                       "ScriptTitle", "ScriptContent"]
    else:
        with open(args.field_order_file, 'r') as _order_file:
            # BUGFIX: the reader was built from the undefined name
            # `order_file` (NameError at runtime); read from `_order_file`.
            field_order_reader = csv.reader(_order_file)
            for row in field_order_reader:
                field_names = row  # last row wins, as in the original loop

    # Address csv error regarding fields that exceed default size limit
    # Adapted from Stack Overflow post by user1251007
    maxInt = sys.maxsize
    decrement = True
    while decrement:
        # decrease the maxInt value by factor 10
        # as long as the OverflowError occurs.
        decrement = False
        try:
            csv.field_size_limit(maxInt)
        except OverflowError:
            maxInt = int(maxInt / 10)
            decrement = True

    # Read CSV and Write the JSON to output_file after doing some preprocessing
    reader = csv.DictReader(csv_file, field_names)
    files = 0
    parsed_competitions = defaultdict(list)
    logger.info("Processing csv file...")
    for row in reader:
        files += 1
        # Remove very short scripts based on command line arguments
        script_len = len(separate_code_and_comments(row['ScriptContent'],
                                                    row['ScriptTitle'])[0])
        if script_len < args.min_script_len:
            continue
        # Remove meta kaggle scripts labeled as python that are probably R
        if row['ScriptContent'].find("<-") != -1 and row['ScriptContent'].find("library(") != -1:
            continue
        # Remove Kaggle competition name from the script content to allow
        # model testing on competitions.
        if 'CompetitionName' in row and 'ScriptContent' in row:
            # BUGFIX: str.replace returns a new string; the original
            # discarded the result, so the name was never actually removed.
            row['ScriptContent'] = row['ScriptContent'].replace(row['CompetitionName'], " ")
            row['ScriptContent'] = row['ScriptContent'].replace(row['CompetitionName'].lower(), " ")
        parsed_competitions[row['CompetitionId']].append(row)

    submissions_deduped = list()
    logger.info("Removing duplicates...")
    # Iterate over competitions to remove duplicates and near duplicates
    for competition in parsed_competitions:
        counter = 0
        submissions = parsed_competitions[competition]
        # Pair-wise SequenceMatcher comparison of ScriptContent
        for i in range(len(submissions)):
            for j in range(len(submissions)):
                if i != j and SequenceMatcher(
                        None,
                        submissions[i]['ScriptContent'].lower(),
                        submissions[j]['ScriptContent'].lower()).ratio() > args.duplicate_threshold:
                    submissions[i]['ScriptContent'] = ""
                    counter += 1
                    break
        remove_empties = [x for x in submissions if x['ScriptContent'] != ""]
        logger.info("%d duplicates removed from %d submissions in competition %s"
                    % (counter, len(submissions), competition))
        # Ensure competition has at least ten entries for future comparison
        if len(remove_empties) >= 10:
            for item in remove_empties:
                submissions_deduped.append(item)
        else:
            logger.warning("Competition %s has too few remaining submissions at threshold %f"
                           % (competition, args.duplicate_threshold))

    # Build a custom namedtuple to integrate into pyminifer argparse
    # command line methods
    if args.minimize or args.obfuscate:
        options_tuple = namedtuple("options_tuple",
                                   ["tabs", "minimize", "obfuscate", "replacement_length"])
        options = options_tuple(False, args.minimize, args.obfuscate, 1)

    errors = 0
    written = 0
    for row in submissions_deduped:
        # Minimize size of python script if set in args
        if args.minimize or args.obfuscate:
            try:
                tokens = token_utils.listified_tokenizer(row['ScriptContent'])
                source = minification.minify(tokens, options)
                tokens = token_utils.listified_tokenizer(source)
                # Obsfuscate python script
                if args.obfuscate:
                    table = [{}]
                    module = row['ScriptTitle']
                    name_generator = obfuscate.obfuscation_machine(
                        identifier_length=int(options.replacement_length))
                    obfuscate.obfuscate(module, tokens, options,
                                        name_generator=name_generator, table=table)
                # Convert back to text
                row['ScriptContent'] = token_utils.untokenize(tokens)
            except Exception:
                # pyminifier chokes on some scripts; count and skip them.
                errors += 1
                continue
        written += 1
        json.dump(row, json_file)
        json_file.write('\n')

    logger.info("Total files reviewed: %d" % files)
    if args.minimize or args.obfuscate:
        logger.info("File that failed pyminifier minimization/obfuscation parsing: %d" % errors)
    logger.info("Files successfully parsed to json: %d" % written)
    csv_file.close()
    json_file.close()