Example 1
def main():
    parser = argparse.ArgumentParser(description='Change python code to PythonF_ck')
    parser.add_argument('file', help='File to process')
    parser.add_argument('output',
                        nargs='?',
                        help='File to save output',
                        default='output0.py')

    class Opt:
        tabs = True

    args = parser.parse_args(namespace=Opt)
    if not os.path.isfile(args.file):
        print(f'Input file not found: {args.file}', file=sys.stderr)
        return
    if not os.path.isfile(args.output):
        print(f'Output file not found: {args.output}', file=sys.stderr)
    source = open(args.file).read()
    tokens = token_utils.listified_tokenizer(source)
    source = minification.minify(tokens, args)
    hist = histogram(source)
    result, numbers = init_numbers(hist)
    result += make_text(source, numbers)
    result += 'exec(x)'

    open(args.output, 'w+').write(result)
    print('Done.')
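Example 1 passes the Opt class itself as the argparse namespace, so the parsed arguments land on that class and args.tabs is available when minification.minify(tokens, args) later reads it. A small, self-contained illustration of that trick (the names here are made up for the demo):

import argparse

class Opt:
    tabs = True  # extra attribute that minification.minify() will look up later

parser = argparse.ArgumentParser(description='demo')
parser.add_argument('file')
args = parser.parse_args(['input.py'], namespace=Opt)
print(args is Opt, args.tabs)  # True True -- parsed values are set on the class itself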
Example 2
def py_minify(code):
    """
    minifies a string (of python code) passed
    see: https://liftoff.github.io/pyminifier/_modules/pyminifier/minification.html#minify
    """
    tokenized = py_tokenizer.listified_tokenizer(code)

    options = PyminifierOptions()
    minified = py_minifier.minify(tokenized, options)
    return minified
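PyminifierOptions is defined elsewhere in that project; the only attribute minification.minify reads in these examples is tabs (Example 7 below gets by with a one-field namedtuple). A minimal stand-in, assuming that is all your pyminifier version needs and assuming the usual module names, could look like this:

from pyminifier import minification, token_utils

class MinifyOptions:
    # Hypothetical replacement for PyminifierOptions; only 'tabs' is consulted here.
    tabs = False  # dedent using spaces rather than tabs

minified = minification.minify(token_utils.listified_tokenizer("x  =  1\n"), MinifyOptions())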
Example 3
def insert_in_next_line(tokens, index, string):
    """
    Inserts the given string after the next newline inside tokens starting at
    *tokens[index]*.  Indents must be a list of indentation tokens that will
    preceeed the insert (can be an empty list).
    """
    tokenized_string = token_utils.listified_tokenizer(string)
    for i, tok in list(enumerate(tokens[index:])):
        token_type = tok[0]
        if token_type in [tokenize.NL, tokenize.NEWLINE]:
            for count, item in enumerate(tokenized_string):
                tokens.insert(index+count+i+1, item)
            break
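For context, token_utils.listified_tokenizer returns each token as a mutable list of the tokenize fields, which is why the helper above can splice new tokens into the list in place (the helper itself needs import tokenize and from pyminifier import token_utils in scope). A hedged usage sketch; the exact output formatting depends on pyminifier's untokenize:

from pyminifier import token_utils

tokens = token_utils.listified_tokenizer("x = 1\ny = 2\n")
# Splice "z = 0" in right after the first NEWLINE token (i.e. after "x = 1").
insert_in_next_line(tokens, 0, "z = 0\n")
print(token_utils.untokenize(tokens))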
Example 4
def minify_code(file_path):
    """

    Args:
        file_path:

    Returns:

    """
    # Open the file and read its content.
    with open(file_path, 'r') as f:
        source = f.read()

    # Get tokens from file.
    tokens = token_utils.listified_tokenizer(source)
    # Minify the file content based on the tokens
    minified = minification.minify(tokens, PyminiferOptions())
    # Recompute tokens from minified version.
    tokens = token_utils.listified_tokenizer(minified)
    # Untokenize to produce the final minified source.
    result = token_utils.untokenize(tokens)

    return result
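A hypothetical driver for this helper; the output file name is illustrative, not part of the original project:

import sys

if __name__ == '__main__':
    minified = minify_code(sys.argv[1])
    with open(sys.argv[1] + '.min.py', 'w') as out:  # hypothetical output path
        out.write(minified)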
Example 5
def minify_script(patches=None, keep_report=True, show_diff=False):
    """minifies createstubs.py

    Args:
        patches ([PathLike], optional): List of paths to patches to apply.
            Defaults to None.
        keep_report (bool, optional): Keep a single report line in createstubs.
            Defaults to True.
        show_diff (bool, optional): Print diff from edits. Defaults to False.

    Returns:
        str: minified source text
    """
    patches = patches or []
    edits = [
        ("comment", "print"),
        ("comment", "import logging"),
        ("comment", "self._log ="),
        ("comment", "self._log.debug"),
        ("comment", "self._log.warning"),
        ("comment", "self._log.info"),
        ("comment", "self._log.error"),
    ]
    if keep_report:
        report = ('rprint', ('self._log.info("Stub module: {:<20} to file:'
                             ' {:<55} mem:{:>5}".'
                             'format(module_name, file_name, m1))'))
        clean = (
            'rprint',
            'self._log.info("Clean/remove files in folder: {}".format(path))')
        edits.insert(0, report)
        edits.insert(1, clean)

    minopts = Values({'tabs': False})
    with SCRIPT.open('r') as f:
        content = f.read()
        for path in patches:
            path = Path(path)
            content = apply_patch(content, path.read_text())
        content = edit_lines(content, edits, show_diff=show_diff)
        tokens = token_utils.listified_tokenizer(content)
        source = minification.minify(tokens, minopts)
    return source
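Values here is presumably optparse.Values, which simply copies the dict entries onto itself as attributes, so minopts.tabs is False by the time minification.minify reads it; SCRIPT, apply_patch, and edit_lines come from the surrounding project. A quick check of the options object on its own:

from optparse import Values

minopts = Values({'tabs': False})
print(minopts.tabs)  # False -> minify() dedents with spaces instead of tabs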
Example 6
def apply_obfuscation(source, module):
    """
    Returns 'source' all obfuscated.
    """
    global keyword_args
    global imported_modules
    tokens = token_utils.listified_tokenizer(source)
    keyword_args = analyze.enumerate_keyword_args(tokens)
    imported_modules = analyze.enumerate_imports(tokens)
    variables = find_obfuscatables(tokens, obfuscatable_variable, ignore_length=True)
    classes = find_obfuscatables(tokens, obfuscatable_class)
    functions = find_obfuscatables(tokens, obfuscatable_function)
    for variable in variables:
        replace_obfuscatables(
            module, tokens, obfuscate_variable, variable, name_generator)
    for function in functions:
        replace_obfuscatables(
            module, tokens, obfuscate_function, function, name_generator)
    for _class in classes:
        replace_obfuscatables(module, tokens, obfuscate_class, _class, name_generator)
    return token_utils.untokenize(tokens)
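Several names used above (find_obfuscatables, replace_obfuscatables, the obfuscatable_*/obfuscate_* callables, and name_generator) are module-level helpers from pyminifier.obfuscate; in particular, name_generator must exist as a global in the same module before this function runs. A hedged setup sketch, assuming the function lives alongside those helpers and that the input path is purely illustrative (compare Example 8 below):

from pyminifier import obfuscate

# Generator yielding short replacement identifiers (length 1), as in Example 8.
name_generator = obfuscate.obfuscation_machine(identifier_length=1)

with open("target.py") as f:  # hypothetical input file
    obfuscated = apply_obfuscation(f.read(), module="target")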
Example 7
import os
import sys
from collections import namedtuple

from solid.utils import *

minify = False
if len(sys.argv) >= 3 and sys.argv[2] == "--minify":
    minify = True

output_base = os.path.splitext(sys.argv[1])[0]
output_scad = output_base + ".scad"

with open(sys.argv[1]) as f:
    text = f.read()

if minify:
    from pyminifier import minification, token_utils

    tokens = token_utils.listified_tokenizer(text)
    Options = namedtuple("Options", ["tabs"])
    text = minification.minify(tokens, Options(tabs=False))
    text = text.rstrip()
    with open(output_base + ".min.py", "w") as f:
        f.write(text)
else:
    # We'll be nice and cut off trailing whitespace before the count (because a lot of editors add a trailing newline)
    text = text.rstrip()

size = len(text.encode("utf-8"))

code = """
import math
import random
from solid import *
Example 8
def main(args):

    # Open input and output files
    csv_file = open(args.input_file, 'r')
    json_file = open(args.output_file, 'w')

    # Check if user wants to customize csv field order
    if not args.field_order_file:
        field_names = ["ScriptProjectId","ScriptVersionId","AuthorUserId","UserDisplayName","CompetitionId","CompetitionName","ScriptTitle","ScriptContent"]
    else:
        with open(args.field_order_file, 'r') as _order_file:
            field_order_reader = csv.reader(_order_file)
            for row in field_order_reader:
                field_names = row
                continue

    # Address csv error regarding fields that exceed default size limit
    # Adapted from Stack Overflow post by user1251007
    maxInt = sys.maxsize
    decrement = True

    while decrement:
        # decrease the maxInt value by factor 10
        # as long as the OverflowError occurs.

        decrement = False
        try:
            csv.field_size_limit(maxInt)
        except OverflowError:
            maxInt = int(maxInt/10)
            decrement = True

    # Read CSV and Write the JSON to output_file after doing some preprocessing
    reader = csv.DictReader(csv_file, field_names)

    files = 0
    parsed_competitions = defaultdict(list)
    logger.info("Processing csv file...")

    for row in reader:
        files+=1
        # Remove very short scripts based on command line arguments
        script_len = len(separate_code_and_comments(row['ScriptContent'],row['ScriptTitle'])[0])
        if script_len<args.min_script_len:
            continue
        # Remove meta kaggle scripts labeled as python that are probably R
        if row['ScriptContent'].find("<-")!=-1 and row['ScriptContent'].find("library(")!=-1:
            continue
        # Remove Kaggle competition name from the script content to allow model testing on competitions
        if 'CompetitionName' in row and 'ScriptContent' in row:
            row['ScriptContent'] = row['ScriptContent'].replace(row['CompetitionName'], " ")
            row['ScriptContent'] = row['ScriptContent'].replace(row['CompetitionName'].lower(), " ")

        parsed_competitions[row['CompetitionId']].append(row)

    submissions_deduped = list()
    logger.info("Removing duplicates...")

    # Iterate over competitions to remove duplicates and near duplicates
    for competition in parsed_competitions:
        counter = 0
        submissions = parsed_competitions[competition]

        # Pair-wise SequenceMatcher comparison of ScriptContent
        for i in range(len(submissions)):
            for j in range(len(submissions)):
                if i!=j and SequenceMatcher(None, submissions[i]['ScriptContent'].lower(), \
                        submissions[j]['ScriptContent'].lower()).ratio() > args.duplicate_threshold:
                    submissions[i]['ScriptContent'] = ""
                    counter+=1
                    break
        remove_empties = [x for x in submissions if x['ScriptContent']!=""]
        logger.info("%d duplicates removed from %d submissions in competition %s" % (counter,len(submissions),competition))
        
        # Ensure competition has at least ten entries for future comparison
        if len(remove_empties)>=10:
            for item in remove_empties:
                submissions_deduped.append(item)
        else:
            logger.warning("Competition %s has too few remaining submissions at threshold %f" % (competition,args.duplicate_threshold))

    # Build a custom namedtuple that mimics the options object pyminifier's argparse-driven functions expect
    if args.minimize or args.obfuscate:
        options_tuple = namedtuple("options_tuple", ["tabs", "minimize", "obfuscate", "replacement_length"])
        options = options_tuple(False, args.minimize, args.obfuscate, 1)


    errors = 0
    written = 0

    for row in submissions_deduped:

        # Minimize size of python script if set in args
        if args.minimize or args.obfuscate:
            try:
                tokens = token_utils.listified_tokenizer(row['ScriptContent'])
                source = minification.minify(tokens,options)
                tokens = token_utils.listified_tokenizer(source)

                # Obfuscate the Python script
                if args.obfuscate:
                    table = [{}]
                    module = row['ScriptTitle']
                    name_generator = obfuscate.obfuscation_machine(identifier_length=int(options.replacement_length))
                    obfuscate.obfuscate(module, tokens, options, name_generator=name_generator, table=table)

                # Convert back to text
                result = ''
                result += token_utils.untokenize(tokens)
                row['ScriptContent'] = result

            except Exception as e:
                # logger.info("%s in %s; continuing" % (e.__class__.__name__,row['ScriptTitle']))
                errors+=1
                continue

        written+=1
        json.dump(row, json_file)
        json_file.write('\n')

    logger.info("Total files reviewed: %d" % files)
    if args.minimize or args.obfuscate:
        logger.info("File that failed pyminifier minimization/obfuscation parsing: %d" % errors)
    logger.info("Files successfully parsed to json: %d" % written)
    csv_file.close()
    json_file.close()
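The args namespace consumed by main() needs at least input_file, output_file, field_order_file, min_script_len, duplicate_threshold, minimize, and obfuscate. A hypothetical argparse setup that matches those attribute names; the defaults are illustrative, not the original project's:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="Convert Meta Kaggle scripts CSV to JSON lines")
    parser.add_argument("input_file")
    parser.add_argument("output_file")
    parser.add_argument("--field_order_file", default=None)
    parser.add_argument("--min_script_len", type=int, default=200)         # illustrative default
    parser.add_argument("--duplicate_threshold", type=float, default=0.9)  # illustrative default
    parser.add_argument("--minimize", action="store_true")
    parser.add_argument("--obfuscate", action="store_true")
    return parser.parse_args()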