Example #1
def logic_type(cell, max_api_seq_len, min_api_seq_len, key):
    if is_boilerplate(cell):
        # boilerplate may not tokenize, so label it without trying
        return 'boilerplate'

    try:
        toks, types = tokenize_and_templatize(cell[key])
    except Exception:
        return 'untokenizeable'

    api_seq = gen_api_seq(toks, types)
    if len(api_seq) > max_api_seq_len:
        return f'api seq longer than {max_api_seq_len}'

    if len(api_seq) < min_api_seq_len:
        return f'api seq shorter than {min_api_seq_len}'

    if 'class' in toks:
        return 'class'
    elif toks.count('def') > 1:
        return 'more than 1 function'
    elif toks.count('def') == 1:
        return '1 function'
    else:
        return 'pure logic'
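
A hypothetical usage sketch (the cell dict shape, the key name, and the length bounds are assumptions; logic_type also depends on the is_boilerplate, tokenize_and_templatize, and gen_api_seq helpers from this codebase):

# Hypothetical cell; real cells come from parsed notebook JSON.
cell = {'source': "def add(a, b):\n    return a + b\n"}
label = logic_type(cell, max_api_seq_len=50, min_api_seq_len=0, key='source')
# Expected: '1 function', since the cell defines exactly one function.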
Example #2
def convert2to3(cell):
    original_code = cell['source']
    try:
        # Already valid python3 and tokenizable; return the cell unchanged.
        ast.parse(original_code)
        tokenize_and_templatize(original_code)
        return cell
    except Exception:
        pass

    try:
        # Rather than loading every fixer via
        # get_fixers_from_package('lib2to3.fixes'), we load only fix_print:
        # print statements are the only python2 issue we see, and a single
        # fixer makes RefactoringTool much faster.
        fixers = ['lib2to3.fixes.fix_print']
        refactor = RefactoringTool(fixer_names=fixers)
        tree = refactor.refactor_string(original_code, 'temp')
        converted_code = str(tree)
        # Verify the converted code now parses before keeping it.
        ast.parse(converted_code)
        cell['source'] = converted_code
    except Exception as e:
        print(e)

    return cell
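
A minimal, self-contained sketch of the underlying lib2to3 step (note: lib2to3 is deprecated and was removed in Python 3.13, and refactor_string requires the source to end in a newline):

from lib2to3.refactor import RefactoringTool

tool = RefactoringTool(fixer_names=['lib2to3.fixes.fix_print'])
tree = tool.refactor_string('print "hello"\n', '<example>')
print(str(tree))  # -> print("hello")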
Example #3
def get_imports(nb_cells, cell_index):
    # Gather all import lines up to and including the current cell.
    imports = []
    for c in nb_cells[:cell_index + 1]:
        if is_code(c):
            for line in c['source'].splitlines():
                stripped = line.strip()
                # Some edge cases use 'from' without an 'import' token on the
                # same line, so we require both for from-imports.
                if (stripped.startswith('import ')
                        or (stripped.startswith('from ') and 'import' in line)):
                    # Keep only the text from 'import' onward to save on the
                    # total token count.
                    imports.append(line[line.index('import'):])
    imports_string = '\n'.join(imports)
    import_toks = []
    try:
        toks, types = tokenize_and_templatize(imports_string)
        for tok, tok_type in zip(toks, types):
            if tok_type in ['NAME', 'import', 'as', ',']:
                import_toks.append(tok)
    except Exception:
        # Tokenization fails on multiline imports that use parentheses; fall
        # back to a crude regex split.
        import_toks = re.split(r'(\W+)', imports_string)
        # The split tokens look like ['import', ' ', 'pyspark', '\n', ...],
        # so drop whitespace-only entries.
        import_toks = [t for t in import_toks if t.strip()]
    return import_toks
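
A self-contained sketch of the regex fallback on a made-up import string:

import re

imports_string = 'import pyspark\nimport numpy as np'
toks = [t for t in re.split(r'(\W+)', imports_string) if t.strip()]
print(toks)  # ['import', 'pyspark', 'import', 'numpy', 'as', 'np']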
Example #4
def does_parse(original_code, url=None):
    assert isinstance(original_code, str)
    try:
        ast.parse(original_code)
        tokenize_and_templatize(original_code)
        return True
    except Exception:
        return False
Example #5
def get_solution_new(group):
    '''Deduplicate solutions and pick the most common one.'''
    cells = [row.target_cell for _, row in group.iterrows()]

    solution2count = Counter()
    for c in cells:
        solution2count[c['code']] += 1
    submission2count = Counter()
    for c in cells:
        submission2count[c['metadata']['boilerplate']] += 1

    most_comm_sol, sol_count = solution2count.most_common(1)[0]
    most_comm_sub, sub_count = submission2count.most_common(1)[0]
    if sol_count > 1 and sub_count > 1:
        top_solution_cells = [c for c in cells if c['code'] == most_comm_sol]

        # Verify the top solution comes from at least two different repositories.
        repos = set([c['metadata']['repo'] for c in top_solution_cells])
        if len(repos) < 2:
            return 'only from 1 repo'

        # we arbitrarily pick the first one
        target_cell = top_solution_cells[0]
        if not target_cell['code_tokens']:
            return 'empty target cell tokens'

        # We want targets with (near-)empty boilerplate; long boilerplates
        # are rejected below.
        target_cell['boilerplate_code'] = most_comm_sub
        if most_comm_sub.strip():
            try:
                toks, types = tokenize_and_templatize(most_comm_sub)
            except Exception:
                return 'boiler tokenization failed'
            replaced = replace_newlines_indents(toks,
                                                types,
                                                enable_assert=False,
                                                comments=True)
            replaced = [t for t in replaced if t not in ['NEWLINE', 'INDENT']]
            if len(replaced) > 10:
                # 10 works well as a threshold: some spurious submissions are
                # actually answers and hence would be spurious boilerplate.
                # The main motivation is to allow comments (which the
                # replacement above strips); unfortunately, empty-function
                # boilerplates are also removed.
                return 'boilerplate longer than 10 toks'

        target_cell['boilerplate_code_tokens'] = []

        return target_cell

    else:
        return 'only 1 solution'
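
A hypothetical driver, assuming a pandas DataFrame df whose rows carry a target_cell column and are grouped by some problem key (df, the 'problem_id' column, and deduped_cells are all assumptions):

deduped_cells = []
for _, group in df.groupby('problem_id'):
    result = get_solution_new(group)
    if not isinstance(result, str):  # strings are rejection reasons
        deduped_cells.append(result)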
Example #6
def handle_solution(all_cells):
    '''Extract the code within the begin-solution tag. Requirements: the
    solution is non-empty and multiple cells share the same solution.'''

    # Filter out cells with an empty solution; those can become boilerplate.
    implemented_solutions = []
    for c in all_cells:
        if c['code_tokens']:
            if get_in_between_solution_comments(c['code_tokens']):
                implemented_solutions.append(c)
    del all_cells

    # dictionary of solution to cell indices
    sol2idx = defaultdict(list)
    for i, c in enumerate(implemented_solutions):
        sol2idx[c['code']].append(i)

    # Pick the most common solution first. Note the loop returns on its first
    # iteration: either the most common solution qualifies, or we bail out.
    for indices in sorted(sol2idx.values(), key=len, reverse=True):
        if len(indices) < 2:
            return 'only 1 solution'

        target_cell = implemented_solutions[indices[0]]
        start, end = get_in_between_solution_comments(target_cell['code_tokens'])

        # We tokenize again to get the types
        toks, types = tokenize_and_templatize(target_cell['code'])

        target_cell['extracted_code_tokens'] = replace_newlines_indents(
            toks[start:end],
            types[start:end],
            enable_assert=False,
            strings=True,
            comments=True)
        target_cell['extracted_code_types'] = types[start:end]

        # Get the boilerplate tokens surrounding the solution.
        boiler_toks = toks[:start] + toks[end:]
        boiler_types = types[:start] + types[end:]
        target_cell['boilerplate_code_tokens'] = replace_newlines_indents(
            boiler_toks,
            boiler_types,
            enable_assert=False,
            strings=True,
            comments=True)
        target_cell['boilerplate_code_types'] = boiler_types

        return target_cell

    # If no cell had an implemented solution, the loop body never runs;
    # return a rejection label matching the other error strings.
    return 'no implemented solutions'
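
handle_solution returns either the chosen cell dict (with extracted_code_tokens and boilerplate_code_tokens filled in) or a rejection string such as 'only 1 solution'. dedup_boiler_extract (Example #9) delegates to it for instructor answers; callers can tell the two result kinds apart with an isinstance check.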
Example #7
def does_parse(original_code, url=None):
    assert isinstance(original_code, str)
    # Lines with %matplotlib inline don't parse; we keep them for now rather
    # than stripping out lines that start with '%'.

    try:
        ast.parse(original_code)
        tokenize_and_templatize(original_code)
        return True
    except Exception:
        # TODO: deal with these. What gets removed: cells with slight
        # indentation issues.
        return False
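
A self-contained sketch of one failure mode: IPython magics such as %matplotlib inline are not valid Python syntax, so ast.parse rejects them:

import ast

try:
    ast.parse('%matplotlib inline')
except SyntaxError as e:
    print(e)  # e.g. invalid syntax (<unknown>, line 1)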
Example #8
def get_all_code_process(code, allow_api_declarations=False):
    '''Produces many tokenizations of the old code; downstream we use
    "code_tokens_clean".'''
    try:
        targ_tokens, types = tokenize_and_templatize(code)
        api_sequence = gen_api_seq(targ_tokens,
                                   types,
                                   allow_declarations=allow_api_declarations)
        comments, docstring = extract_comments(targ_tokens, types)

        # Full code with no extraneous newlines and normalized strings;
        # intended for both context and target code.
        clean_code = astor.to_source(ast.parse(code))
        clean_code_toks, clean_code_types = tokenize_and_templatize(clean_code)
        clean_code_toks = replace_newlines_indents(clean_code_toks,
                                                   clean_code_types,
                                                   strings=True,
                                                   enable_assert=False)

        # TODO: lstrip newlines from targ_tokens.

        strip_import_or_func = take_single_function(code)
        strip_import_or_func_toks, strip_types = tokenize_and_templatize(
            strip_import_or_func)

        normalized = normalize_code(strip_import_or_func)
        normalized_tokens, norm_types = tokenize_and_templatize(normalized)
    except Exception:
        targ_tokens = None
        clean_code_toks = None
        api_sequence = None
        normalized_tokens = None
        norm_types = None
        comments = None
        docstring = None
        strip_import_or_func_toks = None
        strip_types = None

    newline_tok = 'NEWLINE'
    if strip_import_or_func_toks:
        # Partial normalization: replace strings but leave variable names.
        # This is particularly useful for context cells, so we can copy
        # initializations from above.
        code_tokens_with_vars_no_strings = replace_newlines_indents(
            strip_import_or_func_toks, strip_types, newline_tok, strings=True)

        code_tokens_with_vars_no_strings_trunc = truncate_code_to_lines(
            code_tokens_with_vars_no_strings, newline_tok, 10)
    else:
        code_tokens_with_vars_no_strings_trunc = None

    # This logic lives outside the try block: keep the try as small as possible.
    if normalized_tokens:
        replaced = replace_newlines_indents(normalized_tokens, norm_types,
                                            newline_tok)
        normalized_trunc = truncate_code_to_lines(replaced, newline_tok, 10)
        if len(api_sequence) == 0:
            api_sequence = ['NO_API_SEQUENCE']
    else:
        normalized_tokens = None
        normalized_trunc = None

    js = {
        'code': code,
        'code_tokens_normalized': normalized_tokens,
        'code_tokens_normalized_trunc': normalized_trunc,
        'code_tokens_partial_normalized_trunc':
        code_tokens_with_vars_no_strings_trunc,
        'code_tokens': targ_tokens,
        'code_tokens_clean': clean_code_toks,
        'comments': comments,
        'docstring': docstring,
        'api_sequence': api_sequence,
    }
    return js
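
A self-contained sketch of the clean_code round-trip (assumes the third-party astor package is installed):

import ast
import astor

code = "x=1\n\n\nmsg  =  'hi'\nprint( msg )\n"
print(astor.to_source(ast.parse(code)))
# x = 1
# msg = 'hi'
# print(msg)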
Example #9
def dedup_boiler_extract(group, boiler_set):
    solutions = group[~group.is_boilerplate]
    if not solutions.empty:
        all_cells = [row.target_cell for _, row in solutions.iterrows()]
        # Instructor answers have a different extraction process.
        for c in all_cells:
            if c['is_instructor_answer']:
                return handle_solution(all_cells)

        # Get cells which passed test cases.
        passed_cells = []
        for c in all_cells:
            if c['metadata'].get('test_below_passed'):
                # Prefer cells from output notebooks by adding them to the front.
                if c['metadata'].get('from_output_nb'):
                    passed_cells.insert(0, c)
                else:
                    passed_cells.append(c)

        # If no cell passed the test cases, omit this example.
        if not passed_cells:
            return 'no autograder tests or none passed'

        target_cell = passed_cells[0]

        # Retrieve the boilerplate: take it from the group if our dataset
        # contains it; otherwise try to match the cell against all known
        # boilerplates.
        if group.is_boilerplate.any():
            boiler = group[group.is_boilerplate].iloc[0].target_cell['code']
        else:
            boiler = find_boilerplate_for_checksum(boiler_set, target_cell)

        if boiler:
            # Try to extract the student code from the solution, excluding boilerplate.
            extracted = diff_cell(target_cell['code'], boiler)
            if extracted:
                target_cell['extracted_code'] = extracted
                target_cell['boilerplate_code'] = boiler

                # Try to tokenize extracted code and boilerplate.
                try:
                    toks, types = tokenize_and_templatize(extracted)
                    target_cell['extracted_code_tokens'] = replace_newlines_indents(
                        toks, types, enable_assert=False, comments=True,
                        strings=True)
                    target_cell['extracted_code_types'] = types

                    toks, types = tokenize_and_templatize(boiler)
                    boiler_toks = replace_newlines_indents(
                        toks, types, enable_assert=False, comments=True,
                        strings=True)

                    # If trivial just leave boilerplate tokens empty.
                    if is_trivial_boilerplate(boiler_toks):
                        boiler_toks = []
                        types = []
                    target_cell['boilerplate_code_tokens'] = boiler_toks
                    target_cell['boilerplate_code_types'] = types

                    return target_cell
                except Exception:
                    return 'not tokenizable autograded'
            else:
                return 'multiple insertion points'
        else:
            return 'no boilerplate'

    return 'only boiler'
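
A hypothetical driver, assuming rows grouped by a checksum-like key and a prebuilt boiler_set (df, the 'checksum' column, and the construction of boiler_set are all assumptions):

results = []
for _, group in df.groupby('checksum'):
    out = dedup_boiler_extract(group, boiler_set)
    if not isinstance(out, str):  # strings are rejection reasons
        results.append(out)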