Example 1
def compute_dataset_record_helper(c, nb_cells, cell_index, context_len,
                                  max_tokens):
    js = {
        'execution_count': c['execution_count'],
        'cell_type': c['cell_type'],
        # 'code': c['source'],
        'metadata': c['metadata'],
        'context': []
    }
    js.update(get_all_code_process(c['source']))

    if not js['code_tokens']:
        return None
    if len(js['code_tokens_clean']) > max_tokens:
        return None

    # get cells above in reversed order
    reversed_cells_before = reversed(nb_cells[:cell_index])

    # we store number here for metrics purposes later on.
    js['num_cells_above'] = len(nb_cells[:cell_index])

    js['imports'] = get_imports_simple(nb_cells, cell_index)

    for i, ctx in enumerate(reversed_cells_before):
        dist = i + 1
        if dist <= context_len and is_valid_cell(ctx):
            new_js = {
                'cell_type': ctx['cell_type'],
                'distance_target': dist,
            }
            if is_markdown(ctx):
                try:
                    tokens = normalize_nl_leave_code_tokenize(ctx['source'])
                except Exception:
                    # the tokenizer can fail on malformed markdown; fall
                    # back to whitespace splitting
                    tokens = ctx['source'].split()

                # the nl may not be directly above, so we pick the closest
                # one in the context above
                if 'nl' not in js:
                    js['nl'] = tokens

                new_js.update({'nl': tokens, 'nl_original': ctx['source']})
            else:
                # skip test cells since they don't provide signal for api
                # seq generation; ok to leave them in for the noisy train
                # set since they'll have type unknown there
                if grading_type(ctx) == 'autograder tests':
                    continue

                j = get_all_code_process(ctx['source'],
                                         allow_api_declarations=True)
                new_js.update(j)

            js['context'].append(new_js)

    return js
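
These snippets lean on a few small predicates (is_markdown, is_code, is_valid_cell) that are never shown. A minimal sketch of what they might look like, assuming cells follow the standard nbformat layout (a cell_type field plus a source field); this is an assumption, not the pipeline's actual definition:

def is_markdown(c):
    # works for raw nbformat cells and for the context dicts built above,
    # since both carry a 'cell_type' key
    return c.get('cell_type') == 'markdown'


def is_code(c):
    return c.get('cell_type') == 'code'


def is_valid_cell(c):
    # assumed definition: a cell is usable only if it has a type and
    # non-empty source
    return bool(c.get('cell_type')) and bool(c.get('source'))
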
Example 2
def get_cell(row):
    '''Return the original cell if a markdown (nl) cell exists within
    max_dist cells above it; otherwise return an empty dict.'''
    # max_dist is expected to be defined in the enclosing scope
    cells = row.cells
    cell_index = row.metadata_cell['cell_index']
    reversed_cells_before = reversed(cells[:cell_index])
    # iterate over all cells above, starting with the one directly above
    for i, c in enumerate(reversed_cells_before):
        if i + 1 <= max_dist and is_markdown(c):
            return row.og_cell
    # print(row.og_cell['metadata']['nb_orig_url'])
    return {}
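
get_cell reads row.cells, row.metadata_cell and row.og_cell by attribute, which suggests it is applied row-wise to a pandas DataFrame, with max_dist defined in the enclosing scope. A hypothetical driver; the DataFrame columns and the max_dist value are assumptions, not shown in the source:

import pandas as pd

max_dist = 3  # assumed cutoff; the real value is not shown in the source

df = pd.DataFrame({
    'cells': [[{'cell_type': 'markdown', 'source': '# sort a list'},
               {'cell_type': 'code', 'source': 'sorted(xs)'}]],
    'metadata_cell': [{'cell_index': 1}],
    'og_cell': [{'cell_type': 'code', 'source': 'sorted(xs)'}],
})

# rows with no markdown cell within max_dist come back as empty dicts
kept = [r for r in df.apply(get_cell, axis=1) if r]
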
Example 3
import copy


def add_keys(c):
    '''Group on the nl and the distance from it.'''
    if 'nl' not in c:
        if 'comments' in c and c['comments']:
            c['nl'] = c['comments']
        else:
            # placeholder so cells without any nl still get a group key
            c['nl'] = ['yo']
    nl = ' '.join(c['nl'])
    dist = [x['distance_target'] for x in c['context'] if is_markdown(x)]
    dist = dist[0] if dist else 0

    new_js = {}
    new_js['groupbykey'] = nl + str(dist)
    new_js['target_cell'] = copy.deepcopy(c)

    return new_js
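
The groupbykey built here is presumably consumed downstream to collapse near-duplicate targets that share the same nl and distance. A minimal sketch of such a grouping step; keeping the first record per group is an assumed tie-break, not shown in the source:

from collections import defaultdict


def dedupe_by_group(records):
    groups = defaultdict(list)
    for r in records:
        # records produced by add_keys carry a 'groupbykey' field
        groups[r['groupbykey']].append(r)
    # keep one representative target cell per (nl, distance) group
    return [members[0] for members in groups.values()]
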
Example 4
import copy


def add_keys(c):
    c.update(c['metadata']['nbgrader'])
    c['target_cell'] = copy.deepcopy(c)

    if 'checksum' not in c['metadata']['nbgrader']:
        c['checksum'] = 'dummy-checksum'
    if 'points' not in c['metadata']['nbgrader']:
        c['points'] = 'dummy-points'
    if 'grade_id' not in c['metadata']['nbgrader']:
        c['grade_id'] = 'dummy-grade-id'

    # some cells may have no nl above; fall back to an empty string / zero
    nl = ' '.join(c.get('nl', []))
    dist = [x['distance_target'] for x in c['context'] if is_markdown(x)]
    dist = dist[0] if dist else 0

    # we add the checksum since nl cells like "your answer here", which
    # request manually entered nl, cause a lot of false collisions. We factor
    # in the distance since the same nl could be used for multiple targets,
    # or targets could occur consecutively under the same nl.
    c['groupbykey'] = nl + str(dist) + c['checksum']
    return c
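
grading_type is used throughout these snippets but never defined. A plausible reconstruction from the standard nbgrader metadata flags (grade/solution); the exact mapping used in the source is not shown:

def grading_type(c):
    meta = c.get('metadata', {}).get('nbgrader', {})
    if not meta:
        return 'unknown'
    if meta.get('grade') and not meta.get('solution'):
        # nbgrader test cells are graded but are not solution cells
        return 'autograder tests'
    if meta.get('grade') and meta.get('solution'):
        return 'manual graded code'
    if meta.get('solution'):
        return 'autograded code'
    return 'unknown'
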
Example 5
import logging

from polyglot.detect import Detector


def get_markdown_language(nb):
    def get_language(string):
        logging.getLogger("polyglot").setLevel(logging.CRITICAL)
        try:
            d = Detector(string, quiet=True)
            if d.reliable:
                return d.language.name
            else:
                return d.languages[0].name
        except Exception:
            # the detector breaks on weird ascii chars; they seem to
            # come from english text
            return 'failed'

    nl = ""
    for cell in nb['cells']:
        if is_markdown(cell) and 'source' in cell and cell['source']:
            nl += cell['source'] + ' '
    lang = get_language(nl)
    return lang
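
A typical downstream use of this detector would be to keep only English-language notebooks, consistent with the ascii filtering in Example 6. A hypothetical filtering step; the 'English' criterion and the notebooks variable are assumptions:

# hypothetical filter; the actual pipeline wiring is not shown
english_nbs = [nb for nb in notebooks
               if get_markdown_language(nb) == 'English']
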
Example 6
import copy
import json
from os.path import abspath


def process_dump_get_cells(nb, nb_vizdir, write_cells=False):
    '''
    Add some metadata to cells and dump the notebook for visualization.

    For the training set, write_cells should be False:
        - this saves memory, as the notebook is written only once
        - a link to the notebook is added to each cell

    For the nbgrader dev set we write the notebook with each record:
        - this way the extension can scroll down to the cell and save time

    :param nb: notebook to process
    :param nb_vizdir: directory where notebooks are dumped for visualization
    :param write_cells: if True, dump one notebook copy per nbgrader cell
    :return: the cells kept for the dataset
    '''

    nb_to_dump = copy.deepcopy(nb)

    # cell quality control here, non null source, cell type, etc.
    cells = nb['cells']
    nb_index = nb['metadata']['nb_index']

    for cell_index, c in enumerate(cells):
        if 'metadata' not in c or isinstance(c['metadata'], list):
            c['metadata'] = {}
        c['metadata']['nb_index'] = nb_index
        c['metadata']['cell_index'] = cell_index
        c['metadata']['repo'] = nb['metadata']['repo']
        c['metadata']['path'] = nb['metadata']['path']

    # cells = [c for c in cells if is_valid_cell(c)]
    cells = [standardize_cell(c) for c in cells]

    if not write_cells:
        new_cells = []
        # keep only code cells whose cell directly above is markdown;
        # markdown cells themselves are not kept here
        for i, c in enumerate(cells):
            if is_code(c) and i:
                cell_above = cells[i - 1]
                if is_markdown(cell_above):
                    if is_ascii(cell_above['source']):
                        # confirm that ascii conversion works
                        new_cells.append(c)
                    else:
                        # just remove an entire notebook if it has unicode!
                        # otherwise context will have unicode characters.
                        return []
        del cells

        # this is for the noisy train
        # dump the whole notebook once to save memory
        file_path = abspath(nb_vizdir + f'/nb_{nb_index}.ipynb')

        for c in new_cells:
            c['metadata']['nb_orig_url'] = get_url(file_path)

        # dump the whole notebook to see the ctx with cell
        with open(file_path, 'w') as outfile:
            json.dump(nb_to_dump, outfile)

        return new_cells
    else:

        found_test = False
        test_passed = False
        from_output_nb = False
        # we go in reverse since the tests come after the autograded code block
        for c in reversed(cells):
            if grading_type(c) == 'autograder tests':
                found_test = True

                # later outputs will be removed so important to capture this
                if c['outputs']:
                    from_output_nb = True

                # if the test has an error or was never executed, we decide that the tests didn't pass
                if ((c['outputs'] and 'output_type' in c['outputs'][0]
                     and c['outputs'][0]['output_type'] == 'error')
                        or c['execution_count'] == 0):
                    test_passed = False
                else:
                    test_passed = True
            elif grading_type(c) in ['autograded code', 'manual graded code']:
                # we'll prefer records that come from a notebook with outputs
                c['metadata']['from_output_nb'] = from_output_nb
                if found_test:
                    c['metadata']['test_below_passed'] = test_passed

        # this is for dev/test set where we take only nbgrader cells
        nbgrader_cells = []
        for cell_index, c in enumerate(cells):
            if 'nbgrader' in c['metadata']:
                # we write the title in this format so the extension will
                # scroll to this c
                file_path = abspath(nb_vizdir +
                                    f'/nb_{nb_index}_cell_{cell_index}.ipynb')

                # add nb and location to c for later visualization
                c['metadata']['nb_orig_url'] = get_url(file_path)

                # dump the whole notebook to see the ctx with c
                with open(file_path, 'w') as outfile:
                    json.dump(nb_to_dump, outfile)

                nbgrader_cells.append(c)

        return nbgrader_cells
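
get_url is assumed to map a dumped notebook path to something the visualization extension can open. A hypothetical version serving the dump directory through a local Jupyter instance; the base URL and the mapping are assumptions, not shown in the source:

from os.path import basename


def get_url(file_path, base_url='http://localhost:8888/notebooks'):
    # hypothetical helper: the real path-to-URL mapping is not shown
    return f'{base_url}/{basename(file_path)}'
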
Example 7
def markdown_ratio(nb):
    '''Return the fraction of cells in the notebook that are markdown.'''
    markdowns = [1 if is_markdown(c) else 0 for c in nb['cells']]
    if markdowns:
        return sum(markdowns) / len(markdowns)
    return 0
Example 8
def filter_func(nb):
    '''Keep notebooks with at least one markdown cell containing a code tag.'''
    for c in nb['cells']:
        if is_markdown(c) and is_code_tag_in_nl(c['source']):
            return True
    return False
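
is_code_tag_in_nl is not shown either. A plausible sketch that treats backtick spans, fenced blocks, and <code> tags as code markers in the markdown source; the exact heuristic used in the source is unknown:

import re

CODE_TAG_RE = re.compile(r'`[^`]+`|```|<code>')


def is_code_tag_in_nl(source):
    # assumed definition: markdown "has a code tag" if it contains
    # backtick spans, fenced blocks, or <code> tags
    return bool(CODE_TAG_RE.search(source))
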