Example #1
0
def non_executed_prop(nb_id):
    """Return the share of a notebook's code cells counted as non-executed.

    The numerator is ``count_non_exec(nb_id)``; the denominator is the number
    of cells returned by ``data.get_code_cells(nb_id)``.

    Raises ZeroDivisionError when the notebook has no code cells.
    """
    total_cells = len(data.get_code_cells(nb_id))
    non_executed = count_non_exec(nb_id)

    # explicit float conversion keeps this a true division
    return float(non_executed) / float(total_cells)
Example #2
0
def has_export(nb_id):
    """Return True if ``exports`` is truthy for any code cell of the notebook.

    Cells are checked in order and the scan stops at the first match.
    """
    return any(exports(cell) for cell in data.get_code_cells(nb_id))
Example #3
0
def has_param_import(nb_id):
    """Return True if any code cell imports a parameterization module.

    A cell matches when ``has_import`` reports an import of either
    'papermill' or 'parameterized'.
    """
    param_modules = ('papermill', 'parameterized')

    # cell-major order: both modules are tried on a cell before moving on
    return any(
        has_import(cell, module)
        for cell in data.get_code_cells(nb_id)
        for module in param_modules
    )
Example #4
0
def num_functions(nb_id):
    """Return the sum of ``def_in_cell(cell)`` over the notebook's code cells.

    Returns 0 when the notebook has no code cells.
    """
    return sum(def_in_cell(cell) for cell in data.get_code_cells(nb_id))
Example #5
0
def has_testing(nb_id):
    """Return True if any code cell imports a testing module.

    A cell matches when ``has_import`` reports an import of 'pytest',
    'test' or 'unittest'.
    """
    testing_modules = ('pytest', 'test', 'unittest')

    # cell-major order: every module is tried on a cell before moving on
    return any(
        has_import(cell, module)
        for cell in data.get_code_cells(nb_id)
        for module in testing_modules
    )
Example #6
0
def _output_shows_visual(output):
    """Return True if one cell output displays a PNG image or an HTML table."""
    # outputs without a declared type are ignored entirely
    if 'output_type' not in output:
        return False

    if output['output_type'] == "display_data":
        # the image may be stored directly on the output ("png") or inside
        # its "data" bundle ("image/png")
        if "png" in output:
            return True
        return "data" in output and "image/png" in output['data']

    # any other output type: look for an HTML table in the data bundle
    if 'data' in output and 'text/html' in output['data']:
        html = output['data']['text/html']

        # the HTML may be a single string or a list of lines; iterating a
        # plain string would compare "</table>" against single characters
        # and never match, so handle that case explicitly
        if isinstance(html, str):
            return "</table>" in html
        return any("</table>" in line for line in html)

    return False


def count_images(nb_id):
    """Count the code cells whose outputs display an image or a table.

    Each cell is counted at most once: scanning of a cell's outputs stops
    at the first output that shows a PNG image (``display_data`` outputs)
    or an HTML ``</table>`` (other outputs carrying 'text/html' data).
    Cells without an 'outputs' field are skipped.

    Returns the number of such cells as an int.
    """
    return sum(
        1
        for cell in data.get_code_cells(nb_id)
        if 'outputs' in cell
        and any(_output_shows_visual(output) for output in cell['outputs'])
    )
Example #7
0
def output_cells(nb_id):
    """Return the number of code cells that have at least one output.

    A cell with no 'outputs' field, or whose 'outputs' value has no length,
    is counted as having no output.
    """
    def has_output(cell):
        try:
            return len(cell['outputs']) > 0
        except (KeyError, TypeError):
            # missing field or unsized value: treat as "no output" without
            # also swallowing unrelated errors such as KeyboardInterrupt
            return False

    return sum(1 for cell in data.get_code_cells(nb_id) if has_output(cell))
Example #8
0
def has_param(nb_id):
    """Return True if the notebook appears to be parameterized.

    This is the case when ``has_param_import(nb_id)`` is true, or when
    ``is_param_cell`` is truthy for one of the first five code cells.
    """
    # an import of a parameterization module is sufficient on its own
    if has_param_import(nb_id):
        return True

    # otherwise look for manual parameterization near the top of the notebook
    leading_cells = data.get_code_cells(nb_id)[:5]
    return any(is_param_cell(cell) for cell in leading_cells)
Example #9
0
def forwards_prop(nb_id):
    """Return the proportion of execution-order steps that go forwards.

    Only code cells with ``get_exec(cell) > 0`` are considered. A step is
    the transition between two consecutive such cells. The numerator is
    ``count_forwards(nb_id)``.

    Returns None when fewer than two such cells exist, otherwise a float.
    """
    all_cells = data.get_code_cells(nb_id)
    executed = [cell for cell in all_cells if get_exec(cell) > 0]

    # with zero or one executed cell there are no steps to measure
    if len(executed) < 2:
        return None

    forward_steps = count_forwards(nb_id)
    total_steps = len(executed) - 1

    return float(forward_steps) / float(total_steps)
Example #10
0
def count_forwards(nb_id):
    """Count consecutive executed-cell pairs whose execution order increases.

    Only code cells with ``get_exec(cell) > 0`` are considered. A pair is
    counted when the earlier cell's ``get_exec`` value is strictly less
    than that of the cell following it.
    """
    all_cells = data.get_code_cells(nb_id)
    executed = [cell for cell in all_cells if get_exec(cell) > 0]

    # pair each executed cell with its successor; the last cell has none
    return sum(
        1
        for current, following in zip(executed, executed[1:])
        if get_exec(current) < get_exec(following)
    )
Example #11
0
def has_error(nb_id):
    """Return True if any executed code cell has an error output.

    Only code cells with ``get_exec(cell) > 0`` are inspected. An output
    counts as an error when its 'output_type' is "error" or "pyerr".
    """
    error_types = ("error", "pyerr")

    # narrow down to executed cells before looking at any outputs
    executed = [
        cell for cell in data.get_code_cells(nb_id) if get_exec(cell) > 0
    ]

    for cell in executed:
        # cells without an 'outputs' field have nothing to check
        if 'outputs' not in cell:
            continue

        for output in cell['outputs']:
            if 'output_type' in output and output['output_type'] in error_types:
                return True

    return False
Example #12
0
def ex_skip_average(nb_id):
    """Return the mean absolute jump in execution order between executed cells.

    Only code cells with ``get_exec(cell) > 0`` are considered. For each
    pair of consecutive such cells the absolute difference of their
    ``get_exec`` values is taken, and the differences are averaged.

    Returns None when fewer than two such cells exist, otherwise a float.
    """
    all_cells = data.get_code_cells(nb_id)
    executed = [cell for cell in all_cells if get_exec(cell) > 0]

    # with zero or one executed cell there are no skips to average
    if len(executed) < 2:
        return None

    # pair each executed cell with its successor and add up the jump sizes
    total_skip = sum(
        abs(get_exec(following) - get_exec(current))
        for current, following in zip(executed, executed[1:])
    )

    return float(total_skip) / float(len(executed) - 1)
def get_language(nb_id):
    """Return the language recorded for a notebook, or None if not recorded.

    When the notebook has a 'worksheets' key, the language is read from the
    code cells: the 'language' value of the first cell where it is not None
    (None if there are no code cells or all values are None).

    Otherwise, when the notebook metadata has a 'kernelspec', its 'language'
    entry is used, falling back to its 'name' entry.
    """
    nb = data.get_nb(nb_id)

    if 'worksheets' in nb:
        # language is stored per cell; take the first non-None value
        found = None
        for cell in data.get_code_cells(nb_id):
            found = cell['language']
            if found is not None:
                break
        return found

    metadata = nb['metadata']
    if 'kernelspec' not in metadata:
        # language data not recorded
        return None

    # language is stored in the kernel specification
    kernelspec = metadata['kernelspec']
    for field in ('language', 'name'):
        if field in kernelspec:
            return kernelspec[field]

    return None
Example #14
0
def count_non_exec(nb_id):
    """Count the non-empty code cells whose execution count is zero.

    A cell's source is read from its 'input' field if present, otherwise
    from its 'source' field. Cells with an empty source, or with neither
    field, are ignored. Of the remaining cells, those with
    ``get_exec(cell) == 0`` are counted.
    """
    def has_source(cell):
        # 'input' takes precedence over 'source' when both are present
        for field in ('input', 'source'):
            if field in cell:
                return len(cell[field]) > 0

        # neither field present: treat the cell as empty rather than
        # failing with a KeyError on an empty field name
        return False

    non_empty_cells = [
        cell for cell in data.get_code_cells(nb_id) if has_source(cell)
    ]

    return sum(1 for cell in non_empty_cells if get_exec(cell) == 0)
Example #15
0
def output_cell_prop(nb_id):
    """Return the share of a notebook's code cells that have output.

    The numerator is ``output_cells(nb_id)``; the denominator is the number
    of cells returned by ``data.get_code_cells(nb_id)``.

    Raises ZeroDivisionError when the notebook has no code cells.
    """
    total_cells = len(data.get_code_cells(nb_id))
    cells_with_output = output_cells(nb_id)

    # explicit float conversion keeps this a true division
    return float(cells_with_output) / float(total_cells)
        repo_id = notebook['repo_id']

        # try generating the row of data
        row['nb_id'] = error_row['nb_id'] = nb_id
        row['repo_id'] = error_row['repo_id'] = repo_id 

        # check if notebook has been filtered
        if notebook['filtered']:
            error_row['err_in'] = 'filtered out'
            error_writer.writerow(error_row)
            continue 

        # check code cells may error if notebook file is empty
        try:
            # skip if there aren't any code cells
            if len(data.get_code_cells(nb_id)) == 0:
                print(colored(identifier + ' has no code', 'yellow'))
                error_row['err_in'] = 'no code'
                error_writer.writerow(error_row)
                continue 
        except:
            print(colored("nb file error in " + identifier, 'red'))
            error_row['err_in'] = 'nb file' 
            error_writer.writerow(error_row)
            continue 
    
        # check the api response
        try:
            repo_full_name = data.get_repo_metadata(nb_id)['full_name']
        except:
            print(colored("api error in " + identifier, 'red'))